diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 4552d7b5ba9..d095d220b97 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -299,12 +299,11 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( const Shape& expected_shape = entry_comp->parameter_instruction(i)->shape(); const Shape& actual_shape = arguments[i].Buffers().shape(); - CHECK( - Shape::Equal().IgnoreDynamicDimension()(expected_shape, actual_shape)) - << absl::StreamFormat( - "Shape mismatch on argument %d. Expected %s, but was %s.", i, - expected_shape.ToString(/*print_layout=*/true), - actual_shape.ToString(/*print_layout=*/true)); + TF_RET_CHECK( + ShapeUtil::DynamicShapeIsCompatible(actual_shape, expected_shape)) + << "Shape mismatch on argument " << i << ", " + << expected_shape.ToString(/*print_layout=*/true) << " vs. " + << actual_shape.ToString(/*print_layout=*/true); } } diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index ebf7cc440dd..61ce6200a28 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -28,10 +28,57 @@ limitations under the License. #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/proto_serialization.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/stream_executor/device_description.h" namespace xla { +ExecutionInput::~ExecutionInput() { + for (auto& index : unowned_indices_) { + auto buffer = buffers_.mutable_element(index)->Release(); + if (buffer) { + buffer->Release(); + } + } +} + +Status ExecutionInput::SetDynamicShape(Shape dynamic_shape) { + const Shape& input_shape = shape(); + if (!ShapeUtil::DynamicShapeIsCompatible(input_shape, dynamic_shape)) { + return tensorflow::errors::InvalidArgument( + "Cannot set dynamic shape: ", input_shape.DebugString(), " vs. 
", + dynamic_shape.DebugString()); + } + dynamic_shape_ = absl::make_unique(std::move(dynamic_shape)); + return Status::OK(); +} + +void ExecutionInput::SetUnownedBuffer(const ShapeIndex& index, + MaybeOwningDeviceMemory buffer) { + *buffers_.mutable_element(index) = std::move(buffer); + unowned_indices_.insert(index); +} + +xla::StatusOr ExecutionInput::ToShapedBuffer( + se::DeviceMemoryAllocator* allocator, int device_ordinal) const { + const Shape& input_shape = shape(); + xla::ShapedBuffer shaped_buffer(input_shape, input_shape, + allocator->platform(), device_ordinal); + for (const auto& index_buffer : Buffers()) { + const tensorflow::se::OwningDeviceMemory* mem = + index_buffer.second.AsOwningDeviceMemory(); + if (mem != nullptr && (mem->allocator() != allocator || + mem->device_ordinal() != device_ordinal)) { + return tensorflow::errors::InvalidArgument( + "Device buffer at index ", index_buffer.first.ToString(), + " has mismatching allocator/device"); + } + shaped_buffer.set_buffer(index_buffer.second.AsDeviceMemoryBase(), + index_buffer.first); + } + return std::move(shaped_buffer); +} + StatusOr Executable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, absl::Span arguments, diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 2c979662d24..6881f6dd68a 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_ #include +#include #include #include @@ -65,31 +66,32 @@ class ExecutionInput { : buffers_(std::move(buffers)) {} ExecutionInput(ExecutionInput&&) = default; - ~ExecutionInput() { - for (auto& index : unowned_indices_) { - auto buffer = buffers_.mutable_element(index)->Release(); - if (buffer) { - buffer->Release(); - } - } - } + ~ExecutionInput(); ExecutionInput& operator=(ExecutionInput&&) = default; - const Shape& shape() const { return buffers_.shape(); } + const Shape& shape() const { + return dynamic_shape_ != nullptr ? *dynamic_shape_ : buffers_.shape(); + } + + Status SetDynamicShape(Shape dynamic_shape); + + xla::StatusOr ToShapedBuffer( + se::DeviceMemoryAllocator* allocator, int device_ordinal) const; void SetBuffer(const ShapeIndex& index, MaybeOwningDeviceMemory buffer) { *buffers_.mutable_element(index) = std::move(buffer); } void SetUnownedBuffer(const ShapeIndex& index, - MaybeOwningDeviceMemory buffer) { - *buffers_.mutable_element(index) = std::move(buffer); - unowned_indices_.push_back(index); - } + MaybeOwningDeviceMemory buffer); void SetUnownedIndex(const ShapeIndex& index) { - unowned_indices_.push_back(index); + unowned_indices_.insert(index); + } + + void ClearUnownedIndex(const ShapeIndex& index) { + unowned_indices_.erase(index); } const ShapeTree& Buffers() const { return buffers_; } @@ -106,9 +108,10 @@ class ExecutionInput { private: ShapeTree buffers_; - // (Unordered) set of indices of buffers that should be returned to the - // caller if an error occurs when enqueuing the computation. - std::vector unowned_indices_; + // Set of indices of buffers that should be returned to the caller if an error + // occurs when enqueuing the computation. 
+ std::set unowned_indices_; + std::unique_ptr dynamic_shape_; }; // ExecutionOutput encapsulates the output buffers of a execution and the @@ -145,7 +148,6 @@ class ExecutionOutput { to_be_released_.push_back(std::move(mem)); } - // Should be called once it is known that the execute operation succeeded, // before returning the ExecutionOutput to the caller. ExecutionOutput& Commit() { diff --git a/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc b/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc index c4bf48bcc00..c7505f5fa4a 100644 --- a/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc +++ b/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc @@ -14,7 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h" + #include "absl/types/variant.h" + namespace xla { tensorflow::se::DeviceMemoryBase MaybeOwningDeviceMemory::AsDeviceMemoryBase() @@ -38,4 +40,10 @@ MaybeOwningDeviceMemory::Release() { return std::move(absl::get(mem_)); } +const tensorflow::se::OwningDeviceMemory* +MaybeOwningDeviceMemory::AsOwningDeviceMemory() const { + return HasOwnership() ? &absl::get(mem_) + : nullptr; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/maybe_owning_device_memory.h b/tensorflow/compiler/xla/service/maybe_owning_device_memory.h index 7d23d178130..0b56fed0a72 100644 --- a/tensorflow/compiler/xla/service/maybe_owning_device_memory.h +++ b/tensorflow/compiler/xla/service/maybe_owning_device_memory.h @@ -57,6 +57,10 @@ class MaybeOwningDeviceMemory { // A nullopt is returned if the HasOwnership() == false; absl::optional Release(); + // If the device memory is owned, returns a pointer to the internal + // OwningDeviceMemory, otherwise nullptr is returned. + const tensorflow::se::OwningDeviceMemory* AsOwningDeviceMemory() const; + // Returns true if the device_memory has ownership over underlying memory. 
bool HasOwnership() const; diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index ab46e49b181..bce40578132 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -1461,7 +1461,7 @@ ShapeUtil::ReshapeLeavesDimensionsUnmodified( return shape; } -/* static */ bool ShapeUtil::DynamicShapeIsCompatible( +/* static */ bool ShapeUtil::DynamicArrayShapeIsCompatible( const xla::Shape& dynamic_shape, const xla::Shape& bounded_shape) { if (dynamic_shape.rank() != bounded_shape.rank()) { return false; @@ -1474,6 +1474,36 @@ ShapeUtil::ReshapeLeavesDimensionsUnmodified( return true; } +/* static */ bool ShapeUtil::DynamicShapeIsCompatible( + const xla::Shape& dynamic_shape, const xla::Shape& bounded_shape) { + bool compatible = true; + xla::ShapeUtil::ForEachSubshape(dynamic_shape, [&](const Shape& sub_shape, + const ShapeIndex& index) { + if (compatible) { + auto subshape_result = TryGetSubshape(bounded_shape, index); + if (subshape_result.ok()) { + const Shape* bounded_sub_shape = subshape_result.ConsumeValueOrDie(); + if (sub_shape.IsTuple()) { + if (!bounded_sub_shape->IsTuple()) { + compatible = false; + } + } else { + if (bounded_sub_shape->IsTuple()) { + compatible = false; + } else if (!sub_shape.is_static() && + !DynamicArrayShapeIsCompatible(sub_shape, + *bounded_sub_shape)) { + compatible = false; + } + } + } else { + compatible = false; + } + } + }); + return compatible; +} + /* static */ Shape ShapeUtil::FilterDimensions( const std::function& p, Shape shape) { CHECK(shape.IsArray()); diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index dde56587482..fe1a8acf6e4 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -657,7 +657,11 @@ class ShapeUtil { Shape shape); // Returns true if `dynamic_shape` has dimensions that are less-equal to the - // "bounded_shape". + // "bounded_shape". Shapes must be arrays. + static bool DynamicArrayShapeIsCompatible(const xla::Shape& dynamic_shape, + const xla::Shape& bounded_shape); + + // Same as DynamicArrayShapeIsCompatible() but supports tuples. static bool DynamicShapeIsCompatible(const xla::Shape& dynamic_shape, const xla::Shape& bounded_shape); diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD index 332c8ff9a14..6a704be4adb 100644 --- a/tensorflow/compiler/xrt/BUILD +++ b/tensorflow/compiler/xrt/BUILD @@ -74,6 +74,7 @@ cc_library( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/service:backend", "//tensorflow/compiler/xla/service:executable", + "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc index 2fc599e42df..3bd8af577c8 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc @@ -51,12 +51,6 @@ namespace tensorflow { namespace { -struct InputBuffers { - std::vector> input_tuples; - std::vector input_allocations; - std::vector input_pointers; -}; - uint32 InitialRandomSeed() { // Support plumbing the TF seed through to XLA is being worked on. 
// If a user wants deterministic behavior, their best option @@ -80,75 +74,51 @@ uint32 GetXLARandomSeed() { return counter.fetch_add(2); } -xla::StatusOr GetInputBuffers( - XRTMemoryManager::WorkingSet* working_set, xla::Backend* backend, - const std::vector& input_coords, bool release_inputs) { - InputBuffers input_buffers; - input_buffers.input_tuples.reserve(input_coords.size()); - input_buffers.input_allocations.reserve(input_coords.size()); - input_buffers.input_pointers.reserve(input_coords.size()); - for (size_t i = 0; i < input_coords.size(); ++i) { - TF_RETURN_IF_ERROR( - working_set->LookupAndPin(backend, input_coords[i].handle)); - auto tuple = working_set->PinnedTuples().back(); - input_buffers.input_tuples.emplace_back(tuple); - if (release_inputs) { - // We are holding a reference to the tuple, so we can safely delete it - // from the resource manager here. - TF_RETURN_IF_ERROR( - working_set->MemoryManager()->Release(input_coords[i].handle)); - VLOG(2) << "Released allocation handle " << input_coords[i].handle; - } - if (input_coords[i].index.empty()) { - TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, - tuple->ToShapedBuffer()); - input_buffers.input_allocations.emplace_back(std::move(shaped_buffer)); - } else { - TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, - tuple->ToShapedBuffer()); - TF_ASSIGN_OR_RETURN(xla::ShapedBuffer sub_shaped_buffer, - shaped_buffer.SubShapedBuffer(input_coords[i].index)); - input_buffers.input_allocations.emplace_back( - std::move(sub_shaped_buffer)); - } +std::vector GetDynamicInputInfo( + const xla::ComputationLayout& computation_layout) { + std::vector input_is_dynamic; + input_is_dynamic.reserve(computation_layout.parameter_count()); + for (int64 i = 0; i < computation_layout.parameter_count(); ++i) { + input_is_dynamic.push_back( + !computation_layout.parameter_shape(i).is_static()); } - for (size_t i = 0; i < input_buffers.input_allocations.size(); ++i) { - input_buffers.input_pointers.push_back(&input_buffers.input_allocations[i]); - } - return std::move(input_buffers); + return input_is_dynamic; } -xla::StatusOr GetChainedOpInputs( +xla::StatusOr>> GetInputTuples( + xla::LocalExecutable* executable, XRTMemoryManager::WorkingSet* working_set, + xla::Backend* backend, const std::vector& input_coords, + bool release_inputs) { + const xla::ComputationLayout& computation_layout = + executable->executable()->module_config().entry_computation_layout(); + + return GetInputTupleAllocations( + input_coords, working_set, backend, computation_layout.parameter_count(), + [&](int64 i) { return computation_layout.parameter_shape(i); }, + release_inputs); +} + +xla::StatusOr>> GetChainedOpInputTuples( const xrt::XRTChainedExecuteOp& op, absl::Span> op_inputs) { - InputBuffers input_buffers; - input_buffers.input_tuples.reserve(op.inputs_size()); - input_buffers.input_allocations.reserve(op.inputs_size()); - input_buffers.input_pointers.reserve(op.inputs_size()); + std::vector> input_tuples; + input_tuples.reserve(op.inputs_size()); for (int i = 0; i < op.inputs_size(); ++i) { auto& input = op.inputs(i); - input_buffers.input_tuples.emplace_back(op_inputs[i]); // Thanks to the greatness of proto3, there is no way to query for // explicitly set fields, so the default for output_index (zero) means no // sub-index. As consequence, the real index is output_index - 1. 
if (input.output_index() == 0) { - TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, - input_buffers.input_tuples.back()->ToShapedBuffer()); - input_buffers.input_allocations.emplace_back(std::move(shaped_buffer)); + input_tuples.emplace_back(op_inputs[i]); } else { - TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, - input_buffers.input_tuples.back()->ToShapedBuffer()); - TF_ASSIGN_OR_RETURN( - xla::ShapedBuffer sub_shaped_buffer, - shaped_buffer.SubShapedBuffer({input.output_index() - 1})); - input_buffers.input_allocations.emplace_back( - std::move(sub_shaped_buffer)); + XRTTupleAllocation* sub_tuple; + TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer( + op_inputs[i].get(), {input.output_index() - 1}, &sub_tuple, + /*alias_parent_allocation=*/true)); + input_tuples.emplace_back(sub_tuple); } } - for (size_t i = 0; i < input_buffers.input_allocations.size(); ++i) { - input_buffers.input_pointers.push_back(&input_buffers.input_allocations[i]); - } - return std::move(input_buffers); + return input_tuples; } // Given a shape, returns a byte array representing the shape metadata of the @@ -228,12 +198,11 @@ Status UpdateMetadata(se::Stream* stream, se::DeviceMemory* buffer, // As we can't expand the size of an existing memory allocation, a reallocation // is required. A list of new allocations are returned after this function. The // caller is reponsible for maintaining those allocations. -xla::StatusOr> UpdateDynamicInputs( +Status UpdateDynamicInputs( se::Stream* stream, se::DeviceMemoryAllocator* allocator, - std::vector runtime_inputs, + std::vector* execution_inputs, const std::vector& compile_time_shapes) { - std::vector new_allocations; - TF_RET_CHECK(runtime_inputs.size() == compile_time_shapes.size()); + TF_RET_CHECK(execution_inputs->size() == compile_time_shapes.size()); TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( stream->parent()->platform())); auto shape_size_fn = compiler->ShapeSizeBytesFunction(); @@ -242,57 +211,61 @@ xla::StatusOr> UpdateDynamicInputs( if (compile_time_shape.is_static()) { continue; } - auto* runtime_input = runtime_inputs[i]; - + xla::ExecutionInput* execution_input = &(*execution_inputs)[i]; bool element_modified = false; TF_RETURN_IF_ERROR(xla::ShapeUtil::ForEachSubshapeWithStatus( compile_time_shape, - [&](const xla::Shape& compile_time_shape, + [&](const xla::Shape& sub_shape, const xla::ShapeIndex& index) -> Status { - if (compile_time_shape.IsTuple() || compile_time_shape.is_static()) { + if (sub_shape.IsTuple() || sub_shape.is_static()) { return Status::OK(); } - const xla::Shape& runtime_shape = xla::ShapeUtil::GetSubshape( - runtime_input->on_device_shape(), index); - TF_RET_CHECK(!runtime_shape.IsTuple()); - TF_RET_CHECK(xla::ShapeUtil::DynamicShapeIsCompatible( - runtime_shape, compile_time_shape)); - se::DeviceMemoryBase* static_input = - runtime_input->buffers().mutable_element(index); TF_ASSIGN_OR_RETURN( - auto dynamic_input, + const xla::Shape* runtime_shape, + xla::ShapeUtil::TryGetSubshape(execution_input->shape(), index)); + TF_RET_CHECK(!runtime_shape->IsTuple()); + TF_RET_CHECK(xla::ShapeUtil::DynamicArrayShapeIsCompatible( + *runtime_shape, sub_shape)); + TF_ASSIGN_OR_RETURN( + se::OwningDeviceMemory dynamic_input, allocator->Allocate(stream->parent()->device_ordinal(), - shape_size_fn(compile_time_shape))); - new_allocations.emplace_back(std::move(dynamic_input)); - se::DeviceMemory* dynamic_input_base = - new_allocations.back().ptr(); + shape_size_fn(sub_shape))); + + se::DeviceMemoryBase static_input 
= + execution_input->Buffer(index).AsDeviceMemoryBase(); + se::DeviceMemory* dynamic_input_base = dynamic_input.ptr(); // Send the original data to the new location. - stream->ThenMemcpyD2D(dynamic_input_base, *static_input, - static_input->size()); + stream->ThenMemcpyD2D(dynamic_input_base, static_input, + static_input.size()); TF_RETURN_IF_ERROR(UpdateMetadata(stream, dynamic_input_base, - compile_time_shape, runtime_shape)); + sub_shape, *runtime_shape)); // Modify the memory location in the input shape tree to point to the // new input. - runtime_input->set_buffer(*dynamic_input_base, index); + execution_input->SetBuffer( + index, xla::MaybeOwningDeviceMemory(std::move(dynamic_input))); + execution_input->ClearUnownedIndex(index); element_modified = true; return Status::OK(); })); if (element_modified) { - runtime_input->set_shapes(compile_time_shape, compile_time_shape); + TF_RETURN_IF_ERROR(execution_input->SetDynamicShape(compile_time_shape)); + TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, + execution_input->ToShapedBuffer( + allocator, stream->parent()->device_ordinal())); // The input location has been modified, need to fix tuple table to // point to the correct address. TF_ASSIGN_OR_RETURN( auto transfer_manager, xla::TransferManager::GetForPlatform(stream->parent()->platform())); TF_RETURN_IF_ERROR( - transfer_manager->WriteTupleIndexTablesAsync(stream, *runtime_input)); + transfer_manager->WriteTupleIndexTablesAsync(stream, shaped_buffer)); } } - return std::move(new_allocations); + return Status::OK(); } xla::StatusOr ReadMetadataLiteral( - se::Stream* stream, se::DeviceMemoryBase* buffer, + se::Stream* stream, se::DeviceMemoryBase buffer, const xla::Shape& buffer_shape, xla::TransferManager* transfer_manager) { TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( stream->parent()->platform())); @@ -302,7 +275,7 @@ xla::StatusOr ReadMetadataLiteral( const int64 offset = shape_size_fn(buffer_shape_static); int64 metadata_size = shape_size_fn(buffer_shape) - offset; TF_RET_CHECK(metadata_size != 0); - auto buffer_8 = se::DeviceMemory(*buffer); + auto buffer_8 = se::DeviceMemory(buffer); auto metadata_buffer = stream->parent()->GetSubBuffer(&buffer_8, offset, metadata_size); return transfer_manager->TransferArrayFromDevice( @@ -315,7 +288,7 @@ xla::StatusOr ReadMetadataLiteral( // dimension sizes from the metadata, and update output shapes. The result shape // is a static and concrete shape. xla::Status UpdateDynamicOutputs(se::Stream* stream, - xla::ShapedBuffer* shaped_buffer, + const xla::ShapedBuffer& shaped_buffer, xla::Shape* output_host_shape, xla::Shape* output_device_shape) { DCHECK(output_device_shape->is_dynamic()); @@ -323,8 +296,8 @@ xla::Status UpdateDynamicOutputs(se::Stream* stream, auto transfer_manager, xla::TransferManager::GetForPlatform(stream->parent()->platform())); TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); - TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus( - [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { + TF_RETURN_IF_ERROR(shaped_buffer.buffers().ForEachElementWithStatus( + [&](const xla::ShapeIndex& index, const se::DeviceMemoryBase& buffer) { const xla::Shape& buffer_shape = xla::ShapeUtil::GetSubshape(*output_device_shape, index); if (buffer_shape.IsTuple()) { @@ -352,19 +325,18 @@ xla::Status UpdateDynamicOutputs(se::Stream* stream, return Status::OK(); } -// Create output tuple from run_result. 
xla::StatusOr> CreateOutputTuple( - se::Stream* stream, xla::ScopedShapedBuffer run_result, - xla::Backend* backend, int device_ordinal) { + se::Stream* stream, xla::ExecutionOutput run_result, xla::Backend* backend, + int device_ordinal) { XRTTupleAllocation* output_tuple; - xla::ShapedBuffer shaped_buffer = run_result.release(); + const xla::ScopedShapedBuffer& shaped_buffer = run_result.Result(); if (shaped_buffer.on_device_shape().is_dynamic()) { // Update dynamic shapes from output buffer, and create a XRT tensor with // dimension sizes read from metadata. xla::Shape output_host_shape = shaped_buffer.on_host_shape(); xla::Shape output_device_shape = shaped_buffer.on_device_shape(); TF_RETURN_IF_ERROR(UpdateDynamicOutputs( - stream, &shaped_buffer, &output_host_shape, &output_device_shape)); + stream, shaped_buffer, &output_host_shape, &output_device_shape)); TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( shaped_buffer, output_host_shape, output_device_shape, backend, device_ordinal, &output_tuple)); @@ -373,15 +345,27 @@ xla::StatusOr> CreateOutputTuple( TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( shaped_buffer, backend, device_ordinal, &output_tuple)); } + // After the output tuple is created, we can release the output result + // buffers, to make sure they won't be cleared by its destructor. + (void)run_result.ConsumeResult().release(); return RefPtr(output_tuple); } xla::StatusOr> RunExecutable( OpKernelContext* context, XRTGenericDeviceAccessor::ScopedRef* device_ref, - xla::LocalExecutable* executable, const InputBuffers& input_buffers, - se::Stream* stream, int rng_seed, + xla::LocalExecutable* executable, + absl::Span> input_tuples, + bool release_inputs, se::Stream* stream, int rng_seed, const xrt::CommonExecutionConfig& config) { - VLOG(2) << "Executing computation."; + const xla::ComputationLayout& computation_layout = + executable->executable()->module_config().entry_computation_layout(); + std::vector input_is_dynamic = GetDynamicInputInfo(computation_layout); + TF_ASSIGN_OR_RETURN( + std::vector execution_inputs, + GetArgumentsBuffers( + executable->executable()->module().input_output_alias_config(), + input_tuples, input_is_dynamic, release_inputs)); + xla::ExecutableRunOptions run_options; run_options.set_stream(stream); run_options.set_allocator(device_ref->backend()->memory_allocator()); @@ -419,51 +403,28 @@ xla::StatusOr> RunExecutable( } run_options.set_gpu_executable_run_options(&gpu_options); - Env* env = Env::Default(); - auto start_time = env->NowMicros(); const std::vector& shape_layouts = executable->executable() ->module_config() .entry_computation_layout() .parameter_layouts(); - TF_ASSIGN_OR_RETURN(auto new_allocations, - UpdateDynamicInputs(stream, run_options.allocator(), - input_buffers.input_pointers, - shape_layouts)); - auto new_allocations_ptr = - std::make_shared>( - std::move(new_allocations)); + TF_RETURN_IF_ERROR(UpdateDynamicInputs(stream, run_options.allocator(), + &execution_inputs, shape_layouts)); TF_ASSIGN_OR_RETURN( - xla::ScopedShapedBuffer run_result, - executable->Run(input_buffers.input_pointers, run_options)); - // Retain the new allocation for input memory until the end of execution. 
- stream->ThenDoHostCallback([new_allocations_ptr]() { return Status::OK(); }); - - auto elapsed = env->NowMicros() - start_time; - VLOG(2) << "Elapsed time: " << elapsed << "us"; + xla::ExecutionOutput run_result, + executable->Run(std::move(execution_inputs), run_options)); TF_ASSIGN_OR_RETURN( RefPtr output_tuple_ptr, CreateOutputTuple(stream, std::move(run_result), device_ref->backend(), device_ref->device_ordinal())); - // The ScopedShapedBuffer returned by the executable Run() API, in case of // input/output buffer aliasing, might have holes in it, which need to be // filled using the proper input tuples buffers which are the source of // aliasing. - const xla::HloInputOutputAliasConfig& input_output_alias = - executable->executable()->module().input_output_alias_config(); - auto alias_function = - [&](const xla::ShapeIndex& output_index, - const xla::HloInputOutputAliasConfig::Alias& alias) -> Status { - TF_RET_CHECK(alias.parameter_number < input_buffers.input_tuples.size()); - return alias.kind == xla::HloInputOutputAliasConfig::AliasKind::kUserAlias - ? output_tuple_ptr->AliasBufferFrom( - *input_buffers.input_tuples[alias.parameter_number], - alias.parameter_index, output_index) - : Status::OK(); - }; - TF_RETURN_IF_ERROR(input_output_alias.ForEachAliasWithStatus(alias_function)); + TF_RETURN_IF_ERROR(RebuildOutputAliases( + output_tuple_ptr, input_tuples, + executable->executable()->module().input_output_alias_config())); return std::move(output_tuple_ptr); } @@ -471,12 +432,13 @@ xla::StatusOr> RunExecutable( xla::StatusOr> ExecuteComputation( OpKernelContext* context, XRTMemoryManager* memory_manager, XRTGenericDeviceAccessor::ScopedRef* device_ref, - xla::LocalExecutable* executable, const InputBuffers& input_buffers, - se::Stream* stream, int rng_seed, + xla::LocalExecutable* executable, + absl::Span> input_tuples, + bool release_inputs, se::Stream* stream, int rng_seed, const xrt::CommonExecutionConfig& config) { auto runfn = [&]() { - return RunExecutable(context, device_ref, executable, input_buffers, stream, - rng_seed, config); + return RunExecutable(context, device_ref, executable, input_tuples, + release_inputs, stream, rng_seed, config); }; // We pass zero as requested_free_size as there is no simple way to get the @@ -495,12 +457,13 @@ xla::StatusOr> ExecuteComputation( se::Stream* stream, int rng_seed, const xrt::CommonExecutionConfig& config) { XRTMemoryManager::WorkingSet working_set(memory_manager); - TF_ASSIGN_OR_RETURN(InputBuffers input_buffers, - GetInputBuffers(&working_set, device_ref->backend(), - input_coords, release_inputs)); + TF_ASSIGN_OR_RETURN( + std::vector> input_tuples, + GetInputTuples(executable, &working_set, device_ref->backend(), + input_coords, release_inputs)); return ExecuteComputation(context, memory_manager.get(), device_ref, - executable, input_buffers, stream, rng_seed, - config); + executable, input_tuples, release_inputs, stream, + rng_seed, config); } // XRTExecuteOp @@ -653,16 +616,16 @@ Status XRTExecuteChainedOp::DoWork(OpKernelContext* context) { auto execute_op = [&](const xrt::XRTChainedExecuteOp& op, absl::Span> op_inputs) -> xla::StatusOr> { - TF_ASSIGN_OR_RETURN(InputBuffers input_buffers, - GetChainedOpInputs(op, op_inputs)); - std::unique_ptr entry; TF_RETURN_IF_ERROR(cache->Lookup(op.computation_handle(), &entry)); xla::LocalExecutable* executable = entry->get().get_executable(); - return ExecuteComputation(context, memory_manager.get(), &device_ref, - executable, input_buffers, stream, rng_seed, - 
config.common_config()); + TF_ASSIGN_OR_RETURN(std::vector> input_tuples, + GetChainedOpInputTuples(op, op_inputs)); + + return ExecuteComputation( + context, memory_manager.get(), &device_ref, executable, input_tuples, + /*release_inputs=*/false, stream, rng_seed, config.common_config()); }; return ExecuteChained(context, memory_manager, device_ref.backend(), diff --git a/tensorflow/compiler/xrt/xrt_util.cc b/tensorflow/compiler/xrt/xrt_util.cc index b8a0afc92c5..926ba23c7af 100644 --- a/tensorflow/compiler/xrt/xrt_util.cc +++ b/tensorflow/compiler/xrt/xrt_util.cc @@ -221,6 +221,140 @@ xla::StatusOr> GetComputationInputs( return std::move(input_coords); } +bool InputShapeMatches(const xla::Shape& parameter_shape, + const xla::Shape& input_shape) { + auto shape_checker = [&](const xla::Shape& pshape, + const xla::ShapeIndex& index) { + if (pshape.IsArray()) { + TF_ASSIGN_OR_RETURN(const xla::Shape* ishape, + xla::ShapeUtil::TryGetSubshape(input_shape, index)); + if (pshape.rank() != ishape->rank() || + pshape.element_type() != ishape->element_type()) { + return errors::InvalidArgument("Mismatching shapes"); + } + if (pshape.is_static() && pshape.layout() != ishape->layout()) { + return errors::InvalidArgument("Mismatching layouts"); + } + for (int64 dim = 0; dim < pshape.rank(); ++dim) { + if (pshape.is_dynamic_dimension(dim)) { + if (pshape.dimensions(dim) < ishape->dimensions(dim)) { + return errors::InvalidArgument("Mismatching shapes"); + } + } else if (pshape.dimensions(dim) != ishape->dimensions(dim)) { + return errors::InvalidArgument("Mismatching shapes"); + } + } + } + return Status::OK(); + }; + return xla::ShapeUtil::ForEachSubshapeWithStatus(parameter_shape, + shape_checker) + .ok(); +} + +xla::StatusOr>> GetInputTupleAllocations( + const std::vector& input_coords, + XRTMemoryManager::WorkingSet* working_set, xla::Backend* backend, + int64 num_input_shapes, + const std::function& shape_getter, bool release_inputs) { + if (input_coords.size() != num_input_shapes) { + return errors::InvalidArgument( + "Number of inputs does not match executable proto input shapes: ", + input_coords.size(), " vs. ", num_input_shapes); + } + std::vector> input_tuples; + input_tuples.reserve(input_coords.size()); + for (size_t i = 0; i < input_coords.size(); ++i) { + TF_RETURN_IF_ERROR( + working_set->LookupAndPin(backend, input_coords[i].handle)); + auto tuple = working_set->PinnedTuples().back(); + if (release_inputs) { + // We are holding a reference to the tuple, so we can safely delete it + // from the resource manager here. + TF_RETURN_IF_ERROR( + working_set->MemoryManager()->Release(input_coords[i].handle)); + VLOG(2) << "Released allocation handle " << input_coords[i].handle; + } + xla::Shape input_shape = shape_getter(i); + if (!InputShapeMatches(input_shape, tuple->on_host_shape())) { + return errors::InvalidArgument( + "Run-time shape mismatch for XRTExecute argument[", i, "] (", + input_coords[i].handle, "). 
Expected ", input_shape.DebugString(),
+          "; got ", tuple->on_host_shape().DebugString());
+    }
+    if (input_coords[i].index.empty()) {
+      input_tuples.emplace_back(std::move(tuple));
+    } else {
+      XRTTupleAllocation* sub_tuple;
+      TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer(
+          tuple.get(), input_coords[i].index, &sub_tuple,
+          /*alias_parent_allocation=*/true));
+      input_tuples.emplace_back(sub_tuple);
+    }
+  }
+  return std::move(input_tuples);
+}
+
+Status RebuildOutputAliases(
+    const RefPtr& output_tuple,
+    absl::Span> input_tuples,
+    const xla::HloInputOutputAliasConfig& input_output_alias) {
+  auto alias_function =
+      [&](const xla::ShapeIndex& output_index,
+          const xla::HloInputOutputAliasConfig::Alias& alias) -> Status {
+    TF_RET_CHECK(alias.parameter_number < input_tuples.size());
+    return alias.kind == xla::HloInputOutputAliasConfig::AliasKind::kUserAlias
+               ? output_tuple->AliasBufferFrom(
+                     *input_tuples[alias.parameter_number],
+                     alias.parameter_index, output_index)
+               : Status::OK();
+  };
+  return input_output_alias.ForEachAliasWithStatus(alias_function);
+}
+
+xla::StatusOr> GetArgumentsBuffers(
+    const xla::HloInputOutputAliasConfig& input_output_alias,
+    absl::Span> input_tuples,
+    const std::vector& input_is_dynamic, bool release_inputs) {
+  auto is_dynamic = [&](size_t arg) {
+    return arg < input_is_dynamic.size() && input_is_dynamic[arg];
+  };
+  std::vector arguments;
+  // Don't alias dynamic inputs -- due to the underlying implementation,
+  // aliased inputs have two owners: the XRTAllocation and the return value
+  // of this function. If an argument is dynamic and its ownership is
+  // released to the output of this function, TPUExecute will free it and
+  // reallocate a new one, which creates a double-free issue where the
+  // XRTAllocation also attempts to release the buffer.
+  bool alias_outputs = release_inputs && input_tuples.size() == 1 &&
+                       input_tuples[0]->IsExclusiveOwner() && !is_dynamic(0);
+  arguments.reserve(input_tuples.size());
+  for (int64 i = 0; i < input_tuples.size(); ++i) {
+    auto alias_checker =
+        [&](const xla::ShapeIndex& index) -> xla::StatusOr {
+      // Only the buffers which the caller explicitly marked as aliased
+      // (kUserAlias) should create aliases.
+      // The XLA compiler might create opportunistic aliases (kSystemAlias),
+      // which need different handling. With a system alias we know that XLA
+      // is going to reuse a given input parameter buffer for a given output,
+      // so unless it is known at the call site that the input buffer has no
+      // further uses, a copy needs to be made at the call site. With a
+      // user-specified alias the caller tells us that they expect a given
+      // output to land over the buffers of a given parameter.
+      if (input_output_alias.ParameterAliasKind(i, index) ==
+          xla::HloInputOutputAliasConfig::AliasKind::kUserAlias) {
+        TF_RET_CHECK(!is_dynamic(i));
+        return true;
+      }
+      return alias_outputs;
+    };
+    TF_ASSIGN_OR_RETURN(xla::ExecutionInput exec_input,
+                        input_tuples[i]->ToExecutionInput(alias_checker));
+    arguments.emplace_back(std::move(exec_input));
+  }
+  return std::move(arguments);
+}
+
 Status CreateExecuteOutput(OpKernelContext* context,
                            XRTMemoryManager* memory_manager,
                            RefPtr output_tuple,
diff --git a/tensorflow/compiler/xrt/xrt_util.h b/tensorflow/compiler/xrt/xrt_util.h
index cc1480fdb00..832c106621f 100644
--- a/tensorflow/compiler/xrt/xrt_util.h
+++ b/tensorflow/compiler/xrt/xrt_util.h
@@ -23,6 +23,8 @@ limitations under the License.
#include #include "tensorflow/compiler/xla/service/backend.h" +#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla.pb.h" @@ -69,6 +71,25 @@ xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options); xla::StatusOr> GetComputationInputs( OpKernelContext* context, const char* input_name); +bool InputShapeMatches(const xla::Shape& parameter_shape, + const xla::Shape& input_shape); + +xla::StatusOr>> GetInputTupleAllocations( + const std::vector& input_coords, + XRTMemoryManager::WorkingSet* working_set, xla::Backend* backend, + int64 num_input_shapes, + const std::function& shape_getter, bool release_inputs); + +Status RebuildOutputAliases( + const RefPtr& output_tuple, + absl::Span> input_tuples, + const xla::HloInputOutputAliasConfig& input_output_alias); + +xla::StatusOr> GetArgumentsBuffers( + const xla::HloInputOutputAliasConfig& input_output_alias, + absl::Span> input_tuples, + const std::vector& input_is_dynamic, bool release_inputs); + // Create the XRT execute output tensor given the computation result // (output_tuple). The return_exploded_tuple tells whether a tuple result should // be returned as vector of handles representing each tuple child.
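
For reference, below is a small usage sketch of the tuple-aware compatibility predicate added to shape_util.h/shape_util.cc above. It is illustrative only: the shapes are invented, and it assumes the existing xla::ShapeUtil::MakeShape overload that takes per-dimension dynamic flags. The argument order mirrors the new TF_RET_CHECK in cpu_executable.cc, with the runtime (dynamic) shape first and the compile-time bounded shape second.

// Illustrative sketch only: invented shapes exercising the
// xla::ShapeUtil::DynamicShapeIsCompatible() predicate introduced above.
#include "tensorflow/compiler/xla/shape_util.h"

bool ExampleArgumentFitsBounds() {
  // Compile-time parameter shape: a tuple whose first element has a dynamic
  // leading dimension bounded by 8, i.e. (f32[<=8,128], f32[4]).
  const xla::Shape bounded = xla::ShapeUtil::MakeTupleShape(
      {xla::ShapeUtil::MakeShape(xla::F32, {8, 128},
                                 /*dynamic_dimensions=*/{true, false}),
       xla::ShapeUtil::MakeShape(xla::F32, {4})});
  // Runtime argument: the dynamic leading dimension is 5 <= 8 and the static
  // second element matches exactly, so the argument fits the bounds.
  const xla::Shape actual = xla::ShapeUtil::MakeTupleShape(
      {xla::ShapeUtil::MakeShape(xla::F32, {5, 128},
                                 /*dynamic_dimensions=*/{true, false}),
       xla::ShapeUtil::MakeShape(xla::F32, {4})});
  // True here; a leading dimension of 9 (still flagged dynamic) would exceed
  // the bound and make this return false.
  return xla::ShapeUtil::DynamicShapeIsCompatible(actual, bounded);
}

The array-level variant, DynamicArrayShapeIsCompatible, is the check UpdateDynamicInputs performs on each dynamic leaf before copying a runtime argument into a buffer sized for its compile-time bound.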