diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index af856e92e70..229827c77c8 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -242,16 +242,9 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_dataflow_analysis",
         "//tensorflow/compiler/xla/service:hlo_execution_profile",
         "//tensorflow/compiler/xla/service:logical_buffer",
-        "//tensorflow/compiler/xla/service:maybe_owning_device_memory",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
-        "//tensorflow/core/platform:env",
-        "//tensorflow/core/platform:logging",
-        "//tensorflow/core/platform:macros",
-        "//tensorflow/core/platform:mutex",
-        "//tensorflow/core/platform:platform_port",
-        "//tensorflow/core/platform:types",
         "//tensorflow/core/profiler/lib:traceme",
         "//tensorflow/stream_executor:device_memory_allocator",
         "//tensorflow/stream_executor/host:host_stream",
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 083c3d31d74..9b79e8ca8d7 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -32,7 +32,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
-#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/shape_tree.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -45,7 +44,6 @@ limitations under the License.
#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/stream_executor/device_memory_allocator.h" #include "tensorflow/stream_executor/host/host_stream.h" namespace xla { @@ -75,12 +73,11 @@ CpuExecutable::CpuExecutable( << reinterpret_cast(compute_function_); } -StatusOr, - std::vector, - std::vector>> +StatusOr, + std::vector>> CpuExecutable::CreateBufferTable( se::DeviceMemoryAllocator* memory_allocator, int device_ordinal, - std::vector> arguments) { + absl::Span arguments) { std::vector unowning_buffers( assignment_->Allocations().size()); std::vector owning_buffers( @@ -94,9 +91,8 @@ CpuExecutable::CreateBufferTable( VLOG(3) << allocation.ToString(); if (allocation.is_entry_computation_parameter()) { - unowning_buffers[i] = arguments[allocation.parameter_number()] - .element(allocation.param_shape_index()) - .AsDeviceMemoryBase(); + unowning_buffers[i] = arguments[allocation.parameter_number()]->buffer( + allocation.param_shape_index()); CHECK_EQ(allocation.size(), unowning_buffers[i].size()) << "Size mismatch on param " << allocation.parameter_number() << " at shape index " << allocation.param_shape_index().ToString(); @@ -138,17 +134,7 @@ CpuExecutable::CreateBufferTable( assignment_->GetUniqueTopLevelOutputSlice()); VLOG(3) << "result index: " << result_slice.index(); - std::vector buffers_to_free; - for (ShapeTree& argument : arguments) { - for (std::pair& buffer : argument) { - auto maybe_owning_buffer = buffer.second.Release(); - if (maybe_owning_buffer) { - buffers_to_free.push_back(std::move(*maybe_owning_buffer)); - } - } - } - return {{std::move(unowning_buffers), std::move(owning_buffers), - std::move(buffers_to_free)}}; + return {{std::move(unowning_buffers), std::move(owning_buffers)}}; } Status CpuExecutable::ExecuteComputeFunction( @@ -282,9 +268,9 @@ StatusOr CpuExecutable::CreateResultShapedBuffer( return std::move(result_buffer); } -StatusOr CpuExecutable::ExecuteAsyncOnStream( +StatusOr CpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector> arguments, + absl::Span arguments, HloExecutionProfile* hlo_execution_profile) { if (GetRootValueSet().IsAmbiguous()) { return Unimplemented("Points-to set of root instruction is ambiguous"); @@ -297,7 +283,7 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( for (int64 i = 0; i < entry_comp->num_parameters(); ++i) { const Shape& expected_shape = entry_comp->parameter_instruction(i)->shape(); - const Shape& actual_shape = arguments[i].shape(); + const Shape& actual_shape = arguments[i]->on_device_shape(); CHECK(expected_shape == actual_shape) << absl::StreamFormat( "Shape mismatch on argument %d. 
Expected %s, but was %s.", i, expected_shape.ToString(/*print_layout=*/true), @@ -311,11 +297,10 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( se::DeviceMemoryAllocator* memory_allocator = run_options->allocator(); std::vector owning_buffers; std::vector unowning_buffers; - std::vector buffers_to_release; TF_ASSIGN_OR_RETURN( - std::tie(unowning_buffers, owning_buffers, buffers_to_release), + std::tie(unowning_buffers, owning_buffers), CreateBufferTable(memory_allocator, stream->parent()->device_ordinal(), - std::move(arguments))); + arguments)); TF_ASSIGN_OR_RETURN( ScopedShapedBuffer result, @@ -354,8 +339,7 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( std::move(owning_buffers)), hlo_execution_profile}); - return ExecutionOutput(std::move(result), std::move(buffers_to_release), {}, - se::OwningDeviceMemory()); + return std::move(result); } /*static*/ int64 CpuExecutable::ShapeSizeBytes(const Shape& shape) { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h index 6f8a7c3315a..37af630a2d9 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h @@ -55,9 +55,9 @@ class CpuExecutable : public Executable { std::unique_ptr hlo_profile_index_map); ~CpuExecutable() override {} - StatusOr ExecuteAsyncOnStream( + StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector> arguments, + absl::Span arguments, HloExecutionProfile* hlo_execution_profile) override; // This should be called after set_ir_module_string. @@ -96,15 +96,11 @@ class CpuExecutable : public Executable { // allocated by this routine. This routine allocates buffers for temporary // storage and the live-out buffer into which the computation writes it // result. - // - // - buffers_to_free: buffers whose ownership was donated by the caller that - // are to be freed by the caller. - StatusOr, - std::vector, - std::vector>> + StatusOr, + std::vector>> CreateBufferTable(se::DeviceMemoryAllocator* memory_allocator, int device_ordinal, - std::vector> arguments); + absl::Span arguments); // Calls the generated function performing the computation with the given // arguments using the supplied buffers. diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index 9ece6172d12..c21721c9339 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -20,7 +20,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/dump.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" -#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/lib/core/status.h" @@ -44,36 +43,9 @@ StatusOr Executable::ExecuteOnStream( return result; } -static ShapeTree MakeMaybeOwningDeviceMemoryTree( - const ShapedBuffer& shaped_buffer) { - ShapeTree result(shaped_buffer.on_device_shape()); - auto in_it = shaped_buffer.buffers().begin(); - auto out_it = result.begin(); - for (; in_it != shaped_buffer.buffers().end(); ++in_it, ++out_it) { - DCHECK(out_it != result.end()); - out_it->second = MaybeOwningDeviceMemory(in_it->second); - } - return result; -} - -StatusOr Executable::ExecuteAsyncOnStream( - const ServiceExecutableRunOptions* run_options, - absl::Span arguments, - HloExecutionProfile* hlo_execution_profile) { - std::vector> args(arguments.size()); - auto out_it = args.begin(); - for (const ShapedBuffer* arg : arguments) { - *out_it++ = MakeMaybeOwningDeviceMemoryTree(*arg); - } - TF_ASSIGN_OR_RETURN(ExecutionOutput out, - ExecuteAsyncOnStream(run_options, std::move(args), - hlo_execution_profile)); - return out.ConsumeResult(); -} - StatusOr Executable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, - std::vector> arguments, + std::vector> arguments, HloExecutionProfile* hlo_execution_profile) { StatusOr result = ExecuteAsyncOnStream( run_options, std::move(arguments), hlo_execution_profile); @@ -83,6 +55,14 @@ StatusOr Executable::ExecuteOnStream( return result; } +StatusOr Executable::ExecuteAsyncOnStream( + const ServiceExecutableRunOptions* /*run_options*/, + std::vector> /*arguments*/, + HloExecutionProfile* /*hlo_execution_profile*/) { + return Unimplemented( + "MaybeOwningDeviceMemory version of overload is not implemented "); +} + StatusOr> Executable::ExecuteOnStreams( absl::Span run_options, absl::Span> arguments) { diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 496599e7aaf..971dab95bfd 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -160,22 +160,22 @@ class Executable { // If the hlo_execution_profile is provided as non-nullptr, profiling will be // enabled. Note that profiling is tricky to use correctly, as the profiling // objects (when they exist) must out-live the task. - StatusOr ExecuteAsyncOnStream( + virtual StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, absl::Span arguments, - HloExecutionProfile* hlo_execution_profile); + HloExecutionProfile* hlo_execution_profile) = 0; // Same as ExecuteAsyncOnStream(), but blocks waiting for the computation to // complete. StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, - std::vector> arguments, + std::vector> arguments, HloExecutionProfile* hlo_execution_profile); virtual StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector> arguments, - HloExecutionProfile* hlo_execution_profile) = 0; + std::vector> arguments, + HloExecutionProfile* hlo_execution_profile); // Same as ExecuteOnStream(), but runs this executable on multiple // streams. 
arguments[i] contains the arguments to the execution on diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 93af1cd995e..99bc0f7fee0 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -299,14 +299,11 @@ GpuExecutable::ResolveConstantGlobals(se::Stream* stream) { return &module_globals_.emplace(executor, std::move(globals)).first->second; } -StatusOr GpuExecutable::ExecuteAsyncOnStream( +StatusOr GpuExecutable::Execute( const ServiceExecutableRunOptions* run_options, - std::vector> arguments, - HloExecutionProfile* hlo_execution_profile) { - se::DeviceMemoryAllocator* const memory_allocator = run_options->allocator(); - // Force synchronous execution if the allocator requires it. - const bool block_host_until_done = - !memory_allocator->AllowsAsynchronousDeallocation(); + absl::Span arguments, + HloExecutionProfile* hlo_execution_profile, bool block_host_until_done) { + se::DeviceMemoryAllocator* memory_allocator = run_options->allocator(); if (GetRootValueSet().IsAmbiguous()) { return Unimplemented("Points-to set of root instruction is ambiguous"); @@ -337,9 +334,7 @@ StatusOr GpuExecutable::ExecuteAsyncOnStream( if (allocation.is_entry_computation_parameter()) { auto param_no = allocation.parameter_number(); se::DeviceMemoryBase buffer = - arguments[param_no] - .element(allocation.param_shape_index()) - .AsDeviceMemoryBase(); + arguments[param_no]->buffer(allocation.param_shape_index()); // All top-level buffers and sub-buffers must have an explicit, non-null // pointer, except for zero-sized buffers, which may be null. @@ -428,17 +423,19 @@ StatusOr GpuExecutable::ExecuteAsyncOnStream( })); TF_RETURN_IF_ERROR(buffer_allocations->TearDown(buffers_in_result)); - std::vector buffers_to_free; - for (ShapeTree& argument : arguments) { - for (std::pair& buffer : argument) { - auto maybe_owning_buffer = buffer.second.Release(); - if (maybe_owning_buffer) { - buffers_to_free.push_back(std::move(*maybe_owning_buffer)); - } - } - } - return ExecutionOutput(std::move(shaped_buffer), std::move(buffers_to_free), - {}, {}); + return std::move(shaped_buffer); +} + +StatusOr GpuExecutable::ExecuteAsyncOnStream( + const ServiceExecutableRunOptions* run_options, + absl::Span arguments, + HloExecutionProfile* hlo_execution_profile) { + se::DeviceMemoryAllocator* memory_allocator = run_options->allocator(); + // Force synchronous execution if the allocator requires it. + bool block_host_until_done = + !memory_allocator->AllowsAsynchronousDeallocation(); + return Execute(run_options, arguments, hlo_execution_profile, + block_host_until_done); } const InstructionValueSet& GpuExecutable::GetRootValueSet() const { diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 51e86a9f8ee..66f86d768be 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -82,9 +82,9 @@ class GpuExecutable : public Executable { // ExecuteAsyncOnStream will fail if the compute capability of the stream // doesn't match the compute capability passed to this object's constructor. 
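Note on gpu_executable.cc above: the public ExecuteAsyncOnStream is now a thin wrapper that derives block_host_until_done from the allocator (synchronous execution is forced when the allocator does not allow asynchronous deallocation) and forwards to a private Execute that does the actual work. A small standalone sketch of that split, with Allocator and RunOptions as invented stand-ins for the real StreamExecutor types:

// Standalone sketch of the wrapper/impl split; Allocator, RunOptions, and the
// int return value are illustrative stand-ins, not the real XLA API.
#include <iostream>

struct Allocator {
  bool AllowsAsynchronousDeallocation() const { return false; }
};

struct RunOptions {
  Allocator* allocator;
};

class GpuLikeExecutable {
 public:
  // Public entry point: derive the blocking policy from the allocator, then
  // delegate to the private implementation.
  int ExecuteAsyncOnStream(const RunOptions& run_options) {
    const bool block_host_until_done =
        !run_options.allocator->AllowsAsynchronousDeallocation();
    return Execute(run_options, block_host_until_done);
  }

 private:
  // Does the real work; callers never choose block_host_until_done directly.
  int Execute(const RunOptions& run_options, bool block_host_until_done) {
    // ... launch kernels on the stream ...
    if (block_host_until_done) {
      // ... the real code calls stream->BlockHostUntilDone() here ...
    }
    return 0;
  }
};

int main() {
  Allocator allocator;
  RunOptions options{&allocator};
  GpuLikeExecutable exec;
  std::cout << exec.ExecuteAsyncOnStream(options) << "\n";  // prints 0
}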
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 51e86a9f8ee..66f86d768be 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -82,9 +82,9 @@ class GpuExecutable : public Executable {
 
   // ExecuteAsyncOnStream will fail if the compute capability of the stream
   // doesn't match the compute capability passed to this object's constructor.
-  StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
-      std::vector<ShapeTree<MaybeOwningDeviceMemory>> arguments,
+      absl::Span<const ShapedBuffer* const> arguments,
      HloExecutionProfile* hlo_execution_profile) override;
 
   std::shared_ptr<const BufferAssignment> GetBufferAssignment() const {
@@ -92,6 +92,11 @@ class GpuExecutable : public Executable {
   }
 
  private:
+  StatusOr<ScopedShapedBuffer> Execute(
+      const ServiceExecutableRunOptions* run_options,
+      absl::Span<const ShapedBuffer* const> arguments,
+      HloExecutionProfile* hlo_execution_profile, bool block_host_until_done);
+
   // If `block_host_until_done` is false, execution will not block the host
   // until the kernels have completed. This is used as an optimization for
   // clients, such as Tensorflow, that use a single stream of execution for
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
index 3e82e3271bb..1c5b166a801 100644
--- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
@@ -151,8 +151,7 @@ absl::optional<ShapeIndex> HloInputOutputAliasConfig::GetAliasedOutput(
 absl::optional<HloInputOutputAliasConfig::Alias>
 HloInputOutputAliasConfig::GetAliasedParameter(
    const ShapeIndex& output_index) const {
-  CHECK(ShapeUtil::IndexIsValid(alias_.shape(), output_index))
-      << ToString() << " " << alias_.shape().ToString() << " " << output_index;
+  CHECK(ShapeUtil::IndexIsValid(alias_.shape(), output_index));
   return alias_.element(output_index);
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index 84c7982ad10..3073c68c975 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -89,15 +89,10 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_evaluator",
         "//tensorflow/compiler/xla/service:hlo_execution_profile",
         "//tensorflow/compiler/xla/service:hlo_module_config",
-        "//tensorflow/compiler/xla/service:maybe_owning_device_memory",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
-        "//tensorflow/core/platform:env",
-        "//tensorflow/core/platform:macros",
-        "//tensorflow/core/platform:mutex",
-        "//tensorflow/core/platform:types",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
     ],
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index f82a439fdb0..0dab86d986c 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -26,7 +26,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/interpreter/executor.h" -#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -40,39 +39,24 @@ namespace interpreter { InterpreterExecutable::InterpreterExecutable( std::unique_ptr hlo_module, std::unique_ptr evaluator) - : Executable(std::move(hlo_module), /*hlo_profile_printer_data=*/nullptr, + : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr, /*hlo_profile_index_map=*/nullptr), evaluator_(std::move(evaluator)) {} InterpreterExecutable::~InterpreterExecutable() {} -StatusOr InterpreterExecutable::ExecuteAsyncOnStream( +StatusOr InterpreterExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector> arguments, + absl::Span arguments, HloExecutionProfile* hlo_execution_profile) { se::Stream* stream = run_options->stream(); se::StreamExecutor* executor = stream->parent(); const se::Platform* platform = executor->platform(); - // Convert the ShapeTree to a ShapedBuffer. We do this so we can call - // TransferManager methods below. - std::vector argument_buffers; - argument_buffers.reserve(arguments.size()); - for (const ShapeTree& arg : arguments) { - argument_buffers.push_back(ShapedBuffer(arg.shape(), arg.shape(), - /*platform=*/nullptr, - /*device_ordinal=*/0)); - auto in_it = arg.begin(); - auto out_it = argument_buffers.back().buffers().begin(); - for (; in_it != arg.end(); ++in_it, ++out_it) { - out_it->second = in_it->second.AsDeviceMemoryBase(); - } - } - VLOG(1) << "Execute " << module().name(); if (VLOG_IS_ON(2)) { - for (const auto& a : argument_buffers) { - VLOG(2) << "-- argument " << a; + for (const auto& a : arguments) { + VLOG(2) << "-- argument " << *a; } } @@ -87,7 +71,7 @@ StatusOr InterpreterExecutable::ExecuteAsyncOnStream( // Check that the args have the right shape. 
   for (int64 i = 0; i < computation->num_parameters(); ++i) {
     const auto& expected_shape = computation->parameter_instruction(i)->shape();
-    const auto& actual_shape = argument_buffers[i].on_device_shape();
+    const auto& actual_shape = arguments[i]->on_device_shape();
     if (!Shape::Equal().MinorToMajorOnlyInLayout()(expected_shape,
                                                    actual_shape)) {
       return InvalidArgument(
@@ -106,7 +90,7 @@ StatusOr<ExecutionOutput> InterpreterExecutable::ExecuteAsyncOnStream(
   for (int64 p = 0; p < computation->num_parameters(); ++p) {
     TF_ASSIGN_OR_RETURN(Literal arg_literal,
                         transfer_manager->TransferLiteralFromDevice(
-                            run_options->stream(), argument_buffers[p]));
+                            run_options->stream(), *arguments[p]));
     arg_literals.push_back(std::move(arg_literal));
   }
 
@@ -135,16 +119,7 @@ StatusOr<ExecutionOutput> InterpreterExecutable::ExecuteAsyncOnStream(
     profile->set_compute_time_ns(std::max(nanoseconds, 1.0));
   }
 
-  std::vector<se::OwningDeviceMemory> buffers_to_free;
-  for (ShapeTree<MaybeOwningDeviceMemory>& argument : arguments) {
-    for (std::pair<ShapeIndex, MaybeOwningDeviceMemory>& buffer : argument) {
-      auto maybe_owning_buffer = buffer.second.Release();
-      if (maybe_owning_buffer) {
-        buffers_to_free.push_back(std::move(*maybe_owning_buffer));
-      }
-    }
-  }
-  return ExecutionOutput(std::move(result), std::move(buffers_to_free), {}, {});
+  return std::move(result);
 }
 
 /*static*/ int64 InterpreterExecutable::ShapeSizeBytes(const Shape& shape) {
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index 1bea6773fdd..ba010de76bd 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -46,9 +46,9 @@ class InterpreterExecutable : public Executable {
                         std::unique_ptr<HloEvaluator> evaluator);
   ~InterpreterExecutable() override;
 
-  StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
-      std::vector<ShapeTree<MaybeOwningDeviceMemory>> arguments,
+      absl::Span<const ShapedBuffer* const> arguments,
      HloExecutionProfile* hlo_execution_profile) override
      LOCKS_EXCLUDED(evaluator_lock_);
 
diff --git a/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc b/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc
index c4bf48bcc00..5fe5fea71ac 100644
--- a/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc
+++ b/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc
@@ -17,8 +17,7 @@ limitations under the License.
 #include "absl/types/variant.h"
 
 namespace xla {
-tensorflow::se::DeviceMemoryBase MaybeOwningDeviceMemory::AsDeviceMemoryBase()
-    const {
+tensorflow::se::DeviceMemoryBase MaybeOwningDeviceMemory::AsDeviceMemoryBase() {
   if (HasOwnership()) {
     return *absl::get<tensorflow::se::OwningDeviceMemory>(mem_);
   } else {
diff --git a/tensorflow/compiler/xla/service/maybe_owning_device_memory.h b/tensorflow/compiler/xla/service/maybe_owning_device_memory.h
index 7d23d178130..8edd64cf681 100644
--- a/tensorflow/compiler/xla/service/maybe_owning_device_memory.h
+++ b/tensorflow/compiler/xla/service/maybe_owning_device_memory.h
@@ -49,7 +49,7 @@ class MaybeOwningDeviceMemory {
 
   // Fetches the underlying DeviceMemoryBase from a MaybeOwningDeviceMemory. The
   // caller of this function is *not* responsible for freeing the memory.
-  tensorflow::se::DeviceMemoryBase AsDeviceMemoryBase() const;
+  tensorflow::se::DeviceMemoryBase AsDeviceMemoryBase();
 
   // Release the tensorflow::se::OwningDeviceMemory without freeing it, and
   // moves the ownership of the memory buffer from the object to the caller.
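Note on maybe_owning_device_memory.*: MaybeOwningDeviceMemory holds either an owning se::OwningDeviceMemory or an unowned se::DeviceMemoryBase in a variant; AsDeviceMemoryBase() hands back the unowned view and Release() gives the owned buffer to the caller when there is one. The const qualifier comes off AsDeviceMemoryBase() in this patch, presumably because building the unowned view on the owning branch needs non-const access to the stored object. A standalone sketch of the same owning-vs-borrowed pattern using std::variant and invented stand-in types (OwnedBlock, BorrowedBlock), not the real StreamExecutor classes:

// Standalone sketch; OwnedBlock and BorrowedBlock stand in for
// se::OwningDeviceMemory and se::DeviceMemoryBase.
#include <iostream>
#include <memory>
#include <optional>
#include <variant>

struct BorrowedBlock {
  void* ptr = nullptr;
};

struct OwnedBlock {
  std::unique_ptr<int[]> storage;
  BorrowedBlock AsBorrowed() { return BorrowedBlock{storage.get()}; }
};

class MaybeOwningBlock {
 public:
  explicit MaybeOwningBlock(BorrowedBlock borrowed) : mem_(borrowed) {}
  explicit MaybeOwningBlock(OwnedBlock owned) : mem_(std::move(owned)) {}

  bool HasOwnership() const {
    return std::holds_alternative<OwnedBlock>(mem_);
  }

  // Non-const, mirroring the signature change above: on the owning branch it
  // reaches into the owned object to build the unowned view.
  BorrowedBlock AsBorrowed() {
    if (HasOwnership()) {
      return std::get<OwnedBlock>(mem_).AsBorrowed();
    }
    return std::get<BorrowedBlock>(mem_);
  }

  // Hands the owned block to the caller, if there is one to hand over; the
  // wrapper keeps only a borrowed (empty) view afterwards.
  std::optional<OwnedBlock> Release() {
    if (!HasOwnership()) {
      return std::nullopt;
    }
    OwnedBlock owned = std::move(std::get<OwnedBlock>(mem_));
    mem_ = BorrowedBlock{};
    return owned;
  }

 private:
  std::variant<BorrowedBlock, OwnedBlock> mem_;
};

int main() {
  MaybeOwningBlock owning(OwnedBlock{std::make_unique<int[]>(4)});
  std::cout << owning.HasOwnership() << "\n";   // 1
  auto released = owning.Release();
  std::cout << owning.HasOwnership() << "\n";   // 0
  std::cout << released.has_value() << "\n";    // 1
}

This is the shape of the Release() loops removed from the CPU, GPU, and interpreter executables above: walk every argument, call Release(), and collect whichever buffers turn out to be owned so the caller can free them.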