From 1762bef9385b6d4d96dba69a0397618676b8b361 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Tue, 6 Aug 2019 08:51:47 -0700
Subject: [PATCH] [XLA] Consolidate Executable::ExecuteOnStream and
 ExecuteAsyncOnStream.

Remove the ExecuteOnStream virtual method; make ExecuteOnStream a non-virtual
wrapper around ExecuteAsyncOnStream. This means that backend authors have one
method to implement (ExecuteAsyncOnStream) rather than two, and it reduces the
number of code paths for running an executable.

Document that ExecuteAsyncOnStream may in fact not be async. While
undesirable, this is a quality-of-implementation issue, not a bug. Future
changes can make implementations of ExecuteAsyncOnStream truly asynchronous.

PiperOrigin-RevId: 261922907
---
 .../compiler/xla/client/local_client.cc       |  3 +-
 .../xla/service/cpu/cpu_executable.cc         | 22 ----------
 .../compiler/xla/service/cpu/cpu_executable.h | 16 +-------
 tensorflow/compiler/xla/service/executable.cc | 37 +++++++++++++++++-
 tensorflow/compiler/xla/service/executable.h  | 39 +++++++++++--------
 .../xla/service/gpu/gpu_executable.cc         | 15 ++-----
 .../compiler/xla/service/gpu/gpu_executable.h | 10 ++---
 .../xla/service/interpreter/executable.cc     |  9 +----
 .../xla/service/interpreter/executable.h      |  6 +--
 tensorflow/compiler/xla/service/service.cc    |  3 +-
 10 files changed, 70 insertions(+), 90 deletions(-)

diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 67c3e9be2ea..eaa67cd4f0a 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -213,7 +213,8 @@ StatusOr<ScopedShapedBuffer> LocalExecutable::RunAsync(
 
   TF_ASSIGN_OR_RETURN(
       ScopedShapedBuffer outputs,
-      executable_->ExecuteAsyncOnStream(&options_and_stream.first, arguments));
+      executable_->ExecuteAsyncOnStream(&options_and_stream.first, arguments,
+                                        /*hlo_execution_profile=*/nullptr));
 
   // Transfer the outputs and save the snapshot to disk.
   if (snapshot) {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 77a4a598092..9b79e8ca8d7 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -268,29 +268,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
   return std::move(result_buffer);
 }
 
-StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    absl::Span<const ShapedBuffer* const> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
-  TF_ASSIGN_OR_RETURN(
-      auto result,
-      ExecuteAsyncOnStreamImpl(run_options, arguments, hlo_execution_profile));
-  TF_RETURN_IF_ERROR(run_options->stream()->BlockHostUntilDone());
-  return std::move(result);
-}
-
 StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    absl::Span<const ShapedBuffer* const> arguments) {
-  if (hlo_profiling_enabled()) {
-    return Unimplemented(
-        "Asynchronous execution on stream with hlo profiling is not yet "
-        "supported on CPU.");
-  }
-  return ExecuteAsyncOnStreamImpl(run_options, arguments, nullptr);
-}
-
-StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl(
     const ServiceExecutableRunOptions* run_options,
     absl::Span<const ShapedBuffer* const> arguments,
     HloExecutionProfile* hlo_execution_profile) {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 169acdeffd4..37af630a2d9 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -55,15 +55,11 @@ class CpuExecutable : public Executable {
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~CpuExecutable() override {}
 
-  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       absl::Span<const ShapedBuffer* const> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      absl::Span<const ShapedBuffer* const> arguments) override;
-
   // This should be called after set_ir_module_string.
   const string& ir_module_string() const { return ir_module_string_; }
 
@@ -86,16 +82,6 @@ class CpuExecutable : public Executable {
   const BufferAssignment& buffer_assignment() const { return *assignment_; }
 
  private:
-  // This is for sharing the code between ExecuteOnStream and
-  // ExecuteAsyncOnStream.
-  //
-  // Notice that it's tricky to use correctly, as the profile object (when it
-  // exists) must out-live the task.
-  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStreamImpl(
-      const ServiceExecutableRunOptions* run_options,
-      absl::Span<const ShapedBuffer* const> arguments,
-      HloExecutionProfile* hlo_execution_profile);
-
   // Creates an array suitable for passing as the "buffer_table" argument to the
   // JIT compiled function pointer.
   //
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 2ceabc34b9e..cbf2ad02a2a 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -29,6 +29,38 @@ limitations under the License.
 
 namespace xla {
 
+StatusOr<ScopedShapedBuffer> Executable::ExecuteOnStream(
+    const ServiceExecutableRunOptions* run_options,
+    absl::Span<const ShapedBuffer* const> arguments,
+    HloExecutionProfile* hlo_execution_profile) {
+  StatusOr<ScopedShapedBuffer> result =
+      ExecuteAsyncOnStream(run_options, arguments, hlo_execution_profile);
+  Status blocking_status = run_options->stream()->BlockHostUntilDone();
+  TF_RETURN_IF_ERROR(result.status());
+  TF_RETURN_IF_ERROR(blocking_status);
+  return result;
+}
+
+StatusOr<ExecutionOutput> Executable::ExecuteOnStream(
+    const ServiceExecutableRunOptions* run_options,
+    std::vector<ShapeTree<MaybeOwningDeviceMemory>> arguments,
+    HloExecutionProfile* hlo_execution_profile) {
+  StatusOr<ExecutionOutput> result = ExecuteAsyncOnStream(
+      run_options, std::move(arguments), hlo_execution_profile);
+  Status blocking_status = run_options->stream()->BlockHostUntilDone();
+  TF_RETURN_IF_ERROR(result.status());
+  TF_RETURN_IF_ERROR(blocking_status);
+  return result;
+}
+
+StatusOr<ExecutionOutput> Executable::ExecuteAsyncOnStream(
+    const ServiceExecutableRunOptions* /*run_options*/,
+    std::vector<ShapeTree<MaybeOwningDeviceMemory>> /*arguments*/,
+    HloExecutionProfile* /*hlo_execution_profile*/) {
+  return Unimplemented(
+      "MaybeOwningDeviceMemory version of overload is not implemented ");
+}
+
 StatusOr<std::vector<ScopedShapedBuffer>> Executable::ExecuteOnStreams(
     absl::Span<const ServiceExecutableRunOptions> run_options,
     absl::Span<const absl::Span<const ShapedBuffer* const>> arguments) {
@@ -49,8 +81,9 @@ StatusOr<std::vector<ScopedShapedBuffer>> Executable::ExecuteOnStreams(
     // We cannot BlockHostUntilDone() on the already-launched executions in case
     // of error, since if the executions communicate, the initially launched
     // executions may never complete if not all executions are running.
-    TF_ASSIGN_OR_RETURN(auto rv,
-                        ExecuteAsyncOnStream(&run_options[i], arguments[i]));
+    TF_ASSIGN_OR_RETURN(
+        auto rv, ExecuteAsyncOnStream(&run_options[i], arguments[i],
+                                      /*hlo_execution_profile=*/nullptr));
     return_values.push_back(std::move(rv));
   }
   for (const auto& options : run_options) {
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index d1ea79a9608..449b6f2668b 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -123,16 +123,10 @@ class Executable {
   // enabled.
   //
   // Returns a shaped buffer containing the result of the computation.
-  virtual StatusOr<ScopedShapedBuffer> ExecuteOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       absl::Span<const ShapedBuffer* const> arguments,
-      HloExecutionProfile* hlo_execution_profile) = 0;
-
-  // Same as ExecuteOnStream(), but this call is non-blocking and returns as
-  // soon as all of the operations are enqueued for launch on the stream.
-  virtual StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      absl::Span<const ShapedBuffer* const> arguments) = 0;
+      HloExecutionProfile* hlo_execution_profile);
 
   // Starts the given program executing on the given stream/executor.
   //
@@ -143,20 +137,31 @@ class Executable {
   //
   // If an input is donated to XLA but is not reused as output, it is returned
   // as an leftover buffer for the caller to release.
-  virtual StatusOr<ExecutionOutput> ExecuteOnStream(
+  //
+  // This call should be non-blocking and may return as soon as all of the
+  // operations are enqueued for launch on the stream. Note that some
+  // implementations may in fact block or may block in some circumstances (e.g.,
+  // when profiling); i.e., asynchronous is a "may" not a "must".
+  //
+  // If the hlo_execution_profile is provided as non-nullptr, profiling will be
+  // enabled. Note that profiling is tricky to use correctly, as the profiling
+  // objects (when they exist) must out-live the task.
+  virtual StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
+      const ServiceExecutableRunOptions* run_options,
+      absl::Span<const ShapedBuffer* const> arguments,
+      HloExecutionProfile* hlo_execution_profile) = 0;
+
+  // Same as ExecuteAsyncOnStream(), but blocks waiting for the computation to
+  // complete.
+  StatusOr<ExecutionOutput> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       std::vector<ShapeTree<MaybeOwningDeviceMemory>> arguments,
-      HloExecutionProfile* hlo_execution_profile) {
-    return Unimplemented(
-        "MaybeOwningDeviceMemory version of overload is not implemented ");
-  }
+      HloExecutionProfile* hlo_execution_profile);
 
   virtual StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      std::vector<ShapeTree<MaybeOwningDeviceMemory>> arguments) {
-    return Unimplemented(
-        "MaybeOwningDeviceMemory version of overload is not implemented ");
-  }
+      std::vector<ShapeTree<MaybeOwningDeviceMemory>> arguments,
+      HloExecutionProfile* hlo_execution_profile);
 
   // Same as ExecuteOnStream(), but runs this executable on multiple
   // streams. arguments[i] contains the arguments to the execution on
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index b85197ae45d..6a40f045eb6 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -405,25 +405,16 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::Execute(
   return std::move(shaped_buffer);
 }
 
-StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
+StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     absl::Span<const ShapedBuffer* const> arguments,
     HloExecutionProfile* hlo_execution_profile) {
-  // TODO(b/134086343): ExecuteOnStream should not be async according to the
-  // documentation, instead ExecuteAsyncOnStream should be used.
-  return Execute(run_options, arguments, hlo_execution_profile,
-                 /*block_host_until_done=*/
-                 !run_options->allocator()->AllowsAsynchronousDeallocation());
-}
-
-StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    absl::Span<const ShapedBuffer* const> arguments) {
   se::DeviceMemoryAllocator* memory_allocator = run_options->allocator();
   // Force synchronous execution if the allocator requires it.
   bool block_host_until_done =
       !memory_allocator->AllowsAsynchronousDeallocation();
-  return Execute(run_options, arguments, nullptr, block_host_until_done);
+  return Execute(run_options, arguments, hlo_execution_profile,
+                 block_host_until_done);
 }
 
 const InstructionValueSet& GpuExecutable::GetRootValueSet() const {
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 4b311b80353..0175e31568c 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -80,17 +80,13 @@ class GpuExecutable : public Executable {
   // compilation is left up to the GPU driver.
   const std::vector<uint8>& binary() const { return binary_; }
 
-  // ExecuteOnStream will fail if the compute capability of the stream doesn't
-  // match the compute capability passed to this object's constructor.
-  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
+  // ExecuteAsyncOnStream will fail if the compute capability of the stream
+  // doesn't match the compute capability passed to this object's constructor.
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       absl::Span<const ShapedBuffer* const> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      absl::Span<const ShapedBuffer* const> arguments) override;
-
   std::shared_ptr<const BufferAssignment> GetBufferAssignment() const {
     return assignment_;
   }
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index 5dae0a52404..0dab86d986c 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -45,7 +45,7 @@ InterpreterExecutable::InterpreterExecutable(
 
 InterpreterExecutable::~InterpreterExecutable() {}
 
-StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteOnStream(
+StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     absl::Span<const ShapedBuffer* const> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -122,13 +122,6 @@ StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteOnStream(
   return std::move(result);
 }
 
-StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteAsyncOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    absl::Span<const ShapedBuffer* const> arguments) {
-  return tensorflow::errors::Unimplemented(
-      "ExecuteAsyncOnStream is not yet supported on Interpreter.");
-}
-
 /*static*/ int64 InterpreterExecutable::ShapeSizeBytes(const Shape& shape) {
   if (shape.IsOpaque()) {
     return sizeof(void*);
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index bda13d37636..ba010de76bd 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -46,16 +46,12 @@ class InterpreterExecutable : public Executable {
                         std::unique_ptr<HloEvaluator> evaluator);
   ~InterpreterExecutable() override;
 
-  StatusOr<ScopedShapedBuffer> ExecuteOnStream(
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      absl::Span<const ShapedBuffer* const> arguments,
      HloExecutionProfile* hlo_execution_profile) override
      LOCKS_EXCLUDED(evaluator_lock_);
 
-  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      absl::Span<const ShapedBuffer* const> arguments) override;
-
   static int64 ShapeSizeBytes(const Shape& shape);
 
  protected:
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 823e4102d83..1353c00ef1c 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -462,7 +462,8 @@ Service::ExecuteParallelAndRegisterResult(
       // Asynchronously launch the computation.
       TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
                           executables[i]->ExecuteAsyncOnStream(
-                              &run_options, arguments[i][replica]));
+                              &run_options, arguments[i][replica],
+                              /*hlo_execution_profile=*/nullptr));
 
       if (replica == 0 && profile != nullptr) {
         streams.back()->ThenStopTimer(timers.back().get());
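
Note for backend authors: after this change a backend overrides only the
single virtual ExecuteAsyncOnStream, and synchronous callers go through the
non-virtual Executable::ExecuteOnStream wrapper added in executable.cc above,
which enqueues the work, calls BlockHostUntilDone(), and then checks both the
enqueue status and the blocking status. The sketch below is illustrative only
and is not part of the patch; Stream, RunOptions, and MyBackendExecutable are
simplified stand-ins rather than the real XLA classes, and the Status/StatusOr
error plumbing is omitted.

// Standalone sketch of the pattern this patch adopts. All types here are
// simplified stand-ins for the real XLA classes.
#include <iostream>
#include <string>

struct Stream {
  // Stand-in for se::Stream::BlockHostUntilDone(): waits for enqueued work.
  void BlockHostUntilDone() {}
};

struct RunOptions {  // Stand-in for ServiceExecutableRunOptions.
  Stream* stream;
};

class Executable {
 public:
  virtual ~Executable() = default;

  // Non-virtual blocking wrapper, mirroring Executable::ExecuteOnStream in
  // executable.cc above: enqueue asynchronously, then block the host.
  std::string ExecuteOnStream(const RunOptions* run_options) {
    std::string result = ExecuteAsyncOnStream(run_options);
    run_options->stream->BlockHostUntilDone();
    return result;
  }

  // The one method a backend author implements. Asynchrony is a "may", not
  // a "must": an implementation is allowed to block internally.
  virtual std::string ExecuteAsyncOnStream(const RunOptions* run_options) = 0;
};

class MyBackendExecutable : public Executable {
 public:
  std::string ExecuteAsyncOnStream(const RunOptions* /*run_options*/) override {
    // A real backend would enqueue its kernels on the stream and return
    // without waiting; this stand-in just produces a value.
    return "result buffer";
  }
};

int main() {
  Stream stream;
  RunOptions options{&stream};
  MyBackendExecutable executable;
  // Synchronous callers use the wrapper; asynchronous callers invoke
  // ExecuteAsyncOnStream directly and synchronize with the stream later.
  std::cout << executable.ExecuteOnStream(&options) << "\n";
  return 0;
}

Keeping the blocking wrapper non-virtual gives every backend identical
synchronous semantics and leaves a single code path to change when
implementations later become truly asynchronous.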