From 26ec7a449054fbd604653b8aafd4783174e089ac Mon Sep 17 00:00:00 2001
From: Frederic Bastien
Date: Wed, 18 Nov 2020 11:31:14 -0800
Subject: [PATCH] Do not create MaybeOwningDeviceMemory and ExecutionInput
 objects in TF/XLA. This lowers XLA launch overhead.

---
 tensorflow/compiler/xla/service/executable.h  |  2 +-
 .../xla/service/gpu/gpu_executable.cc         | 61 +++++++++++++++----
 .../compiler/xla/service/gpu/gpu_executable.h | 15 ++++-
 3 files changed, 64 insertions(+), 14 deletions(-)
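Note for reviewers (not part of the patch): below is a minimal sketch of how a
caller that already holds ShapedBuffers might use the new
GpuExecutable::ExecuteAsyncOnStream overload added here, so that no
ExecutionInput or MaybeOwningDeviceMemory wrappers are built on the launch
path. The function name RunWithoutExecutionInputs and the names `executable`,
`run_options`, `arg0`, and `arg1` are illustrative placeholders supplied by the
caller; only the overload itself comes from this patch.

// Illustrative sketch only; assumes an already-compiled
// xla::gpu::GpuExecutable and valid run options and argument buffers.
#include <vector>

#include "absl/types/span.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"

xla::StatusOr<xla::ScopedShapedBuffer> RunWithoutExecutionInputs(
    xla::gpu::GpuExecutable* executable,
    const xla::ServiceExecutableRunOptions* run_options,
    const xla::ShapedBuffer& arg0, const xla::ShapedBuffer& arg1) {
  // The ShapedBuffer pointers are passed as-is; with this patch the
  // variant-based ExecuteAsyncOnStreamImpl reads them directly instead of
  // requiring each argument to be wrapped in an ExecutionInput holding
  // MaybeOwningDeviceMemory.
  std::vector<const xla::ShapedBuffer*> args = {&arg0, &arg1};
  return executable->ExecuteAsyncOnStream(run_options, args,
                                          /*hlo_execution_profile=*/nullptr);
}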
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 62d3614ab1f..80d4c2bbebf 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -278,7 +278,7 @@ class Executable {
   // If the hlo_execution_profile is provided as non-nullptr, profiling will be
   // enabled. Note that profiling is tricky to use correctly, as the profiling
   // objects (when they exist) must out-live the task.
-  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
+  virtual StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       absl::Span<const ShapedBuffer* const> arguments,
       HloExecutionProfile* hlo_execution_profile);
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 9674b6eb452..234e07ddd05 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -304,7 +304,7 @@ GpuExecutable::ResolveConstantGlobals(se::Stream* stream) {
 }
 
 StatusOr<se::DeviceMemoryBase> GpuExecutable::BufferForAllocation(
-    absl::Span<ExecutionInput const> arguments,
+    VariantArguments arguments,
     const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
     const BufferAllocation& allocation,
     se::DeviceMemoryAllocator* const memory_allocator, int device_ordinal,
@@ -313,10 +313,17 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::BufferForAllocation(
     return se::DeviceMemoryBase{};
   } else if (allocation.is_entry_computation_parameter()) {
     int64 param_no = allocation.parameter_number();
-    se::DeviceMemoryBase registered_buffer =
-        arguments[param_no]
+    se::DeviceMemoryBase registered_buffer = [&] {
+      if (auto unowned_shapedbuffers =
+              absl::get_if<absl::Span<const ShapedBuffer* const>>(&arguments)) {
+        return (*unowned_shapedbuffers)[param_no]->buffers().element(
+            allocation.param_shape_index());
+      } else {
+        return absl::get<absl::Span<ExecutionInput>>(arguments)[param_no]
             .Buffer(allocation.param_shape_index())
             .AsDeviceMemoryBase();
+      }
+    }();
     if (registered_buffer.is_null() && registered_buffer.size() > 0) {
       return FailedPrecondition(
           "Cannot run XLA computation because pointer to (sub-)buffer at "
@@ -369,7 +376,7 @@ static Status CheckAlignment(const BufferAllocation& allocation,
 }
 
 StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
-    absl::Span<ExecutionInput const> arguments,
+    VariantArguments arguments,
     const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
     se::DeviceMemoryAllocator* const memory_allocator,
     se::StreamExecutor* executor) {
@@ -396,8 +403,25 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     std::vector<ExecutionInput> arguments,
     HloExecutionProfile* hlo_execution_profile) {
+  return ExecuteAsyncOnStreamImpl(run_options, absl::MakeSpan(arguments),
+                                  hlo_execution_profile);
+}
+
+StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
+    const ServiceExecutableRunOptions* run_options,
+    absl::Span<const ShapedBuffer* const> arguments,
+    HloExecutionProfile* hlo_execution_profile) {
+  TF_ASSIGN_OR_RETURN(
+      ExecutionOutput out,
+      ExecuteAsyncOnStreamImpl(run_options, arguments, hlo_execution_profile));
+  return out.ConsumeResult();
+}
+
+StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
+    const ServiceExecutableRunOptions* run_options, VariantArguments arguments,
+    HloExecutionProfile* hlo_execution_profile) {
   XLA_SCOPED_LOGGING_TIMER(
-      absl::StrCat("GpuExecutable::ExecuteAsyncOnStream(", module_name_, ")"));
+      absl::StrCat("GpuExecutable::ExecuteAsyncOnStreamImpl(", module_name_, ")"));
   se::DeviceMemoryAllocator* const memory_allocator = run_options->allocator();
   // Force synchronous execution if the allocator requires it.
   const bool block_host_until_done =
@@ -445,18 +469,31 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
               << " @ index: " << index.ToString();
 
       if (output_info.alias_config) {
-        ExecutionInput& input = arguments[allocation->parameter_number()];
         MaybeOwningDeviceMemory* maybe_owning_memory =
-            input.MutableBuffer(allocation->param_shape_index());
+            [&]() -> xla::MaybeOwningDeviceMemory* {
+          // ScopedBuffer is never an owned buffer.
+          if (auto* unowned_shapedbuffers =
+                  absl::get_if<absl::Span<const ShapedBuffer* const>>(
+                      &arguments)) {
+            return nullptr;
+          } else {
+            auto unowned_execution_input =
+                absl::get<absl::Span<ExecutionInput>>(arguments);
+            ExecutionInput& input =
+                unowned_execution_input[allocation->parameter_number()];
+            return input.MutableBuffer(allocation->param_shape_index());
+          }
+        }();
         if (output_info.alias_config->must_alias() &&
-            !maybe_owning_memory->HasOwnership()) {
+            maybe_owning_memory && !maybe_owning_memory->HasOwnership()) {
           return InvalidArgument(
               "An input was configured to be must-alias at "
               "compile time but not donated at runtime: allocation %d",
               output_info.allocation_index);
         }
-        if (absl::optional<se::OwningDeviceMemory> owning =
-                maybe_owning_memory->Release()) {
+        if (maybe_owning_memory && maybe_owning_memory->HasOwnership()) {
+          absl::optional<se::OwningDeviceMemory> owning =
+              maybe_owning_memory->Release();
           // If the caller passes the ownership of the device memory, reuse it
           // as the output buffer. It is up to the caller whether or not to
           // donate a buffer; the aliasing information describes which buffers
@@ -522,7 +559,9 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
       buffer_allocations.TearDown(buffers_in_result, allocations_));
 
   // Free allocations for arguments.
-  MarkToBeReleasedArguments(absl::MakeSpan(arguments), result);
+  if (auto args = absl::get_if<absl::Span<ExecutionInput>>(&arguments)) {
+    MarkToBeReleasedArguments(*args, result);
+  }
   return std::move(result);
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index cbe3ee6c166..ea5ce5cde48 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -116,6 +116,17 @@ class GpuExecutable : public Executable {
       std::vector<ExecutionInput> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
+      const ServiceExecutableRunOptions* run_options,
+      absl::Span<const ShapedBuffer* const> arguments,
+      HloExecutionProfile* hlo_execution_profile);
+
+  using VariantArguments = absl::variant<absl::Span<const ShapedBuffer* const>,
+                                         absl::Span<ExecutionInput>>;
+  StatusOr<ExecutionOutput> ExecuteAsyncOnStreamImpl(
+      const ServiceExecutableRunOptions* run_options,
+      VariantArguments arguments, HloExecutionProfile* hlo_execution_profile);
+
   absl::Span<const BufferAllocation> GetAllocations() const {
     return allocations_;
   }
@@ -146,13 +157,13 @@ class GpuExecutable : public Executable {
       const ServiceExecutableRunOptions* run_options);
 
   StatusOr<BufferAllocations> GenerateBufferAllocations(
-      absl::Span<ExecutionInput const> arguments,
+      VariantArguments arguments,
      const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
       se::DeviceMemoryAllocator* const memory_allocator,
       se::StreamExecutor* executor);
 
   StatusOr<se::DeviceMemoryBase> BufferForAllocation(
-      absl::Span<ExecutionInput const> arguments,
+      VariantArguments arguments,
       const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
       const BufferAllocation& allocation,
       se::DeviceMemoryAllocator* const memory_allocator, int device_ordinal,