Merge pull request #45022 from nouiz:upstream_maybeowning

PiperOrigin-RevId: 351103206
Change-Id: Ifa9821e3dc1d4fd15c440da19a7e03d26daaa89e
commit c4b65fb314
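The first hunk makes the ShapedBuffer-span form of Executable::ExecuteAsyncOnStream virtual, and GpuExecutable now declares its own version of it. The remaining hunks funnel both argument forms that GpuExecutable can receive (an owned std::vector<ExecutionInput> and an unowned absl::Span<const ShapedBuffer* const>) through a single ExecuteAsyncOnStreamImpl taking a VariantArguments, an absl::variant over the two span types, dispatching with absl::get_if / absl::get wherever the two forms differ. Below is a minimal, self-contained sketch of that dispatch pattern; Owned and Unowned are placeholder types standing in for ExecutionInput and ShapedBuffer, not the real XLA classes.

// Minimal sketch of the variant-of-spans dispatch introduced in this commit.
// Owned/Unowned are placeholders for ExecutionInput and ShapedBuffer; only
// the absl::variant / absl::Span machinery is real.
#include <iostream>
#include <vector>

#include "absl/types/span.h"
#include "absl/types/variant.h"

struct Unowned { int value; };
struct Owned { int value; };

using VariantArgs =
    absl::variant<absl::Span<const Unowned* const>, absl::Span<Owned>>;

// Dispatch on whichever alternative the caller handed in, mirroring the
// absl::get_if / absl::get branches in BufferForAllocation below.
int ValueAt(VariantArgs args, int index) {
  if (auto* unowned = absl::get_if<absl::Span<const Unowned* const>>(&args)) {
    return (*unowned)[index]->value;
  }
  return absl::get<absl::Span<Owned>>(args)[index].value;
}

int main() {
  // Owned form: a mutable span over a vector, as absl::MakeSpan(arguments)
  // produces in ExecuteAsyncOnStream below.
  std::vector<Owned> owned = {{1}, {2}, {3}};
  std::cout << ValueAt(absl::MakeSpan(owned), 1) << "\n";  // prints 2

  // Unowned form: a span of const pointers whose storage the caller keeps.
  Unowned a{4}, b{5};
  const Unowned* const ptrs[] = {&a, &b};
  std::cout << ValueAt(absl::Span<const Unowned* const>(ptrs), 0) << "\n";  // 4
  return 0;
}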
@@ -278,7 +278,7 @@ class Executable {
   // If the hlo_execution_profile is provided as non-nullptr, profiling will be
   // enabled. Note that profiling is tricky to use correctly, as the profiling
   // objects (when they exist) must out-live the task.
-  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
+  virtual StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       absl::Span<const ShapedBuffer* const> arguments,
       HloExecutionProfile* hlo_execution_profile);
@@ -299,7 +299,7 @@ GpuExecutable::ResolveConstantGlobals(se::Stream* stream) {
 }
 
 StatusOr<se::DeviceMemoryBase> GpuExecutable::BufferForAllocation(
-    absl::Span<ExecutionInput const> arguments,
+    VariantArguments arguments,
     const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
     const BufferAllocation& allocation,
     se::DeviceMemoryAllocator* const memory_allocator, int device_ordinal,
@@ -308,10 +308,17 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::BufferForAllocation(
     return se::DeviceMemoryBase{};
   } else if (allocation.is_entry_computation_parameter()) {
     int64 param_no = allocation.parameter_number();
-    se::DeviceMemoryBase registered_buffer =
-        arguments[param_no]
-            .Buffer(allocation.param_shape_index())
-            .AsDeviceMemoryBase();
+    se::DeviceMemoryBase registered_buffer = [&] {
+      if (auto unowned_shapedbuffers =
+              absl::get_if<absl::Span<const ShapedBuffer* const>>(&arguments)) {
+        return (*unowned_shapedbuffers)[param_no]->buffers().element(
+            allocation.param_shape_index());
+      } else {
+        return absl::get<absl::Span<ExecutionInput>>(arguments)[param_no]
+            .Buffer(allocation.param_shape_index())
+            .AsDeviceMemoryBase();
+      }
+    }();
     if (registered_buffer.is_null() && registered_buffer.size() > 0) {
       return FailedPrecondition(
           "Cannot run XLA computation because pointer to (sub-)buffer at "
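The immediately-invoked lambda above lets registered_buffer still be initialized from a single expression even though the value now comes from either variant alternative; return-type deduction succeeds because both branches produce an se::DeviceMemoryBase by value. A stripped-down sketch of the same idiom, with placeholder Buffer, UnownedParam and OwnedParam types standing in for the XLA classes:

// Sketch of the immediately-invoked-lambda initializer used for
// `registered_buffer`.  Both branches copy out the same value type, so the
// lambda's deduced return type is unambiguous.  Buffer, UnownedParam and
// OwnedParam are placeholders, not the real XLA classes.
#include <cstddef>

#include "absl/types/span.h"
#include "absl/types/variant.h"

struct Buffer {
  const void* opaque = nullptr;
  std::size_t size = 0;
  bool is_null() const { return opaque == nullptr; }
};

struct UnownedParam { Buffer buffer; };
struct OwnedParam { Buffer buffer; };

using Params = absl::variant<absl::Span<const UnownedParam* const>,
                             absl::Span<OwnedParam>>;

bool ParamBufferIsNull(Params params, int param_no) {
  Buffer registered_buffer = [&] {
    if (auto* unowned =
            absl::get_if<absl::Span<const UnownedParam* const>>(&params)) {
      return (*unowned)[param_no]->buffer;  // Buffer by value
    }
    return absl::get<absl::Span<OwnedParam>>(params)[param_no].buffer;
  }();
  return registered_buffer.is_null();
}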
@@ -364,7 +371,7 @@ static Status CheckAlignment(const BufferAllocation& allocation,
 }
 
 StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
-    absl::Span<ExecutionInput const> arguments,
+    VariantArguments arguments,
     const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
     se::DeviceMemoryAllocator* const memory_allocator,
     se::StreamExecutor* executor) {
@@ -391,8 +398,25 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     std::vector<ExecutionInput> arguments,
     HloExecutionProfile* hlo_execution_profile) {
-  XLA_SCOPED_LOGGING_TIMER(
-      absl::StrCat("GpuExecutable::ExecuteAsyncOnStream(", module_name_, ")"));
+  return ExecuteAsyncOnStreamImpl(run_options, absl::MakeSpan(arguments),
+                                  hlo_execution_profile);
+}
+
+StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
+    const ServiceExecutableRunOptions* run_options,
+    absl::Span<const ShapedBuffer* const> arguments,
+    HloExecutionProfile* hlo_execution_profile) {
+  TF_ASSIGN_OR_RETURN(
+      ExecutionOutput out,
+      ExecuteAsyncOnStreamImpl(run_options, arguments, hlo_execution_profile));
+  return out.ConsumeResult();
+}
+
+StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
+    const ServiceExecutableRunOptions* run_options, VariantArguments arguments,
+    HloExecutionProfile* hlo_execution_profile) {
+  XLA_SCOPED_LOGGING_TIMER(absl::StrCat(
+      "GpuExecutable::ExecuteAsyncOnStreamImpl(", module_name_, ")"));
   se::DeviceMemoryAllocator* const memory_allocator = run_options->allocator();
   // Force synchronous execution if the allocator requires it.
   const bool block_host_until_done =
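After this hunk both public ExecuteAsyncOnStream overloads are thin wrappers: the owned-argument overload forwards absl::MakeSpan(arguments) to ExecuteAsyncOnStreamImpl, while the unowned overload forwards its span and then unwraps the ExecutionOutput into a ScopedShapedBuffer via ConsumeResult(). A hedged sketch of that shape, with placeholder Runner, Owned, Unowned, Output and Result types, and with error handling (StatusOr, TF_ASSIGN_OR_RETURN) omitted:

// Sketch of the two-thin-overloads-over-one-Impl shape used above.  Runner,
// Owned, Unowned, Output and Result are placeholders, not the XLA types.
#include <utility>
#include <vector>

#include "absl/types/span.h"
#include "absl/types/variant.h"

struct Owned {};
struct Unowned {};

struct Result {};
struct Output {
  Result result;
  Result ConsumeResult() { return std::move(result); }  // mirrors ExecutionOutput
};

class Runner {
 public:
  // Owned form: the overload keeps the vector alive and hands Impl a span.
  Output Run(std::vector<Owned> arguments) {
    return RunImpl(absl::MakeSpan(arguments));
  }

  // Unowned form: forward the span, then keep only the plain result.
  Result Run(absl::Span<const Unowned* const> arguments) {
    Output out = RunImpl(arguments);
    return out.ConsumeResult();
  }

 private:
  using VariantArgs =
      absl::variant<absl::Span<const Unowned* const>, absl::Span<Owned>>;

  Output RunImpl(VariantArgs arguments) {
    // All of the real work (and the absl::get_if dispatch) lives here.
    (void)arguments;
    return Output{};
  }
};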
@@ -443,18 +467,31 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
             << " @ index: " << index.ToString();
 
     if (output_info.alias_config) {
-      ExecutionInput& input = arguments[allocation->parameter_number()];
       MaybeOwningDeviceMemory* maybe_owning_memory =
-          input.MutableBuffer(allocation->param_shape_index());
-      if (output_info.alias_config->must_alias() &&
+          [&]() -> xla::MaybeOwningDeviceMemory* {
+        // ScopedBuffer is never an owned buffer.
+        if (auto* unowned_shapedbuffers =
+                absl::get_if<absl::Span<const ShapedBuffer* const>>(
+                    &arguments)) {
+          return nullptr;
+        } else {
+          auto unowned_execution_input =
+              absl::get<absl::Span<ExecutionInput>>(arguments);
+          ExecutionInput& input =
+              unowned_execution_input[allocation->parameter_number()];
+          return input.MutableBuffer(allocation->param_shape_index());
+        }
+      }();
+      if (output_info.alias_config->must_alias() && maybe_owning_memory &&
           !maybe_owning_memory->HasOwnership()) {
         return InvalidArgument(
             "An input was configured to be must-alias at "
             "compile time but not donated at runtime: allocation %d",
             output_info.allocation_index);
       }
-      if (absl::optional<se::OwningDeviceMemory> owning =
-              maybe_owning_memory->Release()) {
+      if (maybe_owning_memory && maybe_owning_memory->HasOwnership()) {
+        absl::optional<tensorflow::se::OwningDeviceMemory> owning =
+            maybe_owning_memory->Release();
         // If the caller passes the ownership of the device memory, reuse it
         // as the output buffer. It is up to the caller whether or not to
         // donate a buffer; the aliasing information describes which buffers
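The lambda above spells out its return type (-> xla::MaybeOwningDeviceMemory*) because one branch returns nullptr while the other returns a real pointer; with a deduced return type the branches would disagree (std::nullptr_t versus a pointer type) and the code would not compile. A small sketch of that detail, using a placeholder Slot type rather than MaybeOwningDeviceMemory:

// Sketch of why the lambda above needs an explicit trailing return type:
// one branch returns nullptr, the other a real pointer, and return-type
// deduction cannot reconcile std::nullptr_t with Slot*.  Slot and the
// variant alternatives are placeholders, not XLA types.
#include "absl/types/span.h"
#include "absl/types/variant.h"

struct Slot { bool owned = false; };

struct UnownedArg {};
struct OwnedArg { Slot slot; };

using Args =
    absl::variant<absl::Span<const UnownedArg* const>, absl::Span<OwnedArg>>;

bool ArgumentIsOwned(Args args, int index) {
  Slot* maybe_owning = [&]() -> Slot* {  // deduction would fail without "-> Slot*"
    if (absl::holds_alternative<absl::Span<const UnownedArg* const>>(args)) {
      return nullptr;  // the unowned form never donates its buffers
    }
    return &absl::get<absl::Span<OwnedArg>>(args)[index].slot;
  }();
  return maybe_owning != nullptr && maybe_owning->owned;
}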
@@ -520,7 +557,9 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
       buffer_allocations.TearDown(buffers_in_result, allocations_));
 
   // Free allocations for arguments.
-  MarkToBeReleasedArguments(absl::MakeSpan(arguments), result);
+  if (auto args = absl::get_if<absl::Span<ExecutionInput>>(&arguments)) {
+    MarkToBeReleasedArguments(*args, result);
+  }
   return std::move(result);
 }
 
@@ -116,6 +116,17 @@ class GpuExecutable : public Executable {
       std::vector<ExecutionInput> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
+      const ServiceExecutableRunOptions* run_options,
+      absl::Span<const ShapedBuffer* const> arguments,
+      HloExecutionProfile* hlo_execution_profile);
+
+  using VariantArguments = absl::variant<absl::Span<const ShapedBuffer* const>,
+                                         absl::Span<ExecutionInput>>;
+  StatusOr<ExecutionOutput> ExecuteAsyncOnStreamImpl(
+      const ServiceExecutableRunOptions* run_options,
+      VariantArguments arguments, HloExecutionProfile* hlo_execution_profile);
+
   absl::Span<const BufferAllocation> GetAllocations() const {
     return allocations_;
   }
@@ -146,13 +157,13 @@ class GpuExecutable : public Executable {
       const ServiceExecutableRunOptions* run_options);
 
   StatusOr<BufferAllocations> GenerateBufferAllocations(
-      absl::Span<ExecutionInput const> arguments,
+      VariantArguments arguments,
       const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
       se::DeviceMemoryAllocator* const memory_allocator,
       se::StreamExecutor* executor);
 
   StatusOr<se::DeviceMemoryBase> BufferForAllocation(
-      absl::Span<ExecutionInput const> arguments,
+      VariantArguments arguments,
       const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
       const BufferAllocation& allocation,
       se::DeviceMemoryAllocator* const memory_allocator, int device_ordinal,