Split StreamExecutor impl of pjrt_client into its own files.

Also removed duplicate comments for subclass methods. PiperOrigin-RevId: 346807683 Change-Id: I2ab9b5038d5e5fc991bfee53d0081c5eecf51906
2020-12-10 10:01:36 -08:00 · 2020-12-10 10:01:36 -08:00 · ec86d80f19
commit ec86d80f19
parent d9f4007ff6
17 changed files with 830 additions and 803 deletions
--- a/tensorflow/compiler/xla/pjrt/BUILD
+++ b/tensorflow/compiler/xla/pjrt/BUILD
@ -119,12 +119,39 @@ cc_library(
 cc_library(
    name = "pjrt_client",
    srcs = ["pjrt_client.cc"],
    hdrs = ["pjrt_client.h"],
    visibility = ["//tensorflow/compiler/xla:friends"],
    deps = [
        "//tensorflow/compiler/xla:executable_run_options",
        "//tensorflow/compiler/xla:literal",
        "//tensorflow/compiler/xla:shape_util",
        "//tensorflow/compiler/xla:status",
        "//tensorflow/compiler/xla:statusor",
        "//tensorflow/compiler/xla:util",
        "//tensorflow/compiler/xla:xla_data_proto_cc",
        "//tensorflow/compiler/xla/client:executable_build_options",
        "//tensorflow/compiler/xla/client:xla_computation",
        "//tensorflow/compiler/xla/pjrt/distributed:protocol_proto_cc",
        "//tensorflow/compiler/xla/service:hlo",
        "//tensorflow/compiler/xla/service:hlo_cost_analysis",
        "//tensorflow/core:lib",
        "@com_google_absl//absl/base",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/types:optional",
        "@com_google_absl//absl/types:span",
    ],
 )
 cc_library(
    name = "pjrt_stream_executor_client",
    srcs = ["pjrt_stream_executor_client.cc"],
    hdrs = ["pjrt_stream_executor_client.h"],
    visibility = ["//tensorflow/compiler/xla:friends"],
    deps = [
        ":event_pool",
        ":local_device_state",
        ":pjrt_client",
        ":tracked_device_buffer",
        "//tensorflow/compiler/xla:cpu_function_runtime",
        "//tensorflow/compiler/xla:executable_run_options",
@ -181,7 +208,7 @@ cc_library(
    ],
    deps = [
        ":local_device_state",
-        ":pjrt_client",
+        ":pjrt_stream_executor_client",
        ":tracked_device_buffer",
        "//tensorflow/compiler/xla:shape_util",
        "//tensorflow/compiler/xla:status",
@ -215,7 +242,7 @@ cc_library(
    srcs = ["interpreter_device.cc"],
    hdrs = ["interpreter_device.h"],
    deps = [
-        ":pjrt_client",
+        ":pjrt_stream_executor_client",
        "//tensorflow/compiler/xla:statusor",
        "//tensorflow/compiler/xla/client:client_library",
        "//tensorflow/compiler/xla/service:interpreter_plugin",
@ -229,7 +256,7 @@ cc_library(
    srcs = ["cpu_device.cc"],
    hdrs = ["cpu_device.h"],
    deps = [
-        ":pjrt_client",
+        ":pjrt_stream_executor_client",
        "//tensorflow/compiler/xla:statusor",
        "//tensorflow/compiler/xla/client:client_library",
        "//tensorflow/compiler/xla/service:platform_util",
@ -242,7 +269,7 @@ cc_library(
    srcs = ["gpu_device.cc"],
    hdrs = ["gpu_device.h"],
    deps = [
-        ":pjrt_client",
+        ":pjrt_stream_executor_client",
        "@com_google_absl//absl/container:flat_hash_map",
        "//tensorflow/compiler/xla/service/gpu:gpu_executable_run_options",
        "//tensorflow/compiler/xla:statusor",
@ -279,6 +306,7 @@ tf_cc_test(
    deps = [
        ":gpu_device",
        ":pjrt_client",
        ":pjrt_stream_executor_client",
        "//tensorflow/compiler/xla:test",
        "//tensorflow/compiler/xla/client:executable_build_options",
        "//tensorflow/compiler/xla/client:xla_builder",
--- a/tensorflow/compiler/xla/pjrt/cpu_device.cc
+++ b/tensorflow/compiler/xla/pjrt/cpu_device.cc
@ -17,7 +17,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
+#include "tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 namespace xla {
--- a/tensorflow/compiler/xla/pjrt/cpu_device.h
+++ b/tensorflow/compiler/xla/pjrt/cpu_device.h
@ -18,7 +18,7 @@ limitations under the License.
 #include <memory>
-#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
+#include "tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.h"
 #include "tensorflow/compiler/xla/statusor.h"
 namespace xla {
--- a/tensorflow/compiler/xla/pjrt/gpu_device.cc
+++ b/tensorflow/compiler/xla/pjrt/gpu_device.cc
@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/pjrt/gpu_device.h"
 #include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.h"
 #ifdef NCCL_ENABLED
 #include "third_party/nccl/nccl.h"
--- a/tensorflow/compiler/xla/pjrt/gpu_device.h
+++ b/tensorflow/compiler/xla/pjrt/gpu_device.h
@ -19,7 +19,7 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/compiler/xla/pjrt/distributed/client.h"
-#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
+#include "tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
--- a/tensorflow/compiler/xla/pjrt/interpreter_device.cc
+++ b/tensorflow/compiler/xla/pjrt/interpreter_device.cc
@ -17,7 +17,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
+#include "tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 namespace xla {
--- a/tensorflow/compiler/xla/pjrt/interpreter_device.h
+++ b/tensorflow/compiler/xla/pjrt/interpreter_device.h
@ -18,7 +18,7 @@ limitations under the License.
 #include <memory>
-#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
+#include "tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.h"
 #include "tensorflow/compiler/xla/statusor.h"
 namespace xla {
--- a/tensorflow/compiler/xla/pjrt/pjrt_client.h
+++ b/tensorflow/compiler/xla/pjrt/pjrt_client.h
@ -20,30 +20,19 @@ limitations under the License.
 #include <string>
 #include <vector>
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/synchronization/notification.h"
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout.h"
-#include "tensorflow/compiler/xla/pjrt/local_device_state.h"
+#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
 #include "tensorflow/compiler/xla/pjrt/tracked_device_buffer.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/casts.h"
 #include "tensorflow/core/platform/fingerprint.h"
@ -106,78 +95,6 @@ class PjRtDevice {
  virtual StatusOr<Literal> TransferFromOutfeed(const Shape& shape) const = 0;
 };
 class PjRtStreamExecutorDevice : public PjRtDevice {
 public:
  explicit PjRtStreamExecutorDevice(
      int id, std::unique_ptr<LocalDeviceState> local_device_state,
      std::string device_kind, int host_id = 0)
      : id_(id),
        device_ordinal_(
            local_device_state ? local_device_state->device_ordinal() : -1),
        local_device_state_(std::move(local_device_state)),
        host_id_(host_id),
        device_kind_(std::move(device_kind)) {}
  ~PjRtStreamExecutorDevice() override {}
  // Must set client exactly once.
  void SetClient(PjRtClient* client) {
    CHECK(client_ == nullptr);
    client_ = client;
  }
  // Task ID. This is always 0 on single-task setup.
  int host_id() const override { return host_id_; }
  // Return `platform_id` from client.
  PjRtPlatformId platform_id() const;
  // Return `platform_name` from client.
  const std::string& platform_name() const;
  PjRtClient* client() const override { return client_; }
  // The ID of this device. IDs are unique among devices of this type
  // (e.g. CPUs, GPUs). On multi-host platforms, this will be unique across all
  // hosts' devices.  This is the ID that should be used in a DeviceAssignment.
  int id() const override { return id_; }
  bool IsAddressable() const override { return device_ordinal_ != -1; }
  int local_hardware_id() const override { return device_ordinal_; }
  // If this is a device local to this host, returns a LocalDeviceState object
  // that can be used to manipulate the device. Returns nullptr if the device is
  // not local to this host.
  LocalDeviceState* local_device_state() const {
    return local_device_state_.get();
  }
  // If this is a device local to this host, returns a LocalDeviceState object
  // that can be used to manipulate the device. Returns an error if the device
  // is not local to this host.
  StatusOr<LocalDeviceState*> GetLocalDeviceState() const;
  // A vendor-dependent string that uniquely identifies the kind of device.
  const std::string& device_kind() const override { return device_kind_; }
  std::string DebugString() const override;
  // Transfer the given literal to the infeed queue of the given localdevice.
  Status TransferToInfeed(const LiteralSlice& literal) const override;
  // Transfer and return a value of the given shape from the outfeed of the
  // given device.
  StatusOr<Literal> TransferFromOutfeed(const Shape& shape) const override;
 private:
  const int id_;
  const int device_ordinal_;  // -1 means not local.
  const std::unique_ptr<LocalDeviceState> local_device_state_;
  const int host_id_;
  const std::string device_kind_;
  PjRtClient* client_ = nullptr;
 };
 // Forward declaration.
 class PjRtBuffer;
@ -333,181 +250,6 @@ class PjRtClient {
  virtual StatusOr<ChannelHandle> CreateHostToDeviceChannelHandle() = 0;
 };
 class PjRtStreamExecutorClient : public PjRtClient {
 public:
  // `allocator` may null, in which case the platform default allocator is used.
  explicit PjRtStreamExecutorClient(
      std::string platform_name, LocalClient* client,
      std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> devices,
      int host_id, std::unique_ptr<se::DeviceMemoryAllocator> allocator,
      std::unique_ptr<tensorflow::Allocator> host_memory_allocator,
      bool should_stage_host_to_device_transfers,
      std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options);
  ~PjRtStreamExecutorClient() override = default;
  int host_id() const override { return host_id_; }
  int device_count() const override { return devices_.size(); }
  int addressable_device_count() const override {
    return local_devices_.size();
  }
  absl::Span<PjRtDevice* const> devices() const override { return devices_; }
  absl::Span<PjRtDevice* const> local_devices() const override {
    return local_devices_;
  }
  StatusOr<PjRtDevice*> LookupDevice(int device_id) const override {
    auto it = id_to_device_.find(device_id);
    if (it != id_to_device_.end()) {
      return it->second;
    }
    return InvalidArgument("No matching device found for device_id %d",
                           device_id);
  }
  StatusOr<PjRtDevice*> LookupAddressableDevice(
      int local_hardware_id) const override;
  PjRtPlatformId platform_id() const override { return platform_id_; }
  const std::string& platform_name() const override { return platform_name_; }
  // Most platforms expect device-to-device transfers to be enqueued on the
  // source d2d stream, but some platforms use the destination d2d stream. This
  // function specifies which one the platform expects.
  virtual bool EnqueueD2DTransfersOnSrcStream() const { return true; }
  StatusOr<DeviceAssignment> GetDefaultDeviceAssignment(
      int num_replicas, int num_partitions) const override;
  StatusOr<std::unique_ptr<PjRtExecutable>> Compile(
      const XlaComputation& computation, CompileOptions options) override;
  // Generates a unique fingerprint for `executable`.
  StatusOr<absl::optional<std::string>> ExecutableFingerprint(
      const PjRtExecutable& executable) const override {
    return absl::optional<std::string>();
  }
  // Returns a backend-specific HLO cost analysis visitor.
  std::unique_ptr<HloCostAnalysis> GetHloCostAnalysis() override;
  // Creates a buffer on the device without initializing or copying any data.
  // An optional `definition_event` may be speficied that can be used to
  // ensure the buffer isn't referenced until some external mechanism has
  // initialized the data.
  // NOTE: The sequencing mechanism is not guaranteed to be supported by all
  // future backends and so callers should avoid wherever possible.
  StatusOr<std::unique_ptr<PjRtBuffer>> CreateUninitializedBuffer(
      const Shape& shape, PjRtDevice* device) override;
  StatusOr<std::unique_ptr<PjRtBuffer>> CreateUninitializedBuffer(
      const Shape& shape, PjRtDevice* device,
      std::shared_ptr<BufferSequencingEvent> definition_event);
  StatusOr<std::unique_ptr<PjRtBuffer>> BufferFromHostBuffer(
      const void* data, const Shape& shape,
      HostBufferSemantics host_buffer_semantics,
      std::shared_ptr<void> buffer_reference, PjRtDevice* device) override;
  // Note that literal must remain in scope until the transfer has completed, so
  // the caller should, for example, wait for BlockHostUntilReady() completes on
  // the return value before letting literal go out of scope.
  StatusOr<std::unique_ptr<PjRtBuffer>> BufferFromHostLiteral(
      const LiteralSlice& literal, PjRtDevice* device) override;
  // Asynchronously makes a vector of PjRtBuffers that can be used to receive
  // cross host transfers using `client` on `device'. `shapes` must be the exact
  // shapes, with identical layouts, corresponding to the buffers that will be
  // sent. When resources for the transfer are available, notifier will be
  // called with a vector of PjRtCrossHostRecvBuffer structs, one for each
  // shape in `shapes`. Each struct contains a buffer that will contain the
  // received value, and an opaque string that should be transmitted to the
  // sending host and used in a call to CopyToRemoteDevice. None of the recv
  // buffers will become ready until *all* of the sends have completed.
  void MakeCrossHostReceiveBuffers(
      absl::Span<const Shape> shapes, PjRtDevice* device,
      PjRtCrossHostRecvNotifier&& notifier) override;
  StatusOr<ChannelHandle> CreateChannelHandle() override {
    return client()->CreateChannelHandle();
  }
  StatusOr<ChannelHandle> CreateDeviceToHostChannelHandle() override {
    return client()->CreateDeviceToHostChannelHandle();
  }
  StatusOr<ChannelHandle> CreateHostToDeviceChannelHandle() override {
    return client()->CreateHostToDeviceChannelHandle();
  }
  LocalDeviceState& device_state(int device_ordinal) const {
    return *tensorflow::down_cast<PjRtStreamExecutorDevice*>(
                local_devices_.at(device_ordinal))
                ->local_device_state();
  }
  LocalClient* client() const { return client_; }
  se::DeviceMemoryAllocator* allocator() const { return allocator_; }
  tensorflow::Allocator* host_memory_allocator() const {
    return host_memory_allocator_.get();
  }
  bool should_stage_host_to_device_transfers() const {
    return should_stage_host_to_device_transfers_;
  }
  gpu::GpuExecutableRunOptions* gpu_run_options() const {
    return gpu_run_options_.get();
  }
  tensorflow::thread::ThreadPool* h2d_transfer_pool() {
    return &h2d_transfer_pool_;
  }
 protected:
  friend class PjRtStreamExecutorBuffer;
  virtual void EnqueueCrossHostReceive(
      std::vector<std::unique_ptr<PjRtBuffer>>&& buffers,
      std::shared_ptr<BufferSequencingEvent> definition_event,
      PjRtCrossHostRecvNotifier&& notifier) const {
    notifier(Unimplemented("Cross host receives not implemented."));
  }
  virtual Status CopyToRemoteDevice(
      PjRtBuffer* buffer, absl::string_view serialized_descriptor) const {
    return Unimplemented("Cross host sends not implemented.");
  }
  const PjRtPlatformId platform_id_;
  const std::string platform_name_;
  LocalClient* client_;
  // Allocator to be used for staging memory transfers to devices.
  std::unique_ptr<tensorflow::Allocator> host_memory_allocator_;
  // Includes all devices, including non-local devices on multi-host platforms.
  std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> owned_devices_;
  // Pointers to `owned_devices_`.
  std::vector<PjRtDevice*> devices_;
  // Maps Device::id() to the corresponding Device. Includes all devices.
  std::map<int, PjRtDevice*> id_to_device_;
  // Local devices indexed by local device ordinal.
  std::vector<PjRtDevice*> local_devices_;
  int host_id_;
  se::DeviceMemoryAllocator* allocator_;
  std::unique_ptr<se::DeviceMemoryAllocator> owned_allocator_;
  // Should we always prefer to stage host-to-device transfers via memory
  // allocated on host_memory_allocator_? True only on GPU, where we prefer to
  // transfer via pinned memory.
  bool should_stage_host_to_device_transfers_;
  std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options_;
  tensorflow::thread::ThreadPool h2d_transfer_pool_;
 };
 // Converts a 2D set of Device objects indexed by [replica][partition] into an
 // xla::DeviceAssignment.
 StatusOr<DeviceAssignment> DevicesToDeviceAssignment(
    absl::Span<const std::vector<PjRtDevice*>> devices);
 // Holds a reference from Python to a tuple of device buffers. A PjRtBuffer
 // can be either valid or invalid. An invalid buffer is one that has never been
 // initialized, or a buffer that has been deleted (e.g., by calling Delete, or
@ -625,393 +367,6 @@ class PjRtBuffer {
  virtual bool IsOnCpu() const = 0;
 };
 class PjRtStreamExecutorBuffer : public PjRtBuffer {
 public:
  // Helper class to retain a "hold" on a PjRtBuffer. A ScopedHold may not
  // outlive its parent PjRtBuffer.
  //
  // There are three types of hold, as follows:
  //
  // 1) Usage hold: a transient hold while an operation using the buffer is
  //    being enqueued onto a stream.
  // A client acquires a usage hold by calling
  // PjRtBuffer::GetBufferWithHold(kUsage) or the convenience wrapper
  // GetBufferWithUsageHold(). If the enqueue completes successfully the hold
  // should be released using a call to ConvertUsageHold. If the ScopedHold is
  // deleted without ConvertUsageHold being called, e.g., on error, the hold is
  // dropped. It is legal to drop a usage hold instead of calling
  // ConvertUsageHold, even if the buffer was successfully enqueued, as long as
  // the client ensures that all necessary synchronization has been done.
  //
  // 2) External hold: a potentially long-lived hold while the buffer is being
  //    shared by an external framework, e.g., NumPy.
  // A client acquires an external hold by calling
  // PjRtBuffer::GetBufferWithHold(kExternal) or the convenience wrapper
  // GetBufferWithExternalReference and releases it by deleting the ScopedHold.
  // The external framework should not modify the underlying buffer unless it is
  // confident via its own synchronization that modifications do not race with
  // reads from the PjRtBuffer.
  //
  // 3) Donation hold: a transient hold while an execution that donates the
  //    buffer is being enqueued onto the compute stream.
  // A client acquires a donation hold by calling
  // PjRtBuffer::GetBufferWithHold(kDonation). If the enqueue completes
  // successfully the hold should be released using a call to ConfirmDonation
  // after which the buffer is invalid. If the ScopedHold is deleted without
  // ConfirmDonation being called, e.g., on error, the hold is dropped and the
  // buffer remains valid. If the buffer is successfully enqueued the client
  // *must* call ConfirmDonation.
  //
  // Donation holds behave like exclusive write locks: when a donation hold
  // has been acquired, any attempt to acquire another hold of any type will
  // block until the donation hold is dropped or confirmed. Acquiring a donation
  // hold will fail with an error if there is any outstanding external hold, and
  // will block if there are any outstanding usage holds until those holds are
  // dropped or converted.
  //
  // Calls to PjRtBuffer::Release (and transitively to
  // PjRtBuffer::Delete() and ~PjRtBuffer()) will block until all usage
  // and donation holds are either deleted or converted/confirmed.
  class ScopedHold {
   public:
    enum Type { kUsage = 0, kExternalReference, kDonation, kMaxValue };
    // Use a State enum instead of encoding the state in an error Status to
    // avoid creating Status values in non-error cases. Creating a Status
    // entails several allocations and can add O(us) to every use of a hold.
    enum State {
      kUninitialized = 0,
      kValid,
      kMoved,
      kConverted,
      kReleased,
      kDonated,
      kError
    };
    ~ScopedHold();
    ScopedHold(ScopedHold&& other);
    ScopedHold(const ScopedHold&) = delete;
    ScopedHold& operator=(const ScopedHold&) = delete;
    Type type() const { return type_; }
    Status status() const {
      // Lazily create Status values only when they are requested.
      switch (state_) {
        case kUninitialized:
          return InvalidArgument("Buffer has not been initialized");
        case kValid:
          return Status::OK();
        case kMoved:
          return InvalidArgument("Buffer has been moved.");
        case kConverted:
          return InvalidArgument("Buffer has been converted");
        case kReleased:
          return InvalidArgument("Buffer has been released");
        case kDonated:
          return InvalidArgument("Buffer has been donated");
        case kError:
          return buffer_or_.status();
        default:
          CHECK(false) << "Unexpected state value " << state_;
      }
    }
    bool ok() const { return state_ == kValid; }
    // Access to the underlying device buffer storage. Requires this->ok().
    const std::shared_ptr<TrackedDeviceBuffer>& buffer() const {
      CHECK_EQ(state_, kValid);
      CHECK_NE(buffer_or_.ValueOrDie(), nullptr);
      return buffer_or_.ValueOrDie();
    }
    TrackedDeviceBuffer* operator->() const { return buffer().get(); }
    const TrackedDeviceBuffer& operator*() const { return *buffer(); }
    // Converts the hold into a usage event. Only valid for holds of type
    // kUsage.
    //
    //   usage_stream:   the stream that the buffer was used on.
    //   event:          an event that has been recorded on usage_stream after
    //                   the buffer was used.
    //   reference_held: true if and only if the caller has caused a
    //                   reference to this->buffer() to stay live until after
    //                   the host is sure that the usage (transfer or execution)
    //                   has completed.
    void ConvertUsageHold(se::Stream* usage_stream,
                          std::shared_ptr<BufferSequencingEvent> event,
                          bool reference_held);
    // Confirms that the buffer was successfully donated to an execution.
    // Only valid for holds of type kDonation. Causes the buffer to become
    // invalid.
    void ConfirmDonation();
    // Adds the held device buffers in order to 'iterator'. Used to add the
    // buffers to an ExecutionInput. We require but do not verify that
    // 'iterator' when passed in is pointing to a sub-tuple of the
    // ExecutionInput whose on_device_shape matches that of the
    // TrackedDeviceBuffer. 'end' is used to check that 'iterator' doesn't run
    // out of bounds. Donates the device buffers if the hold type is kDonation,
    // otherwise retains ownership of the device buffers.
    void AddToInput(ShapeTree<MaybeOwningDeviceMemory>::iterator* iterator,
                    const ShapeTree<MaybeOwningDeviceMemory>::iterator& end,
                    ExecutionInput* execution_input,
                    se::DeviceMemoryAllocator* allocator) const;
   private:
    friend class PjRtStreamExecutorBuffer;
    friend class PjRtStreamExecutorClient;
    // Helper struct that makes it possible to move a ScopedHold through a
    // closure.
    using ForClosure =
        std::tuple<PjRtStreamExecutorBuffer*, Type, State,
                   StatusOr<std::shared_ptr<TrackedDeviceBuffer>>>;
    ScopedHold(PjRtStreamExecutorBuffer* parent, Type type)
        : parent_(parent), type_(type), state_(kUninitialized) {}
    explicit ScopedHold(const ForClosure& closure_helper)
        : parent_(std::get<0>(closure_helper)),
          type_(std::get<1>(closure_helper)),
          state_(std::get<2>(closure_helper)),
          buffer_or_(std::get<3>(closure_helper)) {
      // Check the buffer is not in an error state.
      CHECK(buffer_or_.ValueOrDie() != nullptr);
    }
    // Sets buffer state.
    void SetState(State state) { state_ = state; }
    // Sets buffer_or_. Called by parent_ to initialize the hold.
    void Acquire(StatusOr<std::shared_ptr<TrackedDeviceBuffer>>&& buffer_or);
    // Releases the contents of *this, so *this can subsequently be
    // deleted without releasing the parent's hold. Should be passed to the
    // appropriate constructor of another ScopedHold, e.g., when a hold must be
    // passed through a closure that is incompatible with std::move.
    ForClosure ToClosure();
    PjRtStreamExecutorBuffer* const parent_;
    const Type type_;
    // There is an invariant that if ok() then
    // buffer_or_.ValueOrDie() != nullptr.
    State state_;
    StatusOr<std::shared_ptr<TrackedDeviceBuffer>> buffer_or_;
  };
  PjRtStreamExecutorBuffer(Shape on_host_shape, Shape on_device_shape,
                           std::shared_ptr<TrackedDeviceBuffer> device_buffer,
                           PjRtClient* client, PjRtDevice* device);
  ~PjRtStreamExecutorBuffer() override;
  PjRtStreamExecutorBuffer(const PjRtStreamExecutorBuffer&) = delete;
  PjRtStreamExecutorBuffer(PjRtStreamExecutorBuffer&&) = delete;
  PjRtStreamExecutorBuffer& operator=(const PjRtStreamExecutorBuffer&) = delete;
  PjRtStreamExecutorBuffer& operator=(PjRtStreamExecutorBuffer&&) = delete;
  const Shape& on_host_shape() const override { return on_host_shape_; }
  const Shape& on_device_shape() const override { return on_device_shape_; }
  PjRtStreamExecutorDevice* device() const override { return device_; }
  PjRtPlatformId platform_id() const { return client_->platform_id(); }
  const std::string& platform_name() const { return client_->platform_name(); }
  PjRtStreamExecutorClient* client() const override { return client_; }
  bool IsEmptyTuple() const {
    return on_host_shape_.IsTuple() && on_host_shape_.tuple_shapes_size() == 0;
  }
  // Returns the size of the on-device representation of this buffer in bytes.
  int64 OnDeviceSizeInBytes() const override;
  // Implement PjRtBuffer::ExternalReferenceHold a wrapped
  // ScopedHold::kExternalReference.
  class ScopedHoldAsExternalReference
      : public PjRtBuffer::ExternalReferenceHold {
   public:
    explicit ScopedHoldAsExternalReference(ScopedHold hold)
        : external_reference_(std::move(hold)) {
      CHECK(hold.type() == ScopedHold::kExternalReference);
    }
    ~ScopedHoldAsExternalReference() override = default;
    void* OpaqueDeviceMemoryDataPointer() const override {
      return external_reference_->device_memory().front().opaque();
    }
   private:
    ScopedHold external_reference_;
  };
  StatusOr<std::unique_ptr<ExternalReferenceHold>> AcquireExternalReference()
      override;
  StatusOr<absl::optional<std::shared_ptr<void>>> ReleaseDeviceMemoryOwnership(
      bool wait_for_operations_to_complete) override;
  // Returns the buffer's value as an XLA Literal. If the value has previously
  // been prefetched to the host, then returns the prefetched version, otherwise
  // copies the buffer to the host. Blocks until the value is ready. If
  // `discard_cached_copy` is true then buffer will no longer keep hold of a
  // cached copy of the literal (i.e. The reference to the host value will be
  // removed.) If a layout is passed than a literal with this layout will be
  // returned.
  using PjRtBuffer::ToLiteral;
  StatusOr<std::shared_ptr<Literal>> ToLiteral(
      bool discard_cached_copy, absl::optional<xla::Layout> layout) override;
  // Initiates a copy of the buffer to the host. Does not block waiting for
  // the transfer to complete. The value can be retrieved by a later call to
  // ToLiteral(). If a layout is passed then a cached copy with this layout will
  // be created.
  using PjRtBuffer::CopyToHostAsync;
  Status CopyToHostAsync(absl::optional<xla::Layout> layout) override;
  // Drops the buffer's reference to its associated device memory, leaving the
  // buffer in an invalid state. The memory will be freed lazily when all async
  // operations using the buffer have completed, according to the allocation
  // semantics of the underlying platform. Delete may briefly block if another
  // thread is in the process of enqueuing an operation on this buffer, but it
  // will never block for a stream operation to complete. If an external
  // framework holds a reference to the TrackedDeviceBuffer via
  // GetBufferWithExternalReference, the memory will not be freed until the
  // external framework drops the reference.
  void Delete() override;
  // True if and only if Delete or Release has previously been called.
  bool IsDeleted() override;
  // Returns a view of the PjRtBuffer device memory as a ShapedBuffer. The
  // PjRtBuffer retains ownership of the device buffers.
  StatusOr<ShapedBuffer> AsShapedBuffer() const;
  // Returns a hold on the TrackedDeviceBuffer holding the device
  // buffers. See comment on ScopedHold.
  ScopedHold GetBufferWithHold(ScopedHold::Type type);
  ScopedHold GetBufferWithUsageHold() {
    return GetBufferWithHold(ScopedHold::kUsage);
  }
  ScopedHold GetBufferWithExternalReference() {
    return GetBufferWithHold(ScopedHold::kExternalReference);
  }
  // Copies the buffer to device `dst_device`, performing a d2d transfer when
  // `dst_device` is sharing the same Client, and performing a d2h and h2d copy
  // if `dst_device` lives on a different Client.
  // Returns an error if the buffer is already on dst_device.
  StatusOr<std::unique_ptr<PjRtBuffer>> CopyToDevice(
      PjRtDevice* dst_device) override;
  // Copies the buffer to the remote device encoded in serialized_descriptor.
  // This call must be preceded by a call to MakeCrossHostReceiveBuffers on the
  // remote host's destination device. MakeCrossHostReceiveBuffers takes an
  // array of shapes to construct the destination buffers, and a callback
  // supplies an array containing both the destination buffers, and a serialized
  // descriptor for each buffer. For each destination buffer there should be a
  // matching call to src->CopyToRemoteDevice on a remote host for a src buffer
  // of the corresponding shape. serialized_descriptor is the string returned by
  // the callback along with the corresponding destination buffer.
  Status CopyToRemoteDevice(absl::string_view serialized_descriptor) override;
  // Blocks the host until the buffer's value has been computed and is ready for
  // immediate use on the device. Useful in particular for timing benchmarks.
  Status BlockHostUntilReady() override;
  // Whether this buffer is on CPU and thus allows for certain optimizations.
  bool IsOnCpu() const override;
  // Similar to Delete, drops the buffer's reference to its associated device
  // memory, leaving the buffer in an invalid state, but returns the
  // TrackedDeviceBuffer rather than freeing the device memory, so that another
  // framework can take ownership of it. The buffer returned from Release may
  // be safely dropped at any time even if it still has pending async
  // operations. The client should call BlockHostUntilReady before calling
  // Release with wait_for_operations_to_complete=false, to ensure that the host
  // has synchronized past any outstanding write operations to the buffer. If
  // wait_for_operations_to_complete=true the host will block until any
  // potentially outstanding asynchronous operations have completed before
  // returning, in which case it is safe to read or mutate the returned buffer.
  // If the buffer was shared via an external reference it is the client's
  // responsibility that accesses via that reference do not interfere with
  // accesses via the buffer returned from Release.
  StatusOr<std::shared_ptr<TrackedDeviceBuffer>> Release(
      bool wait_for_operations_to_complete);
 private:
  friend class PjRtClient;
  // The cached value of the buffer on the host, produced either from a call to
  // CopyToHost or from a call to ToLiteral. Once a value has been fetched to
  // the host, it persists Delete() is called or the PjRtBuffer is destroyed.
  struct HostValue {
    absl::Notification ready;
    // status and value are valid for reading only after `ready` has been
    // notified.
    Status status;
    std::shared_ptr<Literal> value;
  };
  // Blocks in mu_.Await until there are no more usage holds.
  void WaitForOutstandingUsageHolds() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
  // Blocks in mu_.Await until there is no donation hold.
  void WaitForOutstandingDonationHold() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
  // Adds a hold of 'type' and returns device_buffer_. Returns an error if
  // device_buffer_ is null, or if a donation hold was requested when there is
  // an outstanding external hold.
  StatusOr<std::shared_ptr<TrackedDeviceBuffer>> GetBufferForHoldLocked(
      ScopedHold::Type type) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
  // Adds a hold of hold->type() and initializes `hold` with device_buffer_.
  // Initializes hold with an error if device_buffer_ is null, or if a donation
  // hold was requested when there is an outstanding external hold.
  void AcquireHoldLocked(ScopedHold* hold) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
  // Drops a usage hold and calls device_buffer_->AddUsageEvent. Does a sanity
  // check that buffer==device_buffer_ or device_buffer_==nullptr. Called after
  // device_buffer_ was successfully enqueued on a stream.
  void ConvertUsageHold(TrackedDeviceBuffer* buffer, se::Stream* usage_stream,
                        std::shared_ptr<BufferSequencingEvent> event,
                        bool reference_held);
  // Drops a donation hold and makes *this invalid for further use. Does a
  // sanity check that buffer==device_buffer_. Called after device_buffer_ was
  // successfully donated to an execution.
  void ConfirmDonation(TrackedDeviceBuffer* device_buffer);
  // Initiates a copy of the buffer to the host. Does not block waiting for
  // the transfer to complete. A host value is returned and if
  // `discard_cached_copy` is false stored in an internal buffer so that future
  // transfers don't have to transfer the data from host again. If a layout is
  // passed then a literal of this layout will be returned and possibly cached.
  StatusOr<std::shared_ptr<HostValue>> CopyToHostAsyncInternal(
      bool discard_cached_copy, absl::optional<xla::Layout> layout);
  // Drops a hold without taking any other action. Does a sanity check that
  // buffer==device_buffer_ or device_buffer_==nullptr.
  void DropHold(ScopedHold::Type type, TrackedDeviceBuffer* buffer);
  StatusOr<std::pair<std::unique_ptr<PjRtBuffer>,
                     std::shared_ptr<BufferSequencingEvent>>>
  CopyToDeviceHelper(PjRtDevice* dst_device, LocalDeviceState* dst_local_device,
                     LocalDeviceState* transfer_local_device,
                     se::Stream* transfer_stream,
                     std::shared_ptr<TrackedDeviceBuffer> src_device_buffer);
  PjRtStreamExecutorClient* const client_;
  const Shape on_host_shape_;
  const Shape on_device_shape_;
  PjRtStreamExecutorDevice* const device_;
  mutable absl::Mutex mu_;
  std::shared_ptr<TrackedDeviceBuffer> device_buffer_ TF_GUARDED_BY(mu_);
  absl::flat_hash_map<xla::Layout, std::shared_ptr<HostValue>> host_values_
      TF_GUARDED_BY(mu_);
  std::shared_ptr<HostValue> host_value_ TF_GUARDED_BY(mu_);
  // Count of holds on the buffer.
  std::array<int, ScopedHold::Type::kMaxValue> holds_ TF_GUARDED_BY(mu_);
  // Semaphore used to ensure there is only one outstanding donation hold.
  Semaphore donation_semaphore_;
 };
 class ExecuteContext {
 public:
  virtual ~ExecuteContext() = default;
@ -1103,148 +458,6 @@ class PjRtExecutable {
  virtual void Delete() = 0;
 };
 // Wraps one or more XLA LocalExecutables (one per partition, as specified by
 // the build options).
 class PjRtStreamExecutorExecutable : public PjRtExecutable {
 public:
  PjRtStreamExecutorExecutable(
      std::vector<std::unique_ptr<LocalExecutable>> executables,
      bool parameter_is_tupled_arguments,
      std::shared_ptr<DeviceAssignment> device_assignment,
      std::vector<LogicalDeviceIds> addressable_device_logical_ids,
      std::vector<PjRtDevice*> addressable_devices,
      PjRtStreamExecutorClient* client);
  ~PjRtStreamExecutorExecutable() override = default;
  PjRtStreamExecutorClient* client() const override { return client_; }
  const std::string& name() const override;
  int num_replicas() const override {
    return executables_[0]->build_options().num_replicas();
  }
  int num_partitions() const override {
    return executables_[0]->build_options().num_partitions();
  }
  int64 SizeOfGeneratedCodeInBytes() const override {
    int64 size = 0;
    for (auto& executable : executables_) {
      size += executable->executable()->SizeOfGeneratedCodeInBytes();
    }
    return size;
  }
  const DeviceAssignment& device_assignment() const override {
    return *device_assignment_;
  }
  absl::Span<const LogicalDeviceIds> addressable_device_logical_ids()
      const override {
    return addressable_device_logical_ids_;
  }
  absl::Span<PjRtDevice* const> addressable_devices() const override {
    return addressable_devices_;
  }
  // Return an HloModule per partition.
  StatusOr<std::vector<std::shared_ptr<HloModule>>> GetHloModules()
      const override;
  StatusOr<std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>> Execute(
      absl::Span<const std::vector<PjRtBuffer*>> argument_handles,
      const ExecuteOptions& options) const override;
  StatusOr<std::vector<std::unique_ptr<PjRtBuffer>>> ExecuteSharded(
      absl::Span<PjRtBuffer* const> argument_handles, PjRtDevice* device,
      const ExecuteOptions& options) const override;
  StatusOr<std::vector<std::unique_ptr<PjRtBuffer>>> ExecutePortable(
      absl::Span<PjRtBuffer* const> argument_handles, PjRtDevice* device,
      const ExecuteOptions& options) const override;
  void Delete() override { executables_.clear(); }
  absl::Span<const std::shared_ptr<LocalExecutable>> executables() const {
    return executables_;
  }
 protected:
  bool parameter_is_tupled_arguments() const {
    return parameter_is_tupled_arguments_;
  }
 private:
  friend class PjRtStreamExecutorClient;
  // Initializes information about which arguments to which executables must be
  // donated due to aliases that were specified by the computation.
  Status SetUpDonation(bool tuple_inputs);
  virtual bool MustDonateParameter(int executable_idx, int parameter) const;
  virtual StatusOr<std::vector<ExecutionInput>>
  MakeExecutionInputsAndWaitForEvents(
      int device_ordinal, const ExecuteOptions& options,
      absl::Span<PjRtBuffer* const> argument_handles,
      absl::Span<const PjRtStreamExecutorBuffer::ScopedHold> device_buffers,
      absl::flat_hash_set<BufferSequencingEvent*>& events) const;
  StatusOr<ScopedShapedBuffer> EnqueueExecution(
      absl::Span<PjRtBuffer* const> argument_handles, int replica,
      int partition, int executable_idx, const RunId& run_id,
      const ExecuteOptions& options, PjRtDevice* device,
      std::vector<PjRtStreamExecutorBuffer::ScopedHold>* device_buffers,
      std::shared_ptr<DeviceAssignment> device_assignment) const;
  virtual std::vector<std::unique_ptr<PjRtBuffer>> MakeOutputBuffers(
      int device_ordinal, const ExecuteOptions& options,
      ScopedShapedBuffer result_buffer,
      std::shared_ptr<BufferSequencingEvent> definition_event,
      PjRtDevice* device) const;
  StatusOr<std::vector<std::unique_ptr<PjRtBuffer>>> ExecuteHelper(
      absl::Span<PjRtBuffer* const> argument_handles, int replica,
      int partition, const RunId& run_id, const ExecuteOptions& options,
      PjRtDevice* device = nullptr) const;
  // Create shared pointers so we can free them after the execution: with
  // asynchronous execution, the process being executed can outlive the
  // executable itself.
  PjRtStreamExecutorClient* const client_;
  // One executable per partition.
  std::vector<std::shared_ptr<LocalExecutable>> executables_;
  // Per-executable set of parameters that have any aliased buffers and thus
  // must be donated when executing the computation.
  std::vector<absl::flat_hash_set<int>> parameters_that_must_be_donated_;
  std::shared_ptr<DeviceAssignment> device_assignment_;
  // True if the executables were compiled expecting arguments in a single
  // tuple.
  const bool parameter_is_tupled_arguments_;
  // The replica and partition indices of device_assignment_ to be run by this
  // client. On single-host platforms without partitioning, this is all replicas
  // (i.e. addressable_device_logical_ids_[i] = (i, 0)), but this may not be the
  // case on multi-host platforms. If there are 4 replicas and 2 partitions on a
  // single host platform, size of addressable_device_logical_ids_ is 4*2 = 8.
  std::vector<LogicalDeviceIds> addressable_device_logical_ids_;
  // addressable_devices_[i] is the Device to which
  // addressable_device_logical_ids_[i] is assigned. shared_ptrs instead of
  // unique_ptrs to play well with the Python bindings (see xla.cc).
  std::vector<PjRtDevice*> addressable_devices_;
 };
 // Executables can donate buffers so that buffers can be aliased from inputs
 // to outputs. This function returns the list of parameters that must be
 // donated when executable is run. tuple_inputs reflects the option that
 // executable was compiled with.
 StatusOr<absl::flat_hash_set<int>> GetParametersThatMustBeDonated(
    const HloModule& hlo_module, bool tuple_inputs);
 }  // namespace xla
 #endif  // TENSORFLOW_COMPILER_XLA_PJRT_PJRT_CLIENT_H_
--- a/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.cc
@ -62,7 +62,7 @@ limitations under the License.
 // See the comment on LocalDeviceState::AllocationModel for a discussion of the
 // different allocation semantics on CPU, GPU, and TPU.
-#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
+#include "tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.h"
 #include <cstddef>
 #include <memory>
--- a/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.h
+++ b/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.h
@ -0,0 +1,782 @@
 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #ifndef TENSORFLOW_COMPILER_XLA_PJRT_PJRT_STREAM_EXECUTOR_CLIENT_H_
 #define TENSORFLOW_COMPILER_XLA_PJRT_PJRT_STREAM_EXECUTOR_CLIENT_H_
 #include <memory>
 #include <string>
 #include <vector>
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/synchronization/notification.h"
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout.h"
 #include "tensorflow/compiler/xla/pjrt/local_device_state.h"
 #include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
 #include "tensorflow/compiler/xla/pjrt/tracked_device_buffer.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/casts.h"
 #include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 namespace xla {
 class PjRtStreamExecutorDevice : public PjRtDevice {
 public:
  explicit PjRtStreamExecutorDevice(
      int id, std::unique_ptr<LocalDeviceState> local_device_state,
      std::string device_kind, int host_id = 0)
      : id_(id),
        device_ordinal_(
            local_device_state ? local_device_state->device_ordinal() : -1),
        local_device_state_(std::move(local_device_state)),
        host_id_(host_id),
        device_kind_(std::move(device_kind)) {}
  ~PjRtStreamExecutorDevice() override {}
  // Must set client exactly once.
  void SetClient(PjRtClient* client) {
    CHECK(client_ == nullptr);
    client_ = client;
  }
  int host_id() const override { return host_id_; }
  // Return `platform_id` from client.
  PjRtPlatformId platform_id() const;
  // Return `platform_name` from client.
  const std::string& platform_name() const;
  PjRtClient* client() const override { return client_; }
  int id() const override { return id_; }
  bool IsAddressable() const override { return device_ordinal_ != -1; }
  int local_hardware_id() const override { return device_ordinal_; }
  // If this is a device local to this host, returns a LocalDeviceState object
  // that can be used to manipulate the device. Returns nullptr if the device is
  // not local to this host.
  LocalDeviceState* local_device_state() const {
    return local_device_state_.get();
  }
  // If this is a device local to this host, returns a LocalDeviceState object
  // that can be used to manipulate the device. Returns an error if the device
  // is not local to this host.
  StatusOr<LocalDeviceState*> GetLocalDeviceState() const;
  const std::string& device_kind() const override { return device_kind_; }
  std::string DebugString() const override;
  Status TransferToInfeed(const LiteralSlice& literal) const override;
  StatusOr<Literal> TransferFromOutfeed(const Shape& shape) const override;
 private:
  const int id_;
  const int device_ordinal_;  // -1 means not local.
  const std::unique_ptr<LocalDeviceState> local_device_state_;
  const int host_id_;
  const std::string device_kind_;
  PjRtClient* client_ = nullptr;
 };
 class PjRtStreamExecutorClient : public PjRtClient {
 public:
  // `allocator` may null, in which case the platform default allocator is used.
  explicit PjRtStreamExecutorClient(
      std::string platform_name, LocalClient* client,
      std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> devices,
      int host_id, std::unique_ptr<se::DeviceMemoryAllocator> allocator,
      std::unique_ptr<tensorflow::Allocator> host_memory_allocator,
      bool should_stage_host_to_device_transfers,
      std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options);
  ~PjRtStreamExecutorClient() override = default;
  int host_id() const override { return host_id_; }
  int device_count() const override { return devices_.size(); }
  int addressable_device_count() const override {
    return local_devices_.size();
  }
  absl::Span<PjRtDevice* const> devices() const override { return devices_; }
  absl::Span<PjRtDevice* const> local_devices() const override {
    return local_devices_;
  }
  StatusOr<PjRtDevice*> LookupDevice(int device_id) const override {
    auto it = id_to_device_.find(device_id);
    if (it != id_to_device_.end()) {
      return it->second;
    }
    return InvalidArgument("No matching device found for device_id %d",
                           device_id);
  }
  StatusOr<PjRtDevice*> LookupAddressableDevice(
      int local_hardware_id) const override;
  PjRtPlatformId platform_id() const override { return platform_id_; }
  const std::string& platform_name() const override { return platform_name_; }
  // Most platforms expect device-to-device transfers to be enqueued on the
  // source d2d stream, but some platforms use the destination d2d stream. This
  // function specifies which one the platform expects.
  virtual bool EnqueueD2DTransfersOnSrcStream() const { return true; }
  StatusOr<DeviceAssignment> GetDefaultDeviceAssignment(
      int num_replicas, int num_partitions) const override;
  StatusOr<std::unique_ptr<PjRtExecutable>> Compile(
      const XlaComputation& computation, CompileOptions options) override;
  StatusOr<absl::optional<std::string>> ExecutableFingerprint(
      const PjRtExecutable& executable) const override {
    return absl::optional<std::string>();
  }
  std::unique_ptr<HloCostAnalysis> GetHloCostAnalysis() override;
  // Creates a buffer on the device without initializing or copying any data.
  // An optional `definition_event` may be speficied that can be used to
  // ensure the buffer isn't referenced until some external mechanism has
  // initialized the data.
  StatusOr<std::unique_ptr<PjRtBuffer>> CreateUninitializedBuffer(
      const Shape& shape, PjRtDevice* device) override;
  StatusOr<std::unique_ptr<PjRtBuffer>> CreateUninitializedBuffer(
      const Shape& shape, PjRtDevice* device,
      std::shared_ptr<BufferSequencingEvent> definition_event);
  StatusOr<std::unique_ptr<PjRtBuffer>> BufferFromHostBuffer(
      const void* data, const Shape& shape,
      HostBufferSemantics host_buffer_semantics,
      std::shared_ptr<void> buffer_reference, PjRtDevice* device) override;
  StatusOr<std::unique_ptr<PjRtBuffer>> BufferFromHostLiteral(
      const LiteralSlice& literal, PjRtDevice* device) override;
  void MakeCrossHostReceiveBuffers(
      absl::Span<const Shape> shapes, PjRtDevice* device,
      PjRtCrossHostRecvNotifier&& notifier) override;
  StatusOr<ChannelHandle> CreateChannelHandle() override {
    return client()->CreateChannelHandle();
  }
  StatusOr<ChannelHandle> CreateDeviceToHostChannelHandle() override {
    return client()->CreateDeviceToHostChannelHandle();
  }
  StatusOr<ChannelHandle> CreateHostToDeviceChannelHandle() override {
    return client()->CreateHostToDeviceChannelHandle();
  }
  LocalDeviceState& device_state(int device_ordinal) const {
    return *tensorflow::down_cast<PjRtStreamExecutorDevice*>(
                local_devices_.at(device_ordinal))
                ->local_device_state();
  }
  LocalClient* client() const { return client_; }
  se::DeviceMemoryAllocator* allocator() const { return allocator_; }
  tensorflow::Allocator* host_memory_allocator() const {
    return host_memory_allocator_.get();
  }
  bool should_stage_host_to_device_transfers() const {
    return should_stage_host_to_device_transfers_;
  }
  gpu::GpuExecutableRunOptions* gpu_run_options() const {
    return gpu_run_options_.get();
  }
  tensorflow::thread::ThreadPool* h2d_transfer_pool() {
    return &h2d_transfer_pool_;
  }
 protected:
  friend class PjRtStreamExecutorBuffer;
  virtual void EnqueueCrossHostReceive(
      std::vector<std::unique_ptr<PjRtBuffer>>&& buffers,
      std::shared_ptr<BufferSequencingEvent> definition_event,
      PjRtCrossHostRecvNotifier&& notifier) const {
    notifier(Unimplemented("Cross host receives not implemented."));
  }
  virtual Status CopyToRemoteDevice(
      PjRtBuffer* buffer, absl::string_view serialized_descriptor) const {
    return Unimplemented("Cross host sends not implemented.");
  }
  const PjRtPlatformId platform_id_;
  const std::string platform_name_;
  LocalClient* client_;
  // Allocator to be used for staging memory transfers to devices.
  std::unique_ptr<tensorflow::Allocator> host_memory_allocator_;
  // Includes all devices, including non-local devices on multi-host platforms.
  std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> owned_devices_;
  // Pointers to `owned_devices_`.
  std::vector<PjRtDevice*> devices_;
  // Maps Device::id() to the corresponding Device. Includes all devices.
  std::map<int, PjRtDevice*> id_to_device_;
  // Local devices indexed by local device ordinal.
  std::vector<PjRtDevice*> local_devices_;
  int host_id_;
  se::DeviceMemoryAllocator* allocator_;
  std::unique_ptr<se::DeviceMemoryAllocator> owned_allocator_;
  // Should we always prefer to stage host-to-device transfers via memory
  // allocated on host_memory_allocator_? True only on GPU, where we prefer to
  // transfer via pinned memory.
  bool should_stage_host_to_device_transfers_;
  std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options_;
  tensorflow::thread::ThreadPool h2d_transfer_pool_;
 };
 // Converts a 2D set of Device objects indexed by [replica][partition] into an
 // xla::DeviceAssignment.
 StatusOr<DeviceAssignment> DevicesToDeviceAssignment(
    absl::Span<const std::vector<PjRtDevice*>> devices);
 class PjRtStreamExecutorBuffer : public PjRtBuffer {
 public:
  // Helper class to retain a "hold" on a PjRtStreamExecutorBuffer. A ScopedHold
  // may not outlive its parent PjRtStreamExecutorBuffer.
  //
  // There are three types of hold, as follows:
  //
  // 1) Usage hold: a transient hold while an operation using the buffer is
  //    being enqueued onto a stream.
  // A client acquires a usage hold by calling
  // PjRtStreamExecutorBuffer::GetBufferWithHold(kUsage) or the convenience
  // wrapper GetBufferWithUsageHold(). If the enqueue completes successfully the
  // hold should be released using a call to ConvertUsageHold. If the ScopedHold
  // is deleted without ConvertUsageHold being called, e.g., on error, the hold
  // is dropped. It is legal to drop a usage hold instead of calling
  // ConvertUsageHold, even if the buffer was successfully enqueued, as long as
  // the client ensures that all necessary synchronization has been done.
  //
  // 2) External hold: a potentially long-lived hold while the buffer is being
  //    shared by an external framework, e.g., NumPy.
  // A client acquires an external hold by calling
  // PjRtStreamExecutorBuffer::GetBufferWithHold(kExternal) or the convenience
  // wrapper GetBufferWithExternalReference and releases it by deleting the
  // ScopedHold. The external framework should not modify the underlying buffer
  // unless it is confident via its own synchronization that modifications do
  // not race with reads from the PjRtStreamExecutorBuffer.
  //
  // 3) Donation hold: a transient hold while an execution that donates the
  //    buffer is being enqueued onto the compute stream.
  // A client acquires a donation hold by calling
  // PjRtStreamExecutorBuffer::GetBufferWithHold(kDonation). If the enqueue
  // completes successfully the hold should be released using a call to
  // ConfirmDonation after which the buffer is invalid. If the ScopedHold is
  // deleted without ConfirmDonation being called, e.g., on error, the hold is
  // dropped and the buffer remains valid. If the buffer is successfully
  // enqueued the client *must* call ConfirmDonation.
  //
  // Donation holds behave like exclusive write locks: when a donation hold
  // has been acquired, any attempt to acquire another hold of any type will
  // block until the donation hold is dropped or confirmed. Acquiring a donation
  // hold will fail with an error if there is any outstanding external hold, and
  // will block if there are any outstanding usage holds until those holds are
  // dropped or converted.
  //
  // Calls to PjRtStreamExecutorBuffer::Release (and transitively to
  // PjRtStreamExecutorBuffer::Delete() and ~PjRtStreamExecutorBuffer()) will
  // block until all usage and donation holds are either deleted or
  // converted/confirmed.
  class ScopedHold {
   public:
    enum Type { kUsage = 0, kExternalReference, kDonation, kMaxValue };
    // Use a State enum instead of encoding the state in an error Status to
    // avoid creating Status values in non-error cases. Creating a Status
    // entails several allocations and can add O(us) to every use of a hold.
    enum State {
      kUninitialized = 0,
      kValid,
      kMoved,
      kConverted,
      kReleased,
      kDonated,
      kError
    };
    ~ScopedHold();
    ScopedHold(ScopedHold&& other);
    ScopedHold(const ScopedHold&) = delete;
    ScopedHold& operator=(const ScopedHold&) = delete;
    Type type() const { return type_; }
    Status status() const {
      // Lazily create Status values only when they are requested.
      switch (state_) {
        case kUninitialized:
          return InvalidArgument("Buffer has not been initialized");
        case kValid:
          return Status::OK();
        case kMoved:
          return InvalidArgument("Buffer has been moved.");
        case kConverted:
          return InvalidArgument("Buffer has been converted");
        case kReleased:
          return InvalidArgument("Buffer has been released");
        case kDonated:
          return InvalidArgument("Buffer has been donated");
        case kError:
          return buffer_or_.status();
        default:
          CHECK(false) << "Unexpected state value " << state_;
      }
    }
    bool ok() const { return state_ == kValid; }
    // Access to the underlying device buffer storage. Requires this->ok().
    const std::shared_ptr<TrackedDeviceBuffer>& buffer() const {
      CHECK_EQ(state_, kValid);
      CHECK_NE(buffer_or_.ValueOrDie(), nullptr);
      return buffer_or_.ValueOrDie();
    }
    TrackedDeviceBuffer* operator->() const { return buffer().get(); }
    const TrackedDeviceBuffer& operator*() const { return *buffer(); }
    // Converts the hold into a usage event. Only valid for holds of type
    // kUsage.
    //
    //   usage_stream:   the stream that the buffer was used on.
    //   event:          an event that has been recorded on usage_stream after
    //                   the buffer was used.
    //   reference_held: true if and only if the caller has caused a
    //                   reference to this->buffer() to stay live until after
    //                   the host is sure that the usage (transfer or execution)
    //                   has completed.
    void ConvertUsageHold(se::Stream* usage_stream,
                          std::shared_ptr<BufferSequencingEvent> event,
                          bool reference_held);
    // Confirms that the buffer was successfully donated to an execution.
    // Only valid for holds of type kDonation. Causes the buffer to become
    // invalid.
    void ConfirmDonation();
    // Adds the held device buffers in order to 'iterator'. Used to add the
    // buffers to an ExecutionInput. We require but do not verify that
    // 'iterator' when passed in is pointing to a sub-tuple of the
    // ExecutionInput whose on_device_shape matches that of the
    // TrackedDeviceBuffer. 'end' is used to check that 'iterator' doesn't run
    // out of bounds. Donates the device buffers if the hold type is kDonation,
    // otherwise retains ownership of the device buffers.
    void AddToInput(ShapeTree<MaybeOwningDeviceMemory>::iterator* iterator,
                    const ShapeTree<MaybeOwningDeviceMemory>::iterator& end,
                    ExecutionInput* execution_input,
                    se::DeviceMemoryAllocator* allocator) const;
   private:
    friend class PjRtStreamExecutorBuffer;
    friend class PjRtStreamExecutorClient;
    // Helper struct that makes it possible to move a ScopedHold through a
    // closure.
    using ForClosure =
        std::tuple<PjRtStreamExecutorBuffer*, Type, State,
                   StatusOr<std::shared_ptr<TrackedDeviceBuffer>>>;
    ScopedHold(PjRtStreamExecutorBuffer* parent, Type type)
        : parent_(parent), type_(type), state_(kUninitialized) {}
    explicit ScopedHold(const ForClosure& closure_helper)
        : parent_(std::get<0>(closure_helper)),
          type_(std::get<1>(closure_helper)),
          state_(std::get<2>(closure_helper)),
          buffer_or_(std::get<3>(closure_helper)) {
      // Check the buffer is not in an error state.
      CHECK(buffer_or_.ValueOrDie() != nullptr);
    }
    // Sets buffer state.
    void SetState(State state) { state_ = state; }
    // Sets buffer_or_. Called by parent_ to initialize the hold.
    void Acquire(StatusOr<std::shared_ptr<TrackedDeviceBuffer>>&& buffer_or);
    // Releases the contents of *this, so *this can subsequently be
    // deleted without releasing the parent's hold. Should be passed to the
    // appropriate constructor of another ScopedHold, e.g., when a hold must be
    // passed through a closure that is incompatible with std::move.
    ForClosure ToClosure();
    PjRtStreamExecutorBuffer* const parent_;
    const Type type_;
    // There is an invariant that if ok() then
    // buffer_or_.ValueOrDie() != nullptr.
    State state_;
    StatusOr<std::shared_ptr<TrackedDeviceBuffer>> buffer_or_;
  };
  PjRtStreamExecutorBuffer(Shape on_host_shape, Shape on_device_shape,
                           std::shared_ptr<TrackedDeviceBuffer> device_buffer,
                           PjRtClient* client, PjRtDevice* device);
  ~PjRtStreamExecutorBuffer() override;
  PjRtStreamExecutorBuffer(const PjRtStreamExecutorBuffer&) = delete;
  PjRtStreamExecutorBuffer(PjRtStreamExecutorBuffer&&) = delete;
  PjRtStreamExecutorBuffer& operator=(const PjRtStreamExecutorBuffer&) = delete;
  PjRtStreamExecutorBuffer& operator=(PjRtStreamExecutorBuffer&&) = delete;
  const Shape& on_host_shape() const override { return on_host_shape_; }
  const Shape& on_device_shape() const override { return on_device_shape_; }
  PjRtStreamExecutorDevice* device() const override { return device_; }
  PjRtPlatformId platform_id() const { return client_->platform_id(); }
  const std::string& platform_name() const { return client_->platform_name(); }
  PjRtStreamExecutorClient* client() const override { return client_; }
  bool IsEmptyTuple() const {
    return on_host_shape_.IsTuple() && on_host_shape_.tuple_shapes_size() == 0;
  }
  int64 OnDeviceSizeInBytes() const override;
  // Implement PjRtBuffer::ExternalReferenceHold a wrapped
  // ScopedHold::kExternalReference.
  class ScopedHoldAsExternalReference
      : public PjRtBuffer::ExternalReferenceHold {
   public:
    explicit ScopedHoldAsExternalReference(ScopedHold hold)
        : external_reference_(std::move(hold)) {
      CHECK(hold.type() == ScopedHold::kExternalReference);
    }
    ~ScopedHoldAsExternalReference() override = default;
    void* OpaqueDeviceMemoryDataPointer() const override {
      return external_reference_->device_memory().front().opaque();
    }
   private:
    ScopedHold external_reference_;
  };
  StatusOr<std::unique_ptr<ExternalReferenceHold>> AcquireExternalReference()
      override;
  StatusOr<absl::optional<std::shared_ptr<void>>> ReleaseDeviceMemoryOwnership(
      bool wait_for_operations_to_complete) override;
  using PjRtBuffer::ToLiteral;
  StatusOr<std::shared_ptr<Literal>> ToLiteral(
      bool discard_cached_copy, absl::optional<xla::Layout> layout) override;
  using PjRtBuffer::CopyToHostAsync;
  Status CopyToHostAsync(absl::optional<xla::Layout> layout) override;
  // Drops the buffer's reference to its associated device memory, leaving the
  // buffer in an invalid state. The memory will be freed lazily when all async
  // operations using the buffer have completed, according to the allocation
  // semantics of the underlying platform. Delete may briefly block if another
  // thread is in the process of enqueuing an operation on this buffer, but it
  // will never block for a stream operation to complete. If an external
  // framework holds a reference to the TrackedDeviceBuffer via
  // GetBufferWithExternalReference, the memory will not be freed until the
  // external framework drops the reference.
  void Delete() override;
  bool IsDeleted() override;
  // Returns a view of the PjRtBuffer device memory as a ShapedBuffer. The
  // PjRtBuffer retains ownership of the device buffers.
  StatusOr<ShapedBuffer> AsShapedBuffer() const;
  // Returns a hold on the TrackedDeviceBuffer holding the device
  // buffers. See comment on ScopedHold.
  ScopedHold GetBufferWithHold(ScopedHold::Type type);
  ScopedHold GetBufferWithUsageHold() {
    return GetBufferWithHold(ScopedHold::kUsage);
  }
  ScopedHold GetBufferWithExternalReference() {
    return GetBufferWithHold(ScopedHold::kExternalReference);
  }
  StatusOr<std::unique_ptr<PjRtBuffer>> CopyToDevice(
      PjRtDevice* dst_device) override;
  Status CopyToRemoteDevice(absl::string_view serialized_descriptor) override;
  Status BlockHostUntilReady() override;
  bool IsOnCpu() const override;
  // Similar to Delete, drops the buffer's reference to its associated device
  // memory, leaving the buffer in an invalid state, but returns the
  // TrackedDeviceBuffer rather than freeing the device memory, so that another
  // framework can take ownership of it. The buffer returned from Release may
  // be safely dropped at any time even if it still has pending async
  // operations. The client should call BlockHostUntilReady before calling
  // Release with wait_for_operations_to_complete=false, to ensure that the host
  // has synchronized past any outstanding write operations to the buffer. If
  // wait_for_operations_to_complete=true the host will block until any
  // potentially outstanding asynchronous operations have completed before
  // returning, in which case it is safe to read or mutate the returned buffer.
  // If the buffer was shared via an external reference it is the client's
  // responsibility that accesses via that reference do not interfere with
  // accesses via the buffer returned from Release.
  StatusOr<std::shared_ptr<TrackedDeviceBuffer>> Release(
      bool wait_for_operations_to_complete);
 private:
  friend class PjRtClient;
  // The cached value of the buffer on the host, produced either from a call to
  // CopyToHost or from a call to ToLiteral. Once a value has been fetched to
  // the host, it persists Delete() is called or the PjRtBuffer is destroyed.
  struct HostValue {
    absl::Notification ready;
    // status and value are valid for reading only after `ready` has been
    // notified.
    Status status;
    std::shared_ptr<Literal> value;
  };
  // Blocks in mu_.Await until there are no more usage holds.
  void WaitForOutstandingUsageHolds() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
  // Blocks in mu_.Await until there is no donation hold.
  void WaitForOutstandingDonationHold() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
  // Adds a hold of 'type' and returns device_buffer_. Returns an error if
  // device_buffer_ is null, or if a donation hold was requested when there is
  // an outstanding external hold.
  StatusOr<std::shared_ptr<TrackedDeviceBuffer>> GetBufferForHoldLocked(
      ScopedHold::Type type) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
  // Adds a hold of hold->type() and initializes `hold` with device_buffer_.
  // Initializes hold with an error if device_buffer_ is null, or if a donation
  // hold was requested when there is an outstanding external hold.
  void AcquireHoldLocked(ScopedHold* hold) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
  // Drops a usage hold and calls device_buffer_->AddUsageEvent. Does a sanity
  // check that buffer==device_buffer_ or device_buffer_==nullptr. Called after
  // device_buffer_ was successfully enqueued on a stream.
  void ConvertUsageHold(TrackedDeviceBuffer* buffer, se::Stream* usage_stream,
                        std::shared_ptr<BufferSequencingEvent> event,
                        bool reference_held);
  // Drops a donation hold and makes *this invalid for further use. Does a
  // sanity check that buffer==device_buffer_. Called after device_buffer_ was
  // successfully donated to an execution.
  void ConfirmDonation(TrackedDeviceBuffer* device_buffer);
  // Initiates a copy of the buffer to the host. Does not block waiting for
  // the transfer to complete. A host value is returned and if
  // `discard_cached_copy` is false stored in an internal buffer so that future
  // transfers don't have to transfer the data from host again. If a layout is
  // passed then a literal of this layout will be returned and possibly cached.
  StatusOr<std::shared_ptr<HostValue>> CopyToHostAsyncInternal(
      bool discard_cached_copy, absl::optional<xla::Layout> layout);
  // Drops a hold without taking any other action. Does a sanity check that
  // buffer==device_buffer_ or device_buffer_==nullptr.
  void DropHold(ScopedHold::Type type, TrackedDeviceBuffer* buffer);
  StatusOr<std::pair<std::unique_ptr<PjRtBuffer>,
                     std::shared_ptr<BufferSequencingEvent>>>
  CopyToDeviceHelper(PjRtDevice* dst_device, LocalDeviceState* dst_local_device,
                     LocalDeviceState* transfer_local_device,
                     se::Stream* transfer_stream,
                     std::shared_ptr<TrackedDeviceBuffer> src_device_buffer);
  PjRtStreamExecutorClient* const client_;
  const Shape on_host_shape_;
  const Shape on_device_shape_;
  PjRtStreamExecutorDevice* const device_;
  mutable absl::Mutex mu_;
  std::shared_ptr<TrackedDeviceBuffer> device_buffer_ TF_GUARDED_BY(mu_);
  absl::flat_hash_map<xla::Layout, std::shared_ptr<HostValue>> host_values_
      TF_GUARDED_BY(mu_);
  std::shared_ptr<HostValue> host_value_ TF_GUARDED_BY(mu_);
  // Count of holds on the buffer.
  std::array<int, ScopedHold::Type::kMaxValue> holds_ TF_GUARDED_BY(mu_);
  // Semaphore used to ensure there is only one outstanding donation hold.
  Semaphore donation_semaphore_;
 };
 // Wraps one or more XLA LocalExecutables (one per partition, as specified by
 // the build options).
 class PjRtStreamExecutorExecutable : public PjRtExecutable {
 public:
  PjRtStreamExecutorExecutable(
      std::vector<std::unique_ptr<LocalExecutable>> executables,
      bool parameter_is_tupled_arguments,
      std::shared_ptr<DeviceAssignment> device_assignment,
      std::vector<LogicalDeviceIds> addressable_device_logical_ids,
      std::vector<PjRtDevice*> addressable_devices,
      PjRtStreamExecutorClient* client);
  ~PjRtStreamExecutorExecutable() override = default;
  PjRtStreamExecutorClient* client() const override { return client_; }
  const std::string& name() const override;
  int num_replicas() const override {
    return executables_[0]->build_options().num_replicas();
  }
  int num_partitions() const override {
    return executables_[0]->build_options().num_partitions();
  }
  int64 SizeOfGeneratedCodeInBytes() const override {
    int64 size = 0;
    for (auto& executable : executables_) {
      size += executable->executable()->SizeOfGeneratedCodeInBytes();
    }
    return size;
  }
  const DeviceAssignment& device_assignment() const override {
    return *device_assignment_;
  }
  absl::Span<const LogicalDeviceIds> addressable_device_logical_ids()
      const override {
    return addressable_device_logical_ids_;
  }
  absl::Span<PjRtDevice* const> addressable_devices() const override {
    return addressable_devices_;
  }
  // Return an HloModule per partition.
  StatusOr<std::vector<std::shared_ptr<HloModule>>> GetHloModules()
      const override;
  StatusOr<std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>> Execute(
      absl::Span<const std::vector<PjRtBuffer*>> argument_handles,
      const ExecuteOptions& options) const override;
  StatusOr<std::vector<std::unique_ptr<PjRtBuffer>>> ExecuteSharded(
      absl::Span<PjRtBuffer* const> argument_handles, PjRtDevice* device,
      const ExecuteOptions& options) const override;
  StatusOr<std::vector<std::unique_ptr<PjRtBuffer>>> ExecutePortable(
      absl::Span<PjRtBuffer* const> argument_handles, PjRtDevice* device,
      const ExecuteOptions& options) const override;
  void Delete() override { executables_.clear(); }
  absl::Span<const std::shared_ptr<LocalExecutable>> executables() const {
    return executables_;
  }
 protected:
  bool parameter_is_tupled_arguments() const {
    return parameter_is_tupled_arguments_;
  }
 private:
  friend class PjRtStreamExecutorClient;
  // Initializes information about which arguments to which executables must be
  // donated due to aliases that were specified by the computation.
  Status SetUpDonation(bool tuple_inputs);
  virtual bool MustDonateParameter(int executable_idx, int parameter) const;
  virtual StatusOr<std::vector<ExecutionInput>>
  MakeExecutionInputsAndWaitForEvents(
      int device_ordinal, const ExecuteOptions& options,
      absl::Span<PjRtBuffer* const> argument_handles,
      absl::Span<const PjRtStreamExecutorBuffer::ScopedHold> device_buffers,
      absl::flat_hash_set<BufferSequencingEvent*>& events) const;
  StatusOr<ScopedShapedBuffer> EnqueueExecution(
      absl::Span<PjRtBuffer* const> argument_handles, int replica,
      int partition, int executable_idx, const RunId& run_id,
      const ExecuteOptions& options, PjRtDevice* device,
      std::vector<PjRtStreamExecutorBuffer::ScopedHold>* device_buffers,
      std::shared_ptr<DeviceAssignment> device_assignment) const;
  virtual std::vector<std::unique_ptr<PjRtBuffer>> MakeOutputBuffers(
      int device_ordinal, const ExecuteOptions& options,
      ScopedShapedBuffer result_buffer,
      std::shared_ptr<BufferSequencingEvent> definition_event,
      PjRtDevice* device) const;
  StatusOr<std::vector<std::unique_ptr<PjRtBuffer>>> ExecuteHelper(
      absl::Span<PjRtBuffer* const> argument_handles, int replica,
      int partition, const RunId& run_id, const ExecuteOptions& options,
      PjRtDevice* device = nullptr) const;
  // Create shared pointers so we can free them after the execution: with
  // asynchronous execution, the process being executed can outlive the
  // executable itself.
  PjRtStreamExecutorClient* const client_;
  // One executable per partition.
  std::vector<std::shared_ptr<LocalExecutable>> executables_;
  // Per-executable set of parameters that have any aliased buffers and thus
  // must be donated when executing the computation.
  std::vector<absl::flat_hash_set<int>> parameters_that_must_be_donated_;
  std::shared_ptr<DeviceAssignment> device_assignment_;
  // True if the executables were compiled expecting arguments in a single
  // tuple.
  const bool parameter_is_tupled_arguments_;
  // The replica and partition indices of device_assignment_ to be run by this
  // client. On single-host platforms without partitioning, this is all replicas
  // (i.e. addressable_device_logical_ids_[i] = (i, 0)), but this may not be the
  // case on multi-host platforms. If there are 4 replicas and 2 partitions on a
  // single host platform, size of addressable_device_logical_ids_ is 4*2 = 8.
  std::vector<LogicalDeviceIds> addressable_device_logical_ids_;
  // addressable_devices_[i] is the Device to which
  // addressable_device_logical_ids_[i] is assigned. shared_ptrs instead of
  // unique_ptrs to play well with the Python bindings (see xla.cc).
  std::vector<PjRtDevice*> addressable_devices_;
 };
 // Executables can donate buffers so that buffers can be aliased from inputs
 // to outputs. This function returns the list of parameters that must be
 // donated when executable is run. tuple_inputs reflects the option that
 // executable was compiled with.
 StatusOr<absl::flat_hash_set<int>> GetParametersThatMustBeDonated(
    const HloModule& hlo_module, bool tuple_inputs);
 }  // namespace xla
 #endif  // TENSORFLOW_COMPILER_XLA_PJRT_PJRT_STREAM_EXECUTOR_CLIENT_H_
--- a/tensorflow/compiler/xla/pjrt/tpu_client.cc
+++ b/tensorflow/compiler/xla/pjrt/tpu_client.cc
@ -23,7 +23,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/pjrt/local_device_state.h"
-#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
+#include "tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.h"
 #include "tensorflow/compiler/xla/pjrt/tracked_device_buffer.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/shape.h"
--- a/tensorflow/compiler/xla/pjrt/tpu_client.h
+++ b/tensorflow/compiler/xla/pjrt/tpu_client.h
@ -20,7 +20,7 @@ limitations under the License.
 #include <memory>
 #include <vector>
-#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
+#include "tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/stream_executor/tpu/tpu_topology.h"
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@ -211,6 +211,7 @@ cc_library(
        "//tensorflow/compiler/xla:types",
        "//tensorflow/compiler/xla:util",
        "//tensorflow/compiler/xla/pjrt:pjrt_client",
        "//tensorflow/compiler/xla/pjrt:pjrt_stream_executor_client",  # TODO(zhangqiaorjc): Remove after adding a factory method for PjRtBuffer.
        "//tensorflow/compiler/xla/pjrt:tracked_device_buffer",
        "//tensorflow/stream_executor:device_memory",
        "//tensorflow/stream_executor:platform",
--- a/tensorflow/compiler/xla/python/dlpack.cc
+++ b/tensorflow/compiler/xla/python/dlpack.cc
@ -25,6 +25,7 @@ limitations under the License.
 #include "include/dlpack/dlpack.h"  // from @dlpack
 #include "pybind11/pytypes.h"
 #include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
 #include "tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.h"
 #include "tensorflow/compiler/xla/pjrt/tracked_device_buffer.h"
 #include "tensorflow/compiler/xla/python/python_ref_manager.h"
 #include "tensorflow/compiler/xla/python/traceback.h"
--- a/tensorflow/compiler/xla/python/outfeed_receiver.cc
+++ b/tensorflow/compiler/xla/python/outfeed_receiver.cc
@ -18,6 +18,7 @@ limitations under the License.
 #include <sys/types.h>
 #include <memory>
 #include <queue>
 #include <sstream>
 #include "absl/container/flat_hash_map.h"
--- a/tensorflow/compiler/xla/python/tpu_driver/client/BUILD
+++ b/tensorflow/compiler/xla/python/tpu_driver/client/BUILD
@ -26,7 +26,7 @@ cc_library(
        "//tensorflow/compiler/xla:util",
        "//tensorflow/compiler/xla:xla_data_proto_cc",
        "//tensorflow/compiler/xla/client:executable_build_options",
-        "//tensorflow/compiler/xla/pjrt:pjrt_client",
+        "//tensorflow/compiler/xla/pjrt:pjrt_stream_executor_client",
        "//tensorflow/compiler/xla/pjrt:semaphore",
        "//tensorflow/compiler/xla/python/tpu_driver",
        "//tensorflow/compiler/xla/python/tpu_driver:direct_tpu_driver",
--- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h
+++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h
@ -24,7 +24,7 @@ limitations under the License.
 #include "absl/synchronization/notification.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
-#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
+#include "tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.h"
 #include "tensorflow/compiler/xla/python/tpu_driver/tpu_driver.h"
 #include "tensorflow/compiler/xla/python/tpu_driver/tpu_driver.pb.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"