diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.cc b/tensorflow/compiler/xla/pjrt/pjrt_client.cc index 06ec69f44c1..126b74b9b98 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.cc +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.cc @@ -1610,6 +1610,10 @@ StatusOr PjRtExecutable::EnqueueExecution( run_options.set_run_id(run_id); run_options.set_rng_seed(device_state->GetNewPrngSeed()); run_options.set_gpu_executable_run_options(client_->gpu_run_options()); + run_options.set_launch_id(options.launch_id); + if (run_options.launch_id() != 0) { + VLOG(1) << "launch id for " << name() << ": " << run_options.launch_id(); + } // The choice of where we wait is arbitrary; the reason for the wait is // pacing to avoid problems such as memory fragmentation and running ahead @@ -2138,13 +2142,13 @@ StatusOr, Shape>> GetShardedProgramShapes( client->client()->Compile(computation, argument_layout_pointers, build_options)); - auto py_executable = absl::make_unique( + auto executable = absl::make_unique( std::move(local_executables), options.parameter_is_tupled_arguments, std::move(device_assignment), std::move(local_logical_device_ids), std::move(local_devices), client); - TF_RETURN_IF_ERROR(py_executable->SetUpDonation( - client, options.parameter_is_tupled_arguments)); - return py_executable; + TF_RETURN_IF_ERROR( + executable->SetUpDonation(client, options.parameter_is_tupled_arguments)); + return executable; } } // namespace xla diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.h b/tensorflow/compiler/xla/pjrt/pjrt_client.h index b234027adf3..bb9093a8bf7 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.h +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.h @@ -119,6 +119,8 @@ struct PjRtCrossHostRecvBuffer { using PjRtCrossHostRecvNotifier = std::function>&&)>; +class PjRtExecutable; + // Encapsulates the state of Python session with XLA. // // It is the responsibility of the client of this API to keep the PjRtClient @@ -181,6 +183,13 @@ class PjRtClient { virtual StatusOr> GetParametersThatMustBeDonated( const LocalExecutable& executable, bool tuple_inputs) const; + // Generates a unique fingerprint for `executable`. See + // PjRtExecutable::fingerprint_. + virtual StatusOr> ExecutableFingerprint( + const PjRtExecutable& executable) const { + return absl::optional(); + } + protected: friend class PjRtBuffer; virtual void EnqueueCrossHostReceive( @@ -668,6 +677,11 @@ struct ExecuteOptions { // If true, the computation must return a tuple, which will be destructured // into its elements. bool untuple_result = false; + // If non-zero, identifies this execution as part of a potentially + // multi-device launch. This can be used to detect scheduling errors, e.g. if + // multi-host programs are launched in different orders on different hosts, + // the launch IDs may be used by the runtime to detect the mismatch. + int32 launch_id = 0; }; // Represents a compiled computation that can be executed given handles to @@ -687,6 +701,8 @@ class PjRtExecutable { std::vector> local_logical_device_ids, std::vector local_devices, PjRtClient* client); + virtual ~PjRtExecutable() = default; + PjRtClient* client() const { return client_; } int num_replicas() const { @@ -744,12 +760,14 @@ class PjRtExecutable { // Initializes information about which arguments to which executables must be // donated due to aliases that were specified by the computation. Status SetUpDonation(PjRtClient* client, bool tuple_inputs); + StatusOr EnqueueExecution( absl::Span argument_handles, int replica, int partition, int executable_idx, const RunId& run_id, const ExecuteOptions& options, Device* device, std::vector* device_buffers, std::shared_ptr device_assignment) const; + StatusOr>> ExecuteHelper( absl::Span argument_handles, int replica, int partition, const RunId& run_id, const ExecuteOptions& options, diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index b8f9b8e57ca..aa55a39218d 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -202,6 +202,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla/pjrt:pjrt_client", + "//tensorflow/core/platform:fingerprint", "//tensorflow/core/profiler:protos_all_cc", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", diff --git a/tensorflow/compiler/xla/python/py_client.cc b/tensorflow/compiler/xla/python/py_client.cc index f0273d5ed4b..1f07c6e2042 100644 --- a/tensorflow/compiler/xla/python/py_client.cc +++ b/tensorflow/compiler/xla/python/py_client.cc @@ -124,15 +124,19 @@ StatusOr> PyClient::BufferFromPyval( StatusOr> PyClient::Compile( const XlaComputation& computation, CompileOptions options) { std::unique_ptr executable; + absl::optional fingerprint; { py::gil_scoped_release gil_release; TF_ASSIGN_OR_RETURN(executable, PjRtExecutable::Compile(computation, pjrt_client_.get(), std::move(options))); + TF_ASSIGN_OR_RETURN(fingerprint, + pjrt_client_->ExecutableFingerprint(*executable)); } auto traceback = Traceback::Get(); return std::make_unique( - shared_from_this(), std::move(executable), std::move(traceback)); + shared_from_this(), std::move(executable), std::move(traceback), + std::move(fingerprint)); } class ProfileBuilder { diff --git a/tensorflow/compiler/xla/python/py_executable.cc b/tensorflow/compiler/xla/python/py_executable.cc index c56fd3a89fc..b2cd2af56ea 100644 --- a/tensorflow/compiler/xla/python/py_executable.cc +++ b/tensorflow/compiler/xla/python/py_executable.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/python/py_executable.h" #include "absl/algorithm/container.h" +#include "tensorflow/core/platform/fingerprint.h" namespace xla { @@ -23,10 +24,12 @@ namespace py = pybind11; PyExecutable::PyExecutable(std::shared_ptr client, std::unique_ptr executable, - std::shared_ptr traceback) + std::shared_ptr traceback, + absl::optional fingerprint) : client_(std::move(client)), executable_(std::move(executable)), - traceback_(std::move(traceback)) { + traceback_(std::move(traceback)), + fingerprint_(std::move(fingerprint)) { CHECK(PyGILState_Check()); next_ = client_->executables_; client_->executables_ = this; @@ -34,6 +37,10 @@ PyExecutable::PyExecutable(std::shared_ptr client, if (next_) { next_->prev_ = this; } + if (fingerprint_) { + VLOG(1) << "Fingerprint for executable " << executable_->name() << ": " + << *fingerprint_; + } } PyExecutable::~PyExecutable() { @@ -65,6 +72,9 @@ StatusOr>> PyExecutable::Execute( py::gil_scoped_release gil_release; ExecuteOptions options; options.untuple_result = true; + if (fingerprint_) { + options.launch_id = tensorflow::Fingerprint32(*fingerprint_); + } std::vector arg_buffers(args.size()); absl::c_transform(args, arg_buffers.begin(), [](PyBuffer* buf) { return buf->buffer(); }); @@ -89,6 +99,9 @@ PyExecutable::ExecuteOnLocalDevices( py::gil_scoped_release gil_release; ExecuteOptions options; options.untuple_result = true; + if (fingerprint_) { + options.launch_id = tensorflow::Fingerprint32(*fingerprint_); + } std::vector> arg_buffers(args.size()); for (int computation = 0; computation < args.size(); ++computation) { arg_buffers[computation].resize(args[computation].size()); diff --git a/tensorflow/compiler/xla/python/py_executable.h b/tensorflow/compiler/xla/python/py_executable.h index 7f35f97f6e9..1051d065335 100644 --- a/tensorflow/compiler/xla/python/py_executable.h +++ b/tensorflow/compiler/xla/python/py_executable.h @@ -37,7 +37,8 @@ class PyExecutable { public: PyExecutable(std::shared_ptr client, std::unique_ptr executable, - std::shared_ptr traceback); + std::shared_ptr traceback, + absl::optional fingerprint); ~PyExecutable(); std::shared_ptr client() const { return client_; } @@ -71,6 +72,11 @@ class PyExecutable { std::unique_ptr executable_; std::shared_ptr traceback_; + // Identical executables (i.e. representing the same program) will have the + // same fingerprint. nullopt on platforms or executables where fingerprints + // aren't implemented. + absl::optional fingerprint_; + // Doubly-linked list of all executables known to the client. Protected by the // GIL. PyExecutable* next_; diff --git a/tensorflow/core/platform/fingerprint.h b/tensorflow/core/platform/fingerprint.h index b1260615580..cebb0679f0d 100644 --- a/tensorflow/core/platform/fingerprint.h +++ b/tensorflow/core/platform/fingerprint.h @@ -90,6 +90,15 @@ inline uint64 Fingerprint64(const StringPiece s) { #endif } +// 32-bit variant of Fingerprint64 above (same properties and caveats apply). +inline uint32 Fingerprint32(const StringPiece s) { +#ifdef USE_OSS_FARMHASH + return ::util::Fingerprint32(s.data(), s.size()); +#else + return farmhash::Fingerprint32(s.data(), s.size()); +#endif +} + // 128-bit variant of Fingerprint64 above (same properties and caveats apply). inline Fprint128 Fingerprint128(const StringPiece s) { #ifdef USE_OSS_FARMHASH