improve concurrency between compute and nccl streams

NcclManager now records an Event on the compute stream as each Participant
is added, and the NCCL stream waits on that Event rather than synchronizing
with the entire compute stream only after all Participants have been added.
Otherwise, most compute kernels end up enqueued on the compute stream ahead
of the NCCL sync point, delaying the start of the collective.
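
As a minimal sketch of the event-based handoff, built only from the
se::Event/se::Stream calls visible in the diff below (compute_stream,
nccl_stream, and executor are hypothetical stand-ins for the Participant's
tensor_stream, the NCCL communication stream, and its StreamExecutor):

    // Record a sync point on the compute stream once the kernel producing
    // the input tensor has been enqueued (done in the Participant
    // constructor in this change).
    auto input_event = absl::make_unique<se::Event>(executor);
    input_event->Init();
    compute_stream->ThenRecordEvent(input_event.get());

    // Compute kernels enqueued on compute_stream after the record no
    // longer gate the collective.

    // The NCCL stream waits only on the recorded event, not on the whole
    // compute stream.
    nccl_stream->ThenWaitFor(input_event.get());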
Jeff Daily 2019-08-09 15:40:32 +00:00
parent 6c526e012c
commit 7dbb5dd1c4
3 changed files with 13 additions and 1 deletion

tensorflow/core/nccl/BUILD

@@ -29,6 +29,7 @@ cc_library(
     copts = tf_copts(),
     deps = if_cuda([
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/memory",
         "@local_config_nccl//:nccl",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",

tensorflow/core/nccl/nccl_manager.cc

@@ -539,7 +539,7 @@ void NcclManager::RunCollective(Collective* collective) {
       // Wait to ensure that the kernel that produces the data in the input
       // tensor has finished running before the nccl kernel runs on the
       // communication stream.
-      nccl_stream->stream->ThenWaitFor(p->tensor_stream);
+      nccl_stream->stream->ThenWaitFor(p->input_event.get());
     }
     if (p->root) {
       if (collective->root_rank == -1) {

tensorflow/core/nccl/nccl_manager.h

@@ -27,6 +27,7 @@ limitations under the License.
 #endif
 #include "absl/container/flat_hash_map.h"
+#include "absl/memory/memory.h"
 #include "third_party/nccl/nccl.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -63,6 +64,7 @@ class NcclManager {
           event_mgr(event_mgr),
           gpu_device_id(gpu_device_id),
           input(input),
+          input_event(nullptr),
           output(output),
           global_rank(global_rank),
           done_callback(std::move(done_callback)),
@@ -70,6 +72,11 @@ class NcclManager {
       DCHECK(executor != nullptr);
       DCHECK(event_mgr != nullptr);
       DCHECK(tensor_stream != nullptr);
+      if (input != nullptr) {
+        input_event = absl::make_unique<se::Event>(executor);
+        input_event->Init();
+        tensor_stream->ThenRecordEvent(input_event.get());
+      }
     }
     // StreamExecutor for the device. Expected to be live for process lifetime.
@@ -94,6 +101,10 @@ class NcclManager {
     // called. Is NULL for participants that only receive data.
     const Tensor* input;
+    // Wait on this event rather than synchronizing on the entire stream.
+    // This allows greater concurrency between compute and nccl streams.
+    std::unique_ptr<se::Event> input_event;
+
     // Owned by the caller, who must keep it live until `done_callback` is
     // called. Is NULL for participants that only send data.
     Tensor* output;
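
The net effect: recording the event in the Participant constructor pins the
synchronization point at the moment the input tensor is produced, so when
RunCollective later waits on the event instead of the whole tensor_stream,
compute kernels enqueued after that point can overlap with the collective
rather than delay it.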