improve concurrency between compute and nccl streams

NcclManager now records an Event on the compute stream as each Participant
is added, and the NCCL stream waits on that Event rather than synchronizing
with the entire compute stream only after all Participants have been added.
Otherwise, most compute kernels end up enqueued on the compute stream ahead
of the NCCL sync point, delaying the start of the collective.
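
As a minimal sketch of the event-based handoff, built only from the
se::Event/se::Stream calls visible in the diff below (compute_stream,
nccl_stream, and executor are hypothetical stand-ins for the Participant's
tensor_stream, the NCCL communication stream, and its StreamExecutor):

    // Record a sync point on the compute stream once the kernel producing
    // the input tensor has been enqueued (done in the Participant
    // constructor in this change).
    auto input_event = absl::make_unique<se::Event>(executor);
    input_event->Init();
    compute_stream->ThenRecordEvent(input_event.get());

    // Compute kernels enqueued on compute_stream after the record no
    // longer gate the collective.

    // The NCCL stream waits only on the recorded event, not on the whole
    // compute stream.
    nccl_stream->ThenWaitFor(input_event.get());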
Jeff Daily 2019-08-09 15:40:32 +00:00
parent 6c526e012c
commit 7dbb5dd1c4
3 changed files with 13 additions and 1 deletion

tensorflow/core/nccl/BUILD

@@ -29,6 +29,7 @@ cc_library(
     copts = tf_copts(),
     deps = if_cuda([
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/memory",
         "@local_config_nccl//:nccl",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",

tensorflow/core/nccl/nccl_manager.cc

@@ -539,7 +539,7 @@ void NcclManager::RunCollective(Collective* collective) {
       // Wait to ensure that the kernel that produces the data in the input
       // tensor has finished running before the nccl kernel runs on the
       // communication stream.
-      nccl_stream->stream->ThenWaitFor(p->tensor_stream);
+      nccl_stream->stream->ThenWaitFor(p->input_event.get());
     }
     if (p->root) {
       if (collective->root_rank == -1) {

tensorflow/core/nccl/nccl_manager.h

@@ -27,6 +27,7 @@ limitations under the License.
 #endif
 #include "absl/container/flat_hash_map.h"
+#include "absl/memory/memory.h"
 #include "third_party/nccl/nccl.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -63,6 +64,7 @@ class NcclManager {
           event_mgr(event_mgr),
           gpu_device_id(gpu_device_id),
           input(input),
+          input_event(nullptr),
           output(output),
           global_rank(global_rank),
           done_callback(std::move(done_callback)),
@@ -70,6 +72,11 @@ class NcclManager {
       DCHECK(executor != nullptr);
       DCHECK(event_mgr != nullptr);
       DCHECK(tensor_stream != nullptr);
+      if (input != nullptr) {
+        input_event = absl::make_unique<se::Event>(executor);
+        input_event->Init();
+        tensor_stream->ThenRecordEvent(input_event.get());
+      }
     }
     // StreamExecutor for the device. Expected to be live for process lifetime.
@@ -94,6 +101,10 @@ class NcclManager {
     // called. Is NULL for participants that only receive data.
     const Tensor* input;
+    // Wait on this event rather than synchronizing on the entire stream.
+    // This allows greater concurrency between compute and nccl streams.
+    std::unique_ptr<se::Event> input_event;
+
     // Owned by the caller, who must keep it live until `done_callback` is
     // called. Is NULL for participants that only send data.
     Tensor* output;
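
The net effect: recording the event in the Participant constructor pins the
synchronization point at the moment the input tensor is produced, so when
RunCollective later waits on the event instead of the whole tensor_stream,
compute kernels enqueued after that point can overlap with the collective
rather than delay it.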