improve concurrency between compute and nccl streams
The NcclManager now records an Event on the compute stream as each Participant is added, and the nccl stream waits on that Event instead of synchronizing with the entire compute stream after all Participants have been added. Without this, most compute kernels are enqueued on the compute stream ahead of the NCCL sync point, delaying the start of the collective.
parent 6c526e012c
commit 7dbb5dd1c4
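For context, here is the synchronization pattern this change adopts, as a self-contained, hedged sketch in raw CUDA (not TensorFlow code; all kernel and variable names are illustrative, and TensorFlow's StreamExecutor wraps these primitives). Recording an event immediately after the producer kernel gives the communication stream a precise sync point, so compute kernels enqueued afterwards no longer gate the collective.

    // Standalone illustration: the comm stream waits on an event recorded
    // right after the producer kernel, so later compute kernels do not
    // delay it.
    #include <cuda_runtime.h>

    __global__ void ProduceInput(float* in) { in[threadIdx.x] = threadIdx.x; }
    __global__ void UnrelatedCompute(float* other) { other[threadIdx.x] += 1.0f; }

    int main() {
      float *in, *other;
      cudaMalloc(&in, 256 * sizeof(float));
      cudaMalloc(&other, 256 * sizeof(float));

      cudaStream_t compute_stream, comm_stream;
      cudaStreamCreate(&compute_stream);
      cudaStreamCreate(&comm_stream);

      cudaEvent_t input_ready;
      cudaEventCreateWithFlags(&input_ready, cudaEventDisableTiming);

      ProduceInput<<<1, 256, 0, compute_stream>>>(in);
      // Record the sync point immediately after the producer, analogous to
      // ThenRecordEvent in the Participant constructor in this commit.
      cudaEventRecord(input_ready, compute_stream);

      // Work enqueued after the record does NOT gate the comm stream.
      UnrelatedCompute<<<1, 256, 0, compute_stream>>>(other);

      // Analogous to ThenWaitFor(p->input_event.get()): wait only for the
      // event, not for everything enqueued on compute_stream.
      cudaStreamWaitEvent(comm_stream, input_ready, 0);
      // ... the NCCL collective reading `in` would be enqueued on comm_stream ...

      cudaStreamSynchronize(compute_stream);
      cudaStreamSynchronize(comm_stream);
      cudaEventDestroy(input_ready);
      cudaStreamDestroy(compute_stream);
      cudaStreamDestroy(comm_stream);
      cudaFree(in);
      cudaFree(other);
      return 0;
    }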
@@ -29,6 +29,7 @@ cc_library(
     copts = tf_copts(),
     deps = if_cuda([
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/memory",
         "@local_config_nccl//:nccl",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -539,7 +539,7 @@ void NcclManager::RunCollective(Collective* collective) {
       // Wait to ensure that the kernel that produces the data in the input
       // tensor has finished running before the nccl kernel runs on the
       // communication stream.
-      nccl_stream->stream->ThenWaitFor(p->tensor_stream);
+      nccl_stream->stream->ThenWaitFor(p->input_event.get());
     }
     if (p->root) {
       if (collective->root_rank == -1) {
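The difference between the two ThenWaitFor overloads is the heart of the change: waiting on a Stream orders the nccl stream after everything already enqueued on the compute stream, while waiting on an Event orders it only after the point where the event was recorded. A hedged restatement of the hunk above, with the surrounding loop elided:

    // Before: trail ALL work enqueued on the compute stream up to this call,
    // including kernels unrelated to this participant's input tensor.
    nccl_stream->stream->ThenWaitFor(p->tensor_stream);

    // After: trail only the event recorded immediately after the input
    // tensor's producer (see the Participant constructor change below).
    nccl_stream->stream->ThenWaitFor(p->input_event.get());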
@@ -27,6 +27,7 @@ limitations under the License.
 #endif

 #include "absl/container/flat_hash_map.h"
+#include "absl/memory/memory.h"
 #include "third_party/nccl/nccl.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -63,6 +64,7 @@ class NcclManager {
           event_mgr(event_mgr),
           gpu_device_id(gpu_device_id),
           input(input),
+          input_event(nullptr),
           output(output),
           global_rank(global_rank),
           done_callback(std::move(done_callback)),
@@ -70,6 +72,11 @@ class NcclManager {
       DCHECK(executor != nullptr);
       DCHECK(event_mgr != nullptr);
       DCHECK(tensor_stream != nullptr);
+      if (input != nullptr) {
+        input_event = absl::make_unique<se::Event>(executor);
+        input_event->Init();
+        tensor_stream->ThenRecordEvent(input_event.get());
+      }
     }

     // StreamExecutor for the device. Expected to be live for process lifetime.
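The constructor change is what makes the early sync point possible: the event is created and recorded at the moment the Participant is added, while the input tensor's producer is still the most recent work on the tensor stream. A hypothetical mirror of that pattern in raw CUDA (ParticipantSketch, EventPtr, and all other names here are invented for illustration; TensorFlow uses se::Event and absl::make_unique instead):

    #include <cuda_runtime.h>

    #include <memory>
    #include <type_traits>

    struct EventDeleter {
      void operator()(cudaEvent_t e) const { cudaEventDestroy(e); }
    };
    // cudaEvent_t is a pointer to an opaque struct, so unique_ptr must be
    // parameterized on the pointee type.
    using EventPtr =
        std::unique_ptr<std::remove_pointer<cudaEvent_t>::type, EventDeleter>;

    struct ParticipantSketch {
      ParticipantSketch(const float* input, cudaStream_t tensor_stream)
          : input(input) {
        if (input != nullptr) {
          cudaEvent_t e;
          cudaEventCreateWithFlags(&e, cudaEventDisableTiming);
          cudaEventRecord(e, tensor_stream);  // sync point: input is ready
          input_event.reset(e);
        }
      }
      const float* input;    // null for receive-only participants
      EventPtr input_event;  // null when there is no input to wait for
    };

The unique_ptr ownership mirrors the member declared in the next hunk: the event lives exactly as long as its Participant.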
@@ -94,6 +101,10 @@ class NcclManager {
     // called. Is NULL for participants that only receive data.
     const Tensor* input;

+    // Wait on this event rather than synchronizing on the entire stream.
+    // This allows greater concurrency between compute and nccl streams.
+    std::unique_ptr<se::Event> input_event;
+
     // Owned by the caller, who must keep it live until `done_callback` is
     // called. Is NULL for participants that only send data.
     Tensor* output;