Reuse GPUs for multiple ranks in NcclManager test.

Since NCCL 2.1.2, NCCL uses cooperative group launch by default, which
results in deadlocks in the NcclManager test if the number of ranks exceeds the
number of GPUs.  Setting NCCL_LAUNCH_MODE to PARALLEL, which was the default
before NCCL 2.1.2, fixes this.

PiperOrigin-RevId: 224531851
Author: Ayush Dubey (2018-12-07 09:57:11 -08:00), committed by TensorFlower Gardener
Parent: b56d14fbcb
Commit: 8f2bc575d9
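
For context, here is a minimal sketch (not the TensorFlow test itself) of the
pattern the diff below relies on: set NCCL_LAUNCH_MODE in the environment
before the first NCCL call in the process, so NCCL falls back to its pre-2.1.2
parallel launch behavior. The fixture name NcclEnvTest and the trivial
assertion are illustrative assumptions, not code from this commit.

    #include <cstdlib>

    #include "gtest/gtest.h"

    class NcclEnvTest : public ::testing::Test {
     protected:
      static void SetUpTestCase() {
        // Must run before any NCCL communicator is created; NCCL reads
        // NCCL_LAUNCH_MODE during initialization.
        setenv("NCCL_LAUNCH_MODE", "PARALLEL", 1 /* replace existing value */);
      }
    };

    // Sanity check that the environment variable is visible to the process.
    TEST_F(NcclEnvTest, LaunchModeIsParallel) {
      const char* mode = getenv("NCCL_LAUNCH_MODE");
      ASSERT_NE(mode, nullptr);
      EXPECT_STREQ("PARALLEL", mode);
    }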

@@ -65,6 +65,7 @@ class NcclManagerTest : public ::testing::Test {
   static void SetUpTestCase() {
     setenv("NCCL_DEBUG", "WARN", 1 /* replace */);
+    setenv("NCCL_LAUNCH_MODE", "PARALLEL", 1 /* replace */);
     devices_ = new std::vector<std::unique_ptr<BaseGPUDevice>>(GetGPUDevices());
     LOG(ERROR) << "Running test with " << devices_->size() << " gpus";
   }
@@ -200,7 +201,7 @@ TYPED_TEST_CASE(NcclManagerTest, TypeList);
 // Test basic sum reduction.
 TYPED_TEST(NcclManagerTest, BasicSumReduction) {
-  const int num_ranks = this->NumGPUs();
+  const int num_ranks = 4;
   for (int op = 0; op < 4; ++op) {
     ncclRedOp_t reduction_op = static_cast<ncclRedOp_t>(op);
@@ -208,6 +209,7 @@ TYPED_TEST(NcclManagerTest, BasicSumReduction) {
         this->MakeTestCase(num_ranks, reduction_op, TensorShape({2, 3}), 0.0f));
     for (int rank = 0; rank < num_ranks; ++rank) {
       auto* device = this->GetDevice(rank);
+      VLOG(2) << "rank " << rank << " device " << device->name();
       auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
       auto* stream = device->tensorflow_gpu_device_info()->stream;
       NcclManager::instance()->AddToAllReduce(
@@ -224,14 +226,12 @@ TYPED_TEST(NcclManagerTest, BasicSumReduction) {
 // Same as the Basic test, but with multiple threads launching parts of many
 // reductions.
 //
-// Testing the multi-rank execution is currently reduced as it can hang when run
-// with num_ranks > devices->size(), for some GPUs (e.g. K20m).
-// To test the higher settings, increase num_ranks,
-// num_collectives_per_iteration and time_limit_micros.
+// To run test longer, increase num_ranks, num_collectives_per_iteration and
+// time_limit_micros.
 TYPED_TEST(NcclManagerTest, MultipleCallers) {
-  const int num_ranks = this->NumGPUs();
+  const int num_ranks = 4;
   const int num_collectives_per_iteration = 10;  // 1000;
-  const int num_threads = 3;
+  const int num_threads = num_ranks * 2;
   const int time_limit_micros = 100;  // 60 * 30 * 1000 * 1000;
   int64 start = Env::Default()->NowMicros();
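
The constants in the hunk above suggest the shape of the stress loop: several
worker threads keep launching batches of collectives until a wall-clock budget
expires. Below is a minimal sketch of that pattern only; the LaunchCollective()
helper is a hypothetical stand-in for the real NcclManager calls, and none of
this code comes from the commit.

    #include <atomic>
    #include <chrono>
    #include <cstdint>
    #include <thread>
    #include <vector>

    // Hypothetical stand-in for queuing one all-reduce on one rank.
    void LaunchCollective(int rank) { (void)rank; }

    void RunStressLoop() {
      const int num_ranks = 4;
      const int num_threads = num_ranks * 2;
      const int num_collectives_per_iteration = 10;
      const std::int64_t time_limit_micros = 100;

      const auto start = std::chrono::steady_clock::now();
      std::atomic<int> launched{0};
      std::vector<std::thread> workers;
      for (int t = 0; t < num_threads; ++t) {
        workers.emplace_back([&] {
          auto elapsed_micros = [&] {
            return std::chrono::duration_cast<std::chrono::microseconds>(
                       std::chrono::steady_clock::now() - start)
                .count();
          };
          // Keep launching batches of collectives until the time budget expires.
          while (elapsed_micros() < time_limit_micros) {
            for (int c = 0; c < num_collectives_per_iteration; ++c) {
              LaunchCollective(c % num_ranks);
              launched.fetch_add(1, std::memory_order_relaxed);
            }
          }
        });
      }
      for (auto& w : workers) w.join();
    }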