Add a collective broadcast implementation using NCCL.

The implementation adds `NcclBroadcaster`, a subclass of `NcclBase` analogous to `NcclReducer`. This change also refactors the collective NCCL tests into a shared `NcclTestBase` fixture so the reducer and broadcaster tests share setup and verification code.

PiperOrigin-RevId: 261021538
commit 9bdc9dbf52 (parent 4bdc9d0ee6)
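For orientation, the core of the change is the dispatch in the param resolver: a broadcast collective now resolves to the NCCL implementation when NCCL is enabled. A condensed sketch of the post-change selection logic (the broadcast branch is taken from the first hunk below; the `RingReduce` fallback name and the `default` case are assumptions for illustration, not part of this diff):

// Condensed sketch, not the literal file contents; see the first hunk below.
const char* GetCollectiveName(const CollectiveParams* cp, bool nccl) {
  switch (cp->instance.type) {
    case BROADCAST_COLLECTIVE:
      // New in this change: prefer the NCCL broadcast when NCCL is enabled.
      return nccl ? "NcclBroadcast" : "HierarchicalTreeBroadcast";
    case REDUCTION_COLLECTIVE:
      return nccl ? "NcclReduce" : "RingReduce";  // fallback name assumed
    default:
      return "undef";  // assumed default; remaining cases elided
  }
}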
tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -56,10 +56,14 @@ void CollectiveParamResolverLocal::CompleteGroupAsync(
 }
 
 namespace {
-string GetCollectiveName(const CollectiveParams* cp, bool nccl) {
+const char* GetCollectiveName(const CollectiveParams* cp, bool nccl) {
   switch (cp->instance.type) {
     case BROADCAST_COLLECTIVE:
-      return "HierarchicalTreeBroadcast";
+      if (nccl) {
+        return "NcclBroadcast";
+      } else {
+        return "HierarchicalTreeBroadcast";
+      }
 
     case REDUCTION_COLLECTIVE: {
       if (nccl) {
@@ -96,8 +100,8 @@ void CollectiveParamResolverLocal::CompleteGroupLocal(
 
   // Initialize group runtime details.
   CollectiveImplementationInterface* col_impl;
-  // TODO(b/128853131,b/132707282): Remove NCCL special case when we have
-  // NCCL implementations for all collectives.
+  // TODO(b/128853131): Remove NCCL special case when we have NCCL
+  // implementations for all collectives.
   status = CollectiveRegistry::LookupParamResolverInstance(
       nccl_ ? "NcclReduce" : GetCollectiveName(cp, /*nccl=*/false),
       &col_impl);
tensorflow/core/kernels/BUILD
@@ -200,6 +200,8 @@ tf_kernel_library(
     srcs = if_nccl([
         "collective_nccl.h",
         "collective_nccl.cc",
+        "collective_nccl_broadcaster.h",
+        "collective_nccl_broadcaster.cc",
         "collective_nccl_reducer.h",
         "collective_nccl_reducer.cc",
     ]),
@@ -216,9 +218,9 @@ tf_kernel_library(
 )
 
 tf_cuda_cc_test(
-    name = "collective_nccl_reducer_test",
+    name = "collective_nccl_test",
     size = "small",
-    srcs = ["collective_nccl_reducer_test.cc"],
+    srcs = ["collective_nccl_test.cc"],
     tags = tf_cuda_tests_tags() + ["no_cuda_on_cpu_tap"],
     deps = [
         "//tensorflow/core:all_kernels",
tensorflow/core/kernels/collective_nccl_broadcaster.cc (new file, 81 lines)
@@ -0,0 +1,81 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/kernels/collective_nccl_broadcaster.h"

#ifdef GOOGLE_CUDA

#include "tensorflow/core/common_runtime/collective_util.h"
#include "tensorflow/core/nccl/nccl_manager.h"
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/core/profiler/lib/traceme.h"

namespace tensorflow {

void NcclBroadcaster::Run(StatusCallback done) {
  auto* compute_stream = col_ctx_->op_ctx->op_device_context()->stream();
  auto* gpu_info = col_ctx_->op_ctx->device()->tensorflow_gpu_device_info();
  const int num_global_devices = col_params_->group.group_size;
  const int num_local_devices = col_params_->instance.num_devices_per_task.at(
      col_params_->instance.task_names[col_params_->default_rank]);
  string nccl_collective_key =
      NcclCollectiveKey(col_ctx_->exec_key, col_ctx_->step_id);
  auto participant = absl::make_unique<NcclManager::Participant>(
      compute_stream->parent(), compute_stream, gpu_info->event_mgr,
      gpu_info->gpu_id, col_ctx_->input, col_ctx_->output,
      col_params_->default_rank, std::move(done));
  VLOG(1)
      << "NcclBroadcast calling NcclManager::AddBroadcastSend/Recv num_tasks "
      << col_params_->group.num_tasks << " current task "
      << col_params_->instance.task_names[col_params_->default_rank]
      << " num local devices " << num_local_devices << " num global devices "
      << num_global_devices << " rank " << col_params_->default_rank
      << " device " << col_ctx_->device_name << " instance "
      << col_params_->instance.instance_key << " source "
      << col_params_->is_source;
  if (col_params_->is_source) {
    NcclManager::instance()->AddBroadcastSend(
        std::move(participant),
        {std::move(nccl_collective_key), num_local_devices, num_global_devices,
         col_params_->group.runtime_details.communicator_key});
  } else {
    NcclManager::instance()->AddBroadcastRecv(
        std::move(participant),
        {std::move(nccl_collective_key), num_local_devices, num_global_devices,
         col_params_->group.runtime_details.communicator_key});
  }
  {
    // `WaitForDependencies` may block if the collective instances on which this
    // op depends have not yet launched. When this function returns, this op is
    // ready to go.
    profiler::TraceMe activity("WaitForDependencies",
                               profiler::TraceMeLevel::kInfo);
    col_ctx_->col_exec->WaitForDependencies(*col_params_);
    NcclManager::instance()->SignalMultiNodeReady(nccl_collective_key);
  }
  {
    // When all devices at this worker have called `SignalMultiNodeReady`, the
    // `NcclManager` will enqueue the NCCL kernel on the NCCL stream. Thus the
    // implementation of `Launched` keeps track of the number of devices that
    // have launched.
    profiler::TraceMe activity("Schedule", profiler::TraceMeLevel::kInfo);
    col_ctx_->col_exec->Launched(*col_params_);
  }
}

REGISTER_COLLECTIVE(NcclBroadcast, NcclBroadcaster);

}  // namespace tensorflow

#endif  // GOOGLE_CUDA
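`REGISTER_COLLECTIVE(NcclBroadcast, NcclBroadcaster)` ties the class to the `"NcclBroadcast"` name that `GetCollectiveName` now returns, so the param resolver can find it by name. A minimal sketch of that resolution step, using the same `CollectiveRegistry` call that appears in the `CompleteGroupLocal` hunk above (surrounding setup and error handling elided):

// Sketch: resolving the registered collective name to an implementation.
CollectiveImplementationInterface* col_impl = nullptr;
Status status = CollectiveRegistry::LookupParamResolverInstance(
    "NcclBroadcast", &col_impl);
if (status.ok()) {
  // At execution time, the instance's Run(done) hands the broadcast off to
  // NcclManager, as in NcclBroadcaster::Run above.
}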
tensorflow/core/kernels/collective_nccl_broadcaster.h (new file, 35 lines)
@@ -0,0 +1,35 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_BROADCASTER_H_
#define TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_BROADCASTER_H_

#include "tensorflow/core/kernels/collective_nccl.h"

namespace tensorflow {
#ifdef GOOGLE_CUDA

class NcclBroadcaster : public NcclBase {
 public:
  NcclBroadcaster() : NcclBase(BROADCAST_COLLECTIVE, "NcclBroadcast") {}
  ~NcclBroadcaster() override = default;

  // Hands off broadcast to NcclManager.
  void Run(StatusCallback done) override;
};

#endif  // GOOGLE_CUDA
}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_BROADCASTER_H_
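The refactored test below exercises this class directly. Condensed from the `RunBroadcast` hunk further down (the `OpKernelContext` and `CollectiveParams` setup are elided; note that only the source rank supplies an input tensor):

// Condensed from RunBroadcast in the test diff below.
NcclBroadcaster broadcaster;
CollectiveContext col_ctx(col_exec, dev_mgr, /*OpKernelContext=*/&ctx,
                          &op_params, col_params, exec_key, kStepId,
                          /*input=*/col_params.is_source ? &tensor : nullptr,
                          /*output=*/&tensor);
TF_CHECK_OK(broadcaster.InitializeCollectiveContext(&col_ctx));
Notification note;
broadcaster.Run([&note](Status s) {
  TF_CHECK_OK(s);  // the test stores s and checks it with TF_ASSERT_OK
  note.Notify();
});
note.WaitForNotification();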
tensorflow/core/kernels/{collective_nccl_reducer_test.cc → collective_nccl_test.cc} (renamed)
@@ -15,7 +15,7 @@ limitations under the License.
 
 #ifdef GOOGLE_CUDA
 
-#include "tensorflow/core/kernels/collective_nccl_reducer.h"
+#include "tensorflow/core/kernels/collective_nccl.h"
 
 #include <algorithm>
 
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/test_collective_executor_mgr.h"
 #include "tensorflow/core/framework/collective.h"
@@ -32,6 +33,8 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/collective_nccl_broadcaster.h"
+#include "tensorflow/core/kernels/collective_nccl_reducer.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
@@ -70,9 +73,13 @@ std::unique_ptr<OpKernel> GetDiv(DeviceBase* device) {
   return GetKernel(node_def, device);
 }
 
-class NcclReducerTest : public ::testing::Test {
+class NcclTestBase : public ::testing::Test {
  protected:
-  ~NcclReducerTest() override {
+  class DeviceInstance;
+
+  NcclTestBase(CollectiveType collective_type, const string& collective_name)
+      : collective_type_(collective_type), collective_name_(collective_name) {}
+  ~NcclTestBase() override {
     if (col_exec_) col_exec_->Unref();
   }
 
@@ -92,7 +99,7 @@ class NcclReducerTest : public ::testing::Test {
     }
   }
 
-  void Init(int num_ranks) {
+  void Init(const int num_ranks, const int instance_key) {
     setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
     setenv("NCCL_LAUNCH_MODE", "PARALLEL", 1 /* replace */);
     InitGPUDevices();
@@ -115,15 +122,14 @@ class NcclReducerTest : public ::testing::Test {
 
     // Initialize collective params.
     col_params_.name = "test_nccl_collective_op";
-    const int group_key = 5;
+    const int group_key = num_ranks;
     col_params_.group.group_key = group_key;
     col_params_.group.device_type = DEVICE_GPU;
     col_params_.group.group_size = num_ranks;
-    const int instance_key = 23;
     col_params_.instance.instance_key = instance_key;
-    col_params_.instance.type = REDUCTION_COLLECTIVE;
+    col_params_.instance.type = collective_type_;
     col_params_.instance.data_type = DT_FLOAT;
-    col_params_.instance.impl_details.collective_name = "NcclReduce";
+    col_params_.instance.impl_details.collective_name = collective_name_;
     const string task_name = "/job:worker/replica:0/task:0";
     col_params_.instance.num_devices_per_task[task_name] = num_ranks;
     for (int rank = 0; rank < num_ranks; ++rank) {
@@ -137,14 +143,28 @@ class NcclReducerTest : public ::testing::Test {
     }
   }
 
-  void Reduce() {
+  // Initialize `input` tensor at rank `rank`.
+  virtual void InitInput(Tensor* input, const int rank) = 0;
+
+  // Initialize `expected` output at all `num_ranks` ranks.
+  virtual void InitExpected(std::vector<float>* expected,
+                            const int tensor_length, const int num_ranks) = 0;
+
+  // Initialize device `di` specific to the collective op.
+  virtual void InitDevice(DeviceInstance* di) = 0;
+
+  // Run collective op on device `di`.
+  virtual void RunCollectiveOnDevice(DeviceInstance* di) = 0;
+
+  void RunCollective() {
     int done = 0;
     mutex done_mu;
     condition_variable done_cv;
     for (const auto& instance : instances_) {
       DeviceInstance* di = instance.get();
-      SchedClosure([di, &done, &done_mu, &done_cv] {
-        di->DoReduce();
+      InitDevice(di);
+      SchedClosure([this, di, &done, &done_mu, &done_cv] {
+        RunCollectiveOnDevice(di);
         mutex_lock l(done_mu);
         ++done;
         done_cv.notify_all();
@@ -155,35 +175,32 @@ class NcclReducerTest : public ::testing::Test {
     while (done < instances_.size()) done_cv.wait(l);
   }
 
-  void RunTest(int num_ranks, int tensor_length) {
-    Init(num_ranks);
+  void RunTest(int num_ranks, int tensor_length, int instance_key) {
+    Init(num_ranks, instance_key);
     std::vector<float> expected(tensor_length, 0.0);
+    InitExpected(&expected, tensor_length, num_ranks);
     for (int rank = 0; rank < num_ranks; ++rank) {
       DeviceInstance* instance = instances_[rank].get();
       instance->InitTensor(DT_FLOAT, TensorShape({tensor_length}),
-                           [&expected, rank](Tensor* t) {
-                             for (size_t i = 0; i < t->NumElements(); ++i) {
-                               float value = pow(10, rank) * i;
-                               t->flat<float>()(i) = value;
-                               expected[i] += value;
-                             }
-                           });
+                           [this, rank](Tensor* t) { InitInput(t, rank); });
     }
-    Reduce();
+    RunCollective();
     // Confirm that every rank computed the same correct value.
-    for (int i = 0; i < tensor_length; ++i) {
-      expected[i] /= num_ranks;
-    }
     for (int rank = 0; rank < instances_.size(); ++rank) {
       TF_ASSERT_OK(instances_[rank]->status_);
       Tensor* dev_tensor = &instances_[rank]->tensor_;
+      VLOG(2) << "rank " << rank << " output " << dev_tensor << " buf "
+              << DMAHelper::base(dev_tensor);
       Tensor actual(DT_FLOAT, TensorShape({tensor_length}));
       Notification note;
       Device* dev = instances_[rank]->device_;
       auto* dev_info = dev->tensorflow_gpu_device_info();
       dev_info->default_context->CopyDeviceTensorToCPU(
           dev_tensor, /*tensor_name=*/"", dev, &actual,
-          [&note](const Status&) { note.Notify(); });
+          [&note](const Status& s) {
+            TF_CHECK_OK(s);
+            note.Notify();
+          });
       note.WaitForNotification();
       for (int i = 0; i < tensor_length; ++i) {
         EXPECT_FLOAT_EQ(expected[i], actual.template flat<float>()(i))
@@ -192,14 +209,12 @@ class NcclReducerTest : public ::testing::Test {
     }
   }
 
-  std::unique_ptr<OpKernel> GetCollectiveReduce(const CollectiveParams& params,
-                                                Tensor* input,
-                                                DeviceBase* device) {
+  std::unique_ptr<OpKernel> GetCollectiveReduceOpKernel(
+      const CollectiveParams& params, Tensor* input, DeviceBase* device) {
     mutex_lock l(mu_);
     NodeDef node_def;
-    NodeDefBuilder builder(
-        strings::StrCat("collective_reduce_", reduce_counter_++),
-        "CollectiveReduce");
+    NodeDefBuilder builder(strings::StrCat("collective_reduce_", op_counter_++),
+                           "CollectiveReduce");
     TF_CHECK_OK(
         builder.Attr("T", params.instance.data_type)
             .Attr("merge_op", "Add")
@@ -215,7 +230,7 @@ class NcclReducerTest : public ::testing::Test {
 
   class DeviceInstance {
    public:
-    DeviceInstance(int rank, const string& device_name, NcclReducerTest* parent)
+    DeviceInstance(int rank, const string& device_name, NcclTestBase* parent)
         : parent_(parent), device_name_(device_name), rank_(rank) {
       TF_CHECK_OK(parent_->dev_mgr_->LookupDevice(device_name_, &device_))
           << "Could not find device " << device_name_ << " existing devices "
@@ -238,26 +253,16 @@ class NcclReducerTest : public ::testing::Test {
       auto* dev_info = device_->tensorflow_gpu_device_info();
       Notification note;
       dev_info->default_context->CopyCPUTensorToDevice(
-          &cpu_tensor, device_, &tensor_,
-          [&note](const Status&) { note.Notify(); });
+          &cpu_tensor, device_, &tensor_, [&note](const Status& s) {
+            TF_CHECK_OK(s);
+            note.Notify();
+          });
       note.WaitForNotification();
     }
 
-    void DoReduce() {
-      col_params_.merge_op = GetAdd(device_);
-      col_params_.final_op = GetDiv(device_);
-
-      // Prepare an OpKernelContext.
-      OpKernelContext::Params op_params;
-      op_params.step_id = kStepId;
-      op_params.device = device_;
-      gtl::InlinedVector<TensorValue, 4> inputs;
-      inputs.push_back(TensorValue(&tensor_));
-      op_params.inputs = &inputs;
-      gtl::InlinedVector<AllocatorAttributes, 4> input_aa(
-          {AllocatorAttributes()});
-      op_params.input_alloc_attrs = &input_aa;
-      gtl::InlinedVector<DeviceContext*, 4> input_dc;
+    void PrepareDeviceContext(OpKernelContext::Params* params) {
+      params->step_id = kStepId;
+      params->device = device_;
       DeviceContext* dev_ctx = nullptr;
       auto* dev_info = device_->tensorflow_gpu_device_info();
       if (dev_info) {
@@ -266,18 +271,32 @@ class NcclReducerTest : public ::testing::Test {
       } else {
         dev_ctx = new DeviceContext;
       }
-      input_dc.push_back(dev_ctx);
+      params->op_device_context = dev_ctx;
+    }
+
+    void RunReduce() {
+      // Prepare an OpKernelContext.
+      OpKernelContext::Params op_params;
+      PrepareDeviceContext(&op_params);
+
+      // Prepare inputs and outputs to OpKernel.
+      gtl::InlinedVector<TensorValue, 4> inputs;
+      inputs.push_back(TensorValue(&tensor_));
+      op_params.inputs = &inputs;
+      gtl::InlinedVector<AllocatorAttributes, 4> input_aa(
+          {AllocatorAttributes()});
+      op_params.input_alloc_attrs = &input_aa;
+      gtl::InlinedVector<DeviceContext*, 4> input_dc;
+      input_dc.push_back(op_params.op_device_context);
       op_params.input_device_contexts = &input_dc;
-      op_params.op_device_context = dev_ctx;
       int forward_from = 0;
       op_params.forward_from_array = &forward_from;
       AllocatorAttributes generic_alloc_attr;
       op_params.output_attr_array = &generic_alloc_attr;
       std::unique_ptr<OpKernel> op =
-          parent_->GetCollectiveReduce(col_params_, &tensor_, device_);
+          parent_->GetCollectiveReduceOpKernel(col_params_, &tensor_, device_);
       op_params.op_kernel = op.get();
       OpKernelContext ctx(&op_params, 1);
 
       // We never actually execute the kernel, so we need to do the output
       // allocation it would do, ourselves.
       Tensor* output_tensor_ptr = nullptr;
@@ -285,25 +304,57 @@ class NcclReducerTest : public ::testing::Test {
                                      &output_tensor_ptr));
       CHECK_EQ(output_tensor_ptr, ctx.mutable_output(0));
 
-      // Prepare a NcclReducer instance.
+      // Run the all-reduce.
       string exec_key =
           strings::StrCat(col_params_.instance.instance_key, ":0:0");
       NcclReducer reducer;
       CollectiveContext col_ctx(parent_->col_exec_, parent_->dev_mgr_.get(),
-                                &ctx, &op_params, col_params_, exec_key,
-                                kStepId, &tensor_, &tensor_);
+                                /*OpKernelContext=*/&ctx, &op_params,
+                                col_params_, exec_key, kStepId,
+                                /*input=*/&tensor_, /*output=*/&tensor_);
       TF_CHECK_OK(reducer.InitializeCollectiveContext(&col_ctx));
-
-      // Run the all-reduce.
-      reducer.Run([this](Status s) { status_ = s; });
+      Notification note;
+      reducer.Run([this, &note](Status s) {
+        status_ = s;
+        note.Notify();
+      });
+      note.WaitForNotification();
       if (status_.ok()) {
         CHECK(tensor_.CopyFrom(*ctx.mutable_output(0), tensor_.shape()));
       }
 
-      dev_ctx->Unref();
+      op_params.op_device_context->Unref();
     }
 
-    NcclReducerTest* parent_;
+    void RunBroadcast() {
+      VLOG(2) << "RunBroadcast name " << parent_->collective_name_ << " rank "
+              << col_params_.default_rank;
+      // Prepare an OpKernelContext.
+      OpKernelContext::Params op_params;
+      PrepareDeviceContext(&op_params);
+      OpKernelContext ctx(&op_params, 1);
+
+      // Run broadcast.
+      string exec_key =
+          strings::StrCat(col_params_.instance.instance_key, ":0:0");
+      NcclBroadcaster broadcaster;
+      CollectiveContext col_ctx(
+          parent_->col_exec_, parent_->dev_mgr_.get(),
+          /*OpKernelContext=*/&ctx, &op_params, col_params_, exec_key, kStepId,
+          /*input=*/col_params_.is_source ? &tensor_ : nullptr,
+          /*output=*/&tensor_);
+      TF_CHECK_OK(broadcaster.InitializeCollectiveContext(&col_ctx));
+      Notification note;
+      broadcaster.Run([this, &note](Status s) {
+        status_ = s;
+        note.Notify();
+      });
+      note.WaitForNotification();
+
+      op_params.op_device_context->Unref();
+    }
+
+    NcclTestBase* parent_;
     string device_name_;
     int rank_;
     Tensor tensor_;
@@ -312,6 +363,8 @@ class NcclReducerTest : public ::testing::Test {
     Status status_;
   };
 
+  CollectiveType collective_type_;
+  const string collective_name_;
   std::vector<std::unique_ptr<tensorflow::Device>> gpus_;
   TestCollectiveExecutorMgr col_exec_mgr_;
   CollectiveExecutor* col_exec_;
@@ -319,14 +372,110 @@ class NcclReducerTest : public ::testing::Test {
   std::vector<std::unique_ptr<DeviceInstance>> instances_;
   CollectiveParams col_params_;
   mutex mu_;
-  int32 reduce_counter_ GUARDED_BY(mu_) = 0;
+  int32 op_counter_ GUARDED_BY(mu_) = 0;
 };
 
-TEST_F(NcclReducerTest, Test2Dev16Len) { RunTest(2, 16); }
-TEST_F(NcclReducerTest, Test4Dev16Len) { RunTest(4, 16); }
-TEST_F(NcclReducerTest, Test8Dev16Len) { RunTest(8, 16); }
-TEST_F(NcclReducerTest, Test8Dev128Len) { RunTest(8, 128); }
-TEST_F(NcclReducerTest, Test8Dev1045991Len) { RunTest(8, 1048576); }
+class NcclReducerTest : public NcclTestBase {
+ protected:
+  NcclReducerTest()
+      : NcclTestBase(/*collective_type=*/REDUCTION_COLLECTIVE,
+                     /*collective_name=*/"NcclReduce") {}
+  ~NcclReducerTest() override = default;
+
+  void InitInput(Tensor* input, const int rank) override {
+    for (size_t i = 0; i < input->NumElements(); ++i) {
+      float value = pow(10, rank) * i;
+      input->flat<float>()(i) = value;
+    }
+  }
+
+  void InitExpected(std::vector<float>* expected, const int tensor_length,
+                    const int num_ranks) override {
+    expected->resize(tensor_length);
+    for (int i = 0; i < tensor_length; ++i) {
+      float expected_sum = 0.0;
+      for (int rank = 0; rank < num_ranks; ++rank) {
+        float value = pow(10, rank) * i;
+        expected_sum += value;
+      }
+      (*expected)[i] = expected_sum / num_ranks;
+    }
+  }
+
+  void InitDevice(DeviceInstance* di) override {
+    di->col_params_.merge_op = GetAdd(di->device_);
+    di->col_params_.final_op = GetDiv(di->device_);
+  }
+
+  void RunCollectiveOnDevice(DeviceInstance* di) override { di->RunReduce(); }
+};
+
+class NcclBroadcasterTest : public NcclTestBase {
+ protected:
+  NcclBroadcasterTest()
+      : NcclTestBase(/*collective_type=*/BROADCAST_COLLECTIVE,
+                     /*collective_name=*/"NcclBroadcast") {}
+  ~NcclBroadcasterTest() override = default;
+
+  void InitInput(Tensor* input, const int rank) override {
+    bool source = rank == source_rank_;
+    for (size_t i = 0; i < input->NumElements(); ++i) {
+      input->flat<float>()(i) = source ? static_cast<float>(i) : -1.0;
+    }
+  }
+
+  void InitExpected(std::vector<float>* expected, const int tensor_length,
+                    const int num_ranks) override {
+    for (int i = 0; i < tensor_length; ++i) {
+      (*expected)[i] = i;
+    }
+  }
+
+  void InitDevice(DeviceInstance* di) override {
+    di->col_params_.source_rank = source_rank_;
+    di->col_params_.is_source = di->col_params_.default_rank == source_rank_;
+  }
+
+  void RunCollectiveOnDevice(DeviceInstance* di) override {
+    di->RunBroadcast();
+  }
+
+  int source_rank_ = 0;
+};
+
+TEST_F(NcclReducerTest, Test2Dev16Len) {
+  RunTest(/*num_ranks=*/2, /*tensor_length=*/16, /*instance_key=*/23);
+}
+TEST_F(NcclReducerTest, Test4Dev16Len) {
+  RunTest(/*num_ranks=*/4, /*tensor_length=*/16, /*instance_key=*/23);
+}
+TEST_F(NcclReducerTest, Test8Dev16Len) {
+  RunTest(/*num_ranks=*/8, /*tensor_length=*/16, /*instance_key=*/23);
+}
+TEST_F(NcclReducerTest, Test8Dev128Len) {
+  RunTest(/*num_ranks=*/8, /*tensor_length=*/128, /*instance_key=*/23);
+}
+TEST_F(NcclReducerTest, Test8Dev1045991Len) {
+  RunTest(/*num_ranks=*/8, /*tensor_length=*/1048576, /*instance_key=*/23);
+}
+
+TEST_F(NcclBroadcasterTest, Test2Dev16LenSrc0) {
+  RunTest(/*num_ranks=*/2, /*tensor_length=*/16, /*instance_key=*/23);
+}
+TEST_F(NcclBroadcasterTest, Test4Dev16LenSrc1) {
+  source_rank_ = 1;
+  RunTest(/*num_ranks=*/4, /*tensor_length=*/16, /*instance_key=*/23);
+}
+TEST_F(NcclBroadcasterTest, Test8Dev16LenSrc7) {
+  source_rank_ = 7;
+  RunTest(/*num_ranks=*/8, /*tensor_length=*/16, /*instance_key=*/23);
+}
+TEST_F(NcclBroadcasterTest, Test8Dev128LenSrc0) {
+  RunTest(/*num_ranks=*/8, /*tensor_length=*/128, /*instance_key=*/24);
+}
+TEST_F(NcclBroadcasterTest, Test8Dev1045991LenSrc0) {
+  RunTest(/*num_ranks=*/8, /*tensor_length=*/1048576, /*instance_key=*/23);
+}
 
 }  // namespace tensorflow