From 5882ae35d304f813313a4456c087237c29b63b64 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 19 Sep 2017 04:07:01 -0700
Subject: [PATCH] Update to NCCL version 1.3.5. Remove temporary buffer for
 ncclReduce, it's no longer needed in this version.

PiperOrigin-RevId: 169221983
---
 tensorflow/contrib/nccl/kernels/nccl_manager.cc | 10 ++++++----
 tensorflow/contrib/nccl/kernels/nccl_manager.h  |  3 +--
 tensorflow/contrib/nccl/kernels/nccl_ops.cc     | 11 ++---------
 tensorflow/workspace.bzl                        |  8 ++++----
 4 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
index 4b642f64c12..1eb1481675e 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
@@ -312,11 +312,11 @@ void NcclManager::AddReduceSend(int num_devices, const string& key,
                                 perftools::gputools::StreamExecutor* executor,
                                 int gpu_device_id, EventMgr* event_mgr,
                                 perftools::gputools::Stream* tensor_stream,
-                                const Tensor* in_t, Tensor* temp_t,
+                                const Tensor* in_t,
                                 DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
-      new Participant(in_t, temp_t, event_mgr, tensor_stream, executor,
-                      gpu_device_id, std::move(done_callback)));
+      new Participant(in_t, nullptr /* out_t */, event_mgr, tensor_stream,
+                      executor, gpu_device_id, std::move(done_callback)));
   AddParticipant(num_devices, key, std::move(participant), in_t->dtype(),
                  kReduce, reduction_op);
 }
@@ -462,7 +462,9 @@ void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) {
       }
       case kReduce: {
         const void* sendbuff = p->in_t->tensor_data().data();
-        void* recvbuff = const_cast<char*>(p->out_t->tensor_data().data());
+        void* recvbuff = p->out_t
+                             ? const_cast<char*>(p->out_t->tensor_data().data())
+                             : nullptr;
         nccl_result = ncclReduce(sendbuff, recvbuff, p->in_t->NumElements(),
                                  data_type, collective->reduction_op,
                                  collective->root_rank, nccl_comm, *cu_stream);
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/contrib/nccl/kernels/nccl_manager.h
index 619a1b69bfd..cb1719c3be6 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.h
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.h
@@ -82,8 +82,7 @@ class NcclManager {
                      perftools::gputools::StreamExecutor* executor,
                      int gpu_device_id, EventMgr* event_mgr,
                      perftools::gputools::Stream* tensor_stream,
-                     const Tensor* in_t, Tensor* temp_t,
-                     DoneCallback done_callback);
+                     const Tensor* in_t, DoneCallback done_callback);
   void AddReduceRecv(int num_devices, const string& key,
                      ncclRedOp_t reduction_op,
                      perftools::gputools::StreamExecutor* executor,
diff --git a/tensorflow/contrib/nccl/kernels/nccl_ops.cc b/tensorflow/contrib/nccl/kernels/nccl_ops.cc
index 81cc74416bc..4eb52492dbc 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_ops.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_ops.cc
@@ -121,14 +121,7 @@ class NcclReduceSendKernel : public NcclReduceOpBase {
       : NcclReduceOpBase(c) {}
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
-    const Tensor& in_t = c->input(0);
-    std::unique_ptr<Tensor> temp_ptr(new Tensor());
-    OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(in_t.dtype(), in_t.shape(), temp_ptr.get()), done);
-    Tensor* temp_t = temp_ptr.release();
-
-    auto actual_done = [c, done, temp_t](Status s) {
-      delete temp_t;
+    auto actual_done = [c, done](Status s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
       done();
     };
@@ -138,7 +131,7 @@ class NcclReduceSendKernel : public NcclReduceOpBase {
     NcclManager::instance()->AddReduceSend(
         num_devices(), GetCollectiveKey(c), reduction_op(),
         compute_stream->parent(), gpu_info->gpu_id, gpu_info->event_mgr,
-        compute_stream, &in_t, temp_t, std::move(actual_done));
+        compute_stream, &c->input(0), std::move(actual_done));
   }
 };
 REGISTER_KERNEL_BUILDER(Name("NcclReduceSend").Device(DEVICE_GPU),
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index b062ba0e81a..06d92376f23 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -629,11 +629,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   temp_workaround_http_archive(
       name = "nccl_archive",
       urls = [
-          "http://mirror.bazel.build/github.com/nvidia/nccl/archive/29a1a916dc14bb2c00feed3d4820d51fa85be1e6.tar.gz",
-          "https://github.com/nvidia/nccl/archive/29a1a916dc14bb2c00feed3d4820d51fa85be1e6.tar.gz",
+          "http://mirror.bazel.build/github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
+          "https://github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
       ],
-      sha256 = "6387030e37d14762f87eefbc86ee527293ec04745c66ccd820cf7fc0fdc23f92",
-      strip_prefix = "nccl-29a1a916dc14bb2c00feed3d4820d51fa85be1e6",
+      sha256 = "2ca86fb6179ecbff789cc67c836139c1bbc0324ed8c04643405a30bf26325176",
+      strip_prefix = "nccl-03d856977ecbaac87e598c0c4bafca96761b9ac7",
       build_file = str(Label("//third_party:nccl.BUILD")),
       repository = tf_repo_name,
   )
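
For context on why the temporary buffer can be dropped: ncclReduce only
produces output on the root rank, and as of this NCCL version a non-root
caller may pass nullptr as recvbuff instead of a scratch buffer, which is
exactly what AddReduceSend now does via nullptr /* out_t */. A minimal
standalone sketch of that call pattern (illustrative only, not TensorFlow
code; assumes two visible GPUs and NCCL 1.3.5+ headers, with most error
handling and cleanup trimmed):

    // reduce_null_recv.cc -- sketch of ncclReduce with a null recvbuff on
    // non-root ranks, single process driving one communicator per GPU.
    #include <cuda_runtime.h>
    #include <nccl.h>

    #include <cstdio>
    #include <vector>

    int main() {
      const int kDevs = 2;         // assumes at least two visible GPUs
      const int kCount = 1 << 20;  // elements reduced per rank
      const int kRoot = 0;
      int devs[kDevs] = {0, 1};

      std::vector<ncclComm_t> comms(kDevs);
      if (ncclCommInitAll(comms.data(), kDevs, devs) != ncclSuccess) return 1;

      std::vector<float*> send(kDevs);
      std::vector<cudaStream_t> streams(kDevs);
      float* recv = nullptr;  // real output buffer exists on the root only
      for (int i = 0; i < kDevs; ++i) {
        cudaSetDevice(i);
        cudaMalloc(&send[i], kCount * sizeof(float));
        cudaMemset(send[i], 0, kCount * sizeof(float));
        if (i == kRoot) cudaMalloc(&recv, kCount * sizeof(float));
        cudaStreamCreate(&streams[i]);
      }

      // One ncclReduce per rank; non-root ranks pass nullptr for recvbuff,
      // mirroring what the send participants do after this patch.
      for (int i = 0; i < kDevs; ++i) {
        cudaSetDevice(i);
        ncclResult_t r = ncclReduce(send[i], i == kRoot ? recv : nullptr,
                                    kCount, ncclFloat, ncclSum, kRoot,
                                    comms[i], streams[i]);
        if (r != ncclSuccess) {
          std::printf("ncclReduce: %s\n", ncclGetErrorString(r));
          return 1;
        }
      }

      for (int i = 0; i < kDevs; ++i) {
        cudaSetDevice(i);
        cudaStreamSynchronize(streams[i]);
        ncclCommDestroy(comms[i]);
      }
      return 0;
    }

This mirrors the split above: each AddReduceSend participant now carries a
null out_t, and only the AddReduceRecv participant supplies the real output
tensor, so the per-sender allocate_temp in NcclReduceSendKernel goes away.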