Rolling forward CL 252574722 and 252855085 with a fix.

Update NCCL from 2.3.5 to 2.4.7.

PiperOrigin-RevId: 253953922
2019-06-19 01:50:16 -07:00 · 2019-06-19 01:50:16 -07:00 · 77659d9c93
commit 77659d9c93
parent 8ec9355084
5 changed files with 136 additions and 137 deletions
--- a/tensorflow/opensource_only.files
+++ b/tensorflow/opensource_only.files
@@ -192,6 +192,7 @@ tensorflow/third_party/nccl/LICENSE
 tensorflow/third_party/nccl/system.BUILD.tpl
 tensorflow/third_party/nccl/nccl_configure.bzl
 tensorflow/third_party/nccl/build_defs.bzl.tpl
+tensorflow/third_party/nccl/archive.patch
 tensorflow/third_party/nccl/BUILD
 tensorflow/third_party/boringssl/BUILD
 tensorflow/third_party/mpi/.gitignore
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -602,11 +602,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
    tf_http_archive(
        name = "nccl_archive",
        build_file = clean_dep("//third_party:nccl/archive.BUILD"),
-        sha256 = "19132b5127fa8e02d95a09795866923f04064c8f1e0770b2b42ab551408882a4",
-        strip_prefix = "nccl-f93fe9bfd94884cec2ba711897222e0df5569a53",
+        patch_file = clean_dep("//third_party/nccl:archive.patch"),
+        sha256 = "9a7633e224982e2b60fa6b397d895d20d6b7498e3e02f46f98a5a4e187c5a44c",
+        strip_prefix = "nccl-0ceaec9cee96ae7658aa45686853286651f36384",
        urls = [
-            "http://mirror.tensorflow.org/github.com/nvidia/nccl/archive/f93fe9bfd94884cec2ba711897222e0df5569a53.tar.gz",
-            "https://github.com/nvidia/nccl/archive/f93fe9bfd94884cec2ba711897222e0df5569a53.tar.gz",
+            "http://mirror.tensorflow.org/github.com/nvidia/nccl/archive/0ceaec9cee96ae7658aa45686853286651f36384.tar.gz",
+            "https://github.com/nvidia/nccl/archive/0ceaec9cee96ae7658aa45686853286651f36384.tar.gz",
        ],
    )

--- a/third_party/nccl/archive.BUILD
+++ b/third_party/nccl/archive.BUILD
@@ -9,99 +9,89 @@ load(
    "@local_config_nccl//:build_defs.bzl",
    "cuda_rdc_library",
    "gen_device_srcs",
-    "process_srcs",
 )
-load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")

 cc_library(
    name = "src_hdrs",
-    hdrs = process_srcs([
+    hdrs = [
+        "src/collectives.h",
        "src/collectives/collectives.h",
-        "src/nccl.h.in",
-    ]),
+        "src/nccl.h",
+    ],
+    strip_include_prefix = "src",
 )

 cc_library(
    name = "include_hdrs",
-    hdrs = process_srcs(glob(["src/include/*.h"])),
-    strip_include_prefix = "include",
+    hdrs = glob(["src/include/*.h"]),
+    strip_include_prefix = "src/include",
    deps = ["@local_config_cuda//cuda:cuda_headers"],
 )

-device_srcs = process_srcs([
-    "src/collectives/device/all_gather.cu",
-    "src/collectives/device/all_reduce.cu",
-    "src/collectives/device/broadcast.cu",
-    "src/collectives/device/reduce.cu",
-    "src/collectives/device/reduce_scatter.cu",
-])
+cc_library(
+    name = "device_hdrs",
+    hdrs = glob(["src/collectives/device/*.h"]),
+    strip_include_prefix = "src/collectives/device",
+)

-# NCCL compiles the same source files with different NCCL_OP defines. RDC
-# compilation requires that each compiled module has a unique ID. Clang derives
-# the module ID from the path only so we need to rename the files to get
+# NCCL compiles the same source files with different NCCL_OP/NCCL_TYPE defines.
+# RDC compilation requires that each compiled module has a unique ID. Clang
+# derives the module ID from the path only so we need to copy the files to get
 # different IDs for different parts of compilation. NVCC does not have that
 # problem because it generates IDs based on preprocessed content.
 gen_device_srcs(
-    name = "sum",
-    srcs = device_srcs,
-    NCCL_OP = 0,
-)
-
-gen_device_srcs(
-    name = "prod",
-    srcs = device_srcs,
-    NCCL_OP = 1,
-)
-
-gen_device_srcs(
-    name = "min",
-    srcs = device_srcs,
-    NCCL_OP = 2,
-)
-
-gen_device_srcs(
-    name = "max",
-    srcs = device_srcs,
-    NCCL_OP = 3,
+    name = "device_srcs",
+    srcs = [
+        "src/collectives/device/all_gather.cu.cc",
+        "src/collectives/device/all_reduce.cu.cc",
+        "src/collectives/device/broadcast.cu.cc",
+        "src/collectives/device/reduce.cu.cc",
+        "src/collectives/device/reduce_scatter.cu.cc",
+    ],
 )

 cuda_rdc_library(
    name = "device",
    srcs = [
-        ":max",
-        ":min",
-        ":prod",
-        ":sum",
-    ] + process_srcs(glob([
+        "src/collectives/device/functions.cu.cc",
+        ":device_srcs",
+    ] + glob([
+        # Required for header inclusion checking, see below for details.
        "src/collectives/device/*.h",
-        "src/collectives/device/functions.cu",
-    ])),
+        "src/nccl.h",
+    ]),
    deps = [
+        ":device_hdrs",
        ":include_hdrs",
        ":src_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
    ],
 )

 # Primary NCCL target.
 cc_library(
    name = "nccl",
-    srcs = process_srcs(glob(
-        include = ["src/**/*.cu"],
+    srcs = glob(
+        include = ["src/**/*.cc"],
        # Exclude device-library code.
        exclude = ["src/collectives/device/**"],
-    )) + [
+    ) + [
        # Required for header inclusion checking (see
        # http://docs.bazel.build/versions/master/be/c-cpp.html#hdrs).
-        "nccl.h",
-        "collectives/collectives.h",
+        # Files in src/ which #include "nccl.h" load it from there rather than
+        # from the virtual includes directory.
+        "src/collectives.h",
+        "src/collectives/collectives.h",
+        "src/nccl.h",
    ],
-    hdrs = ["nccl.h"],
-    copts = cuda_default_copts() + ["-Wno-vla"],
+    hdrs = ["src/nccl.h"],
    include_prefix = "third_party/nccl",
+    strip_include_prefix = "src",
    visibility = ["//visibility:public"],
    deps = [
        ":device",
        ":include_hdrs",
+        ":src_hdrs",
        "@local_config_cuda//cuda:cudart_static",
    ],
 )
--- a/third_party/nccl/archive.patch
+++ b/third_party/nccl/archive.patch
@@ -0,0 +1,69 @@
+diff --git a/src/collectives.h b/src/collectives.h
+new file mode 100644
+index 0000000..7d04b16
+--- /dev/null
++++ b/src/collectives.h
+@@ -0,0 +1 @@
++#include "collectives/collectives.h"
+diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu.cc
+similarity index 100%
+rename from src/collectives/device/all_gather.cu
+rename to src/collectives/device/all_gather.cu.cc
+diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu.cc
+similarity index 100%
+rename from src/collectives/device/all_reduce.cu
+rename to src/collectives/device/all_reduce.cu.cc
+diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu.cc
+similarity index 100%
+rename from src/collectives/device/broadcast.cu
+rename to src/collectives/device/broadcast.cu.cc
+diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
+index 8c336bf..2eef3ae 100644
+--- a/src/collectives/device/common.h
++++ b/src/collectives/device/common.h
+@@ -7,7 +7,7 @@
+ #ifndef NCCL_DEVICE_COMMON_H_
+ #define NCCL_DEVICE_COMMON_H_
+ 
+-#include "../collectives.h"
++#include "collectives.h"
+ #include "devcomm.h"
+ #include "nccl.h"
+ 
+diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu.cc
+similarity index 100%
+rename from src/collectives/device/functions.cu
+rename to src/collectives/device/functions.cu.cc
+diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu.cc
+similarity index 100%
+rename from src/collectives/device/reduce.cu
+rename to src/collectives/device/reduce.cu.cc
+diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu.cc
+similarity index 100%
+rename from src/collectives/device/reduce_scatter.cu
+rename to src/collectives/device/reduce_scatter.cu.cc
+diff --git a/src/nccl.h.in b/src/nccl.h
+similarity index 98%
+rename from src/nccl.h.in
+rename to src/nccl.h
+index 985274e..7ebb1e1 100644
+--- a/src/nccl.h.in
++++ b/src/nccl.h
+@@ -10,12 +10,12 @@
+ #include <cuda_runtime.h>
+ #include <cuda_fp16.h>
+ 
+-#define NCCL_MAJOR ${nccl:Major}
+-#define NCCL_MINOR ${nccl:Minor}
+-#define NCCL_PATCH ${nccl:Patch}
+-#define NCCL_SUFFIX "${nccl:Suffix}"
++#define NCCL_MAJOR 2
++#define NCCL_MINOR 4
++#define NCCL_PATCH 7
++#define NCCL_SUFFIX ""
+ 
+-#define NCCL_VERSION_CODE ${nccl:Version}
++#define NCCL_VERSION_CODE 2407
+ #define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
+ 
+ #ifdef __cplusplus
--- a/third_party/nccl/build_defs.bzl.tpl
+++ b/third_party/nccl/build_defs.bzl.tpl
@@ -3,90 +3,33 @@
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
 load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")

-def _process_src_impl(ctx):
-    """Applies various patches to the NCCL source."""
-    substitutions = {
-        "\"collectives.h": "\"collectives/collectives.h",
-        "\"../collectives.h": "\"collectives/collectives.h",
-        # Clang does not define __CUDACC_VER_*__, use CUDA_VERSION instead.
-        # TODO(csigg): Apply substitutions upstream and remove here.
-        "#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)": "#if CUDART_VERSION >= 9200",
-        "#if __CUDACC_VER_MAJOR__ >= 10": "#if CUDART_VERSION >= 10000",
-        "#if __CUDACC_VER_MAJOR__ >= 9": "#if CUDART_VERSION >= 9000",
-        "#if __CUDACC_VER_MAJOR__ < 9": "#if CUDART_VERSION < 9000",
-        "nullptr_t": "std::nullptr_t",
-    }
-    if ctx.file.src.basename == "nccl.h.in":
-        substitutions.update({
-          "${nccl:Major}": "2",
-          "${nccl:Minor}": "3",
-          "${nccl:Patch}": "5",
-          "${nccl:Suffix}": "",
-          "${nccl:Version}": "2305",
-        })
-    if ctx.file.src.basename == "function.cu":
-        substitutions.update({
-            # Don't try to initialize the host shadow copy of this device-side
-            # global variable. There is no host pointer to a device-side
-            # function, which confuses clang.
-            # TODO(csigg): remove when fixed in clang.
-            "NCCL_FUNCS2B(ncclBroadcast),": "#if __CUDA_ARCH__\nNCCL_FUNCS2B(ncclBroadcast),",
-            "NCCL_FUNCS2A(ncclAllReduce)": "NCCL_FUNCS2A(ncclAllReduce)\n#endif",
-        })
-    ctx.actions.expand_template(
-        output = ctx.outputs.out,
-        template = ctx.file.src,
-        substitutions = substitutions,
-    )
-
-_process_src = rule(
-    implementation = _process_src_impl,
-    attrs = {
-        "src": attr.label(allow_single_file = True),
-        "out": attr.output(),
-    },
-)
-"""Processes one NCCL source file so it can be compiled with bazel and clang."""
-
-def _out(src):
-    if not src.startswith("src/"):
-      fail("Source file not under src/...:", src)
-    src = src[4:]  # Strip 'src/'
-    if src == "nccl.h.in":
-      return "nccl.h"
-    if src.endswith(".cu"):
-      return src + ".cc"
-    return src
-
-def process_srcs(srcs):
-    """Processes files under src/ and copies them to the parent directory."""
-    [_process_src(
-      name = "_" + src,
-      src = src,
-      out = _out(src),
-    ) for src in srcs]
-    return ["_" + src for src in srcs]
-
 def _gen_device_srcs_impl(ctx):
+    ops = ["sum", "prod", "min", "max"]
+    types = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]
+    hdr_tail = "****************************************/"
+    defines = "\n\n#define NCCL_OP %d\n#define NCCL_TYPE %d"
+
    files = []
-    for src in ctx.files.srcs:
-        name = "%s_%s" % (ctx.attr.name, src.basename)
-        file = ctx.actions.declare_file(name, sibling = src)
-        ctx.actions.expand_template(
-            output = file,
-            template = src,
+    for NCCL_OP, op in enumerate(ops):
+        for NCCL_TYPE, dt in enumerate(types):
            substitutions = {
-                "#define UNROLL 4": "#define UNROLL 4\n#define NCCL_OP %d" % ctx.attr.NCCL_OP,
-            },
-        )
-        files.append(file)
+                hdr_tail: hdr_tail + defines % (NCCL_OP, NCCL_TYPE),
+            }
+            for src in ctx.files.srcs:
+                name = "%s_%s_%s" % (op, dt, src.basename)
+                file = ctx.actions.declare_file(name, sibling = src)
+                ctx.actions.expand_template(
+                    output = file,
+                    template = src,
+                    substitutions = substitutions,
+                )
+                files.append(file)
    return [DefaultInfo(files = depset(files))]

 gen_device_srcs = rule(
    implementation = _gen_device_srcs_impl,
    attrs = {
        "srcs": attr.label_list(allow_files = True),
-        "NCCL_OP": attr.int(),
    },
 )
 """Adds prefix to each file name in srcs and adds #define NCCL_OP."""
@@ -110,10 +53,6 @@ def _rdc_copts():
            "-fcuda-rdc",
            "-Xcuda-ptxas",
            maxrregcount,
-            # Work around for clang bug (fixed in r348662), declaring
-            # '__device__ operator delete(void*, std::size_t)' non-inline.
-            # TODO(csigg): Only add this option for older clang versions.
-            "-std=gnu++11",
        ],
        "//conditions:default": [],
    })
@@ -240,8 +179,7 @@ def _merge_archive_impl(ctx):
    ctx.actions.run_shell(
        inputs = ctx.files.srcs,  # + ctx.files._crosstool,
        outputs = [ctx.outputs.out],
-        command = ("printf \"%s\" " % mri_script +
-                   "| %s -M" % cc_toolchain.ar_executable),
+        command = "printf \"%s\" | %s -M" % (mri_script, cc_toolchain.ar_executable),
    )

 _merge_archive = rule(