Rolling forward CLs 252574722 and 252855085 with a fix.

Update NCCL from 2.3.5 to 2.4.7.

PiperOrigin-RevId: 253953922
This commit is contained in:
A. Unique TensorFlower 2019-06-19 01:50:16 -07:00 committed by TensorFlower Gardener
parent 8ec9355084
commit 77659d9c93
5 changed files with 136 additions and 137 deletions

View File

@ -192,6 +192,7 @@ tensorflow/third_party/nccl/LICENSE
tensorflow/third_party/nccl/system.BUILD.tpl
tensorflow/third_party/nccl/nccl_configure.bzl
tensorflow/third_party/nccl/build_defs.bzl.tpl
tensorflow/third_party/nccl/archive.patch
tensorflow/third_party/nccl/BUILD
tensorflow/third_party/boringssl/BUILD
tensorflow/third_party/mpi/.gitignore

View File

@ -602,11 +602,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
tf_http_archive(
name = "nccl_archive",
build_file = clean_dep("//third_party:nccl/archive.BUILD"),
sha256 = "19132b5127fa8e02d95a09795866923f04064c8f1e0770b2b42ab551408882a4",
strip_prefix = "nccl-f93fe9bfd94884cec2ba711897222e0df5569a53",
patch_file = clean_dep("//third_party/nccl:archive.patch"),
sha256 = "9a7633e224982e2b60fa6b397d895d20d6b7498e3e02f46f98a5a4e187c5a44c",
strip_prefix = "nccl-0ceaec9cee96ae7658aa45686853286651f36384",
urls = [
"http://mirror.tensorflow.org/github.com/nvidia/nccl/archive/f93fe9bfd94884cec2ba711897222e0df5569a53.tar.gz",
"https://github.com/nvidia/nccl/archive/f93fe9bfd94884cec2ba711897222e0df5569a53.tar.gz",
"http://mirror.tensorflow.org/github.com/nvidia/nccl/archive/0ceaec9cee96ae7658aa45686853286651f36384.tar.gz",
"https://github.com/nvidia/nccl/archive/0ceaec9cee96ae7658aa45686853286651f36384.tar.gz",
],
)

View File

@ -9,99 +9,89 @@ load(
"@local_config_nccl//:build_defs.bzl",
"cuda_rdc_library",
"gen_device_srcs",
"process_srcs",
)
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
cc_library(
name = "src_hdrs",
hdrs = process_srcs([
hdrs = [
"src/collectives.h",
"src/collectives/collectives.h",
"src/nccl.h.in",
]),
"src/nccl.h",
],
strip_include_prefix = "src",
)
cc_library(
name = "include_hdrs",
hdrs = process_srcs(glob(["src/include/*.h"])),
strip_include_prefix = "include",
hdrs = glob(["src/include/*.h"]),
strip_include_prefix = "src/include",
deps = ["@local_config_cuda//cuda:cuda_headers"],
)
device_srcs = process_srcs([
"src/collectives/device/all_gather.cu",
"src/collectives/device/all_reduce.cu",
"src/collectives/device/broadcast.cu",
"src/collectives/device/reduce.cu",
"src/collectives/device/reduce_scatter.cu",
])
cc_library(
name = "device_hdrs",
hdrs = glob(["src/collectives/device/*.h"]),
strip_include_prefix = "src/collectives/device",
)
# NCCL compiles the same source files with different NCCL_OP defines. RDC
# compilation requires that each compiled module has a unique ID. Clang derives
# the module ID from the path only so we need to rename the files to get
# NCCL compiles the same source files with different NCCL_OP/NCCL_TYPE defines.
# RDC compilation requires that each compiled module has a unique ID. Clang
# derives the module ID from the path only so we need to copy the files to get
# different IDs for different parts of compilation. NVCC does not have that
# problem because it generates IDs based on preprocessed content.
gen_device_srcs(
name = "sum",
srcs = device_srcs,
NCCL_OP = 0,
)
gen_device_srcs(
name = "prod",
srcs = device_srcs,
NCCL_OP = 1,
)
gen_device_srcs(
name = "min",
srcs = device_srcs,
NCCL_OP = 2,
)
gen_device_srcs(
name = "max",
srcs = device_srcs,
NCCL_OP = 3,
name = "device_srcs",
srcs = [
"src/collectives/device/all_gather.cu.cc",
"src/collectives/device/all_reduce.cu.cc",
"src/collectives/device/broadcast.cu.cc",
"src/collectives/device/reduce.cu.cc",
"src/collectives/device/reduce_scatter.cu.cc",
],
)
cuda_rdc_library(
name = "device",
srcs = [
":max",
":min",
":prod",
":sum",
] + process_srcs(glob([
"src/collectives/device/functions.cu.cc",
":device_srcs",
] + glob([
# Required for header inclusion checking, see below for details.
"src/collectives/device/*.h",
"src/collectives/device/functions.cu",
])),
"src/nccl.h",
]),
deps = [
":device_hdrs",
":include_hdrs",
":src_hdrs",
"@local_config_cuda//cuda:cuda_headers",
],
)
# Primary NCCL target.
cc_library(
name = "nccl",
srcs = process_srcs(glob(
include = ["src/**/*.cu"],
srcs = glob(
include = ["src/**/*.cc"],
# Exclude device-library code.
exclude = ["src/collectives/device/**"],
)) + [
) + [
# Required for header inclusion checking (see
# http://docs.bazel.build/versions/master/be/c-cpp.html#hdrs).
"nccl.h",
"collectives/collectives.h",
# Files in src/ which #include "nccl.h" load it from there rather than
# from the virtual includes directory.
"src/collectives.h",
"src/collectives/collectives.h",
"src/nccl.h",
],
hdrs = ["nccl.h"],
copts = cuda_default_copts() + ["-Wno-vla"],
hdrs = ["src/nccl.h"],
include_prefix = "third_party/nccl",
strip_include_prefix = "src",
visibility = ["//visibility:public"],
deps = [
":device",
":include_hdrs",
":src_hdrs",
"@local_config_cuda//cuda:cudart_static",
],
)

69
third_party/nccl/archive.patch vendored Normal file
View File

@ -0,0 +1,69 @@
diff --git a/src/collectives.h b/src/collectives.h
new file mode 100644
index 0000000..7d04b16
--- /dev/null
+++ b/src/collectives.h
@@ -0,0 +1 @@
+#include "collectives/collectives.h"
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu.cc
similarity index 100%
rename from src/collectives/device/all_gather.cu
rename to src/collectives/device/all_gather.cu.cc
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu.cc
similarity index 100%
rename from src/collectives/device/all_reduce.cu
rename to src/collectives/device/all_reduce.cu.cc
diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu.cc
similarity index 100%
rename from src/collectives/device/broadcast.cu
rename to src/collectives/device/broadcast.cu.cc
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index 8c336bf..2eef3ae 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -7,7 +7,7 @@
#ifndef NCCL_DEVICE_COMMON_H_
#define NCCL_DEVICE_COMMON_H_
-#include "../collectives.h"
+#include "collectives.h"
#include "devcomm.h"
#include "nccl.h"
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu.cc
similarity index 100%
rename from src/collectives/device/functions.cu
rename to src/collectives/device/functions.cu.cc
diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu.cc
similarity index 100%
rename from src/collectives/device/reduce.cu
rename to src/collectives/device/reduce.cu.cc
diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu.cc
similarity index 100%
rename from src/collectives/device/reduce_scatter.cu
rename to src/collectives/device/reduce_scatter.cu.cc
diff --git a/src/nccl.h.in b/src/nccl.h
similarity index 98%
rename from src/nccl.h.in
rename to src/nccl.h
index 985274e..7ebb1e1 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h
@@ -10,12 +10,12 @@
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#define NCCL_MAJOR ${nccl:Major}
-#define NCCL_MINOR ${nccl:Minor}
-#define NCCL_PATCH ${nccl:Patch}
-#define NCCL_SUFFIX "${nccl:Suffix}"
+#define NCCL_MAJOR 2
+#define NCCL_MINOR 4
+#define NCCL_PATCH 7
+#define NCCL_SUFFIX ""
-#define NCCL_VERSION_CODE ${nccl:Version}
+#define NCCL_VERSION_CODE 2407
#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
#ifdef __cplusplus

View File

@ -3,90 +3,33 @@
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
def _process_src_impl(ctx):
"""Applies various patches to the NCCL source."""
substitutions = {
"\"collectives.h": "\"collectives/collectives.h",
"\"../collectives.h": "\"collectives/collectives.h",
# Clang does not define __CUDACC_VER_*__, use CUDA_VERSION instead.
# TODO(csigg): Apply substitutions upstream and remove here.
"#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)": "#if CUDART_VERSION >= 9200",
"#if __CUDACC_VER_MAJOR__ >= 10": "#if CUDART_VERSION >= 10000",
"#if __CUDACC_VER_MAJOR__ >= 9": "#if CUDART_VERSION >= 9000",
"#if __CUDACC_VER_MAJOR__ < 9": "#if CUDART_VERSION < 9000",
"nullptr_t": "std::nullptr_t",
}
if ctx.file.src.basename == "nccl.h.in":
substitutions.update({
"${nccl:Major}": "2",
"${nccl:Minor}": "3",
"${nccl:Patch}": "5",
"${nccl:Suffix}": "",
"${nccl:Version}": "2305",
})
if ctx.file.src.basename == "function.cu":
substitutions.update({
# Don't try to initialize the host shadow copy of this device-side
# global variable. There is no host pointer to a device-side
# function, which confuses clang.
# TODO(csigg): remove when fixed in clang.
"NCCL_FUNCS2B(ncclBroadcast),": "#if __CUDA_ARCH__\nNCCL_FUNCS2B(ncclBroadcast),",
"NCCL_FUNCS2A(ncclAllReduce)": "NCCL_FUNCS2A(ncclAllReduce)\n#endif",
})
ctx.actions.expand_template(
output = ctx.outputs.out,
template = ctx.file.src,
substitutions = substitutions,
)
_process_src = rule(
implementation = _process_src_impl,
attrs = {
"src": attr.label(allow_single_file = True),
"out": attr.output(),
},
)
"""Processes one NCCL source file so it can be compiled with bazel and clang."""
def _out(src):
if not src.startswith("src/"):
fail("Source file not under src/...:", src)
src = src[4:] # Strip 'src/'
if src == "nccl.h.in":
return "nccl.h"
if src.endswith(".cu"):
return src + ".cc"
return src
def process_srcs(srcs):
"""Processes files under src/ and copies them to the parent directory."""
[_process_src(
name = "_" + src,
src = src,
out = _out(src),
) for src in srcs]
return ["_" + src for src in srcs]
def _gen_device_srcs_impl(ctx):
ops = ["sum", "prod", "min", "max"]
types = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]
hdr_tail = "****************************************/"
defines = "\n\n#define NCCL_OP %d\n#define NCCL_TYPE %d"
files = []
for src in ctx.files.srcs:
name = "%s_%s" % (ctx.attr.name, src.basename)
file = ctx.actions.declare_file(name, sibling = src)
ctx.actions.expand_template(
output = file,
template = src,
for NCCL_OP, op in enumerate(ops):
for NCCL_TYPE, dt in enumerate(types):
substitutions = {
"#define UNROLL 4": "#define UNROLL 4\n#define NCCL_OP %d" % ctx.attr.NCCL_OP,
},
)
files.append(file)
hdr_tail: hdr_tail + defines % (NCCL_OP, NCCL_TYPE),
}
for src in ctx.files.srcs:
name = "%s_%s_%s" % (op, dt, src.basename)
file = ctx.actions.declare_file(name, sibling = src)
ctx.actions.expand_template(
output = file,
template = src,
substitutions = substitutions,
)
files.append(file)
return [DefaultInfo(files = depset(files))]
gen_device_srcs = rule(
implementation = _gen_device_srcs_impl,
attrs = {
"srcs": attr.label_list(allow_files = True),
"NCCL_OP": attr.int(),
},
)
"""Adds prefix to each file name in srcs and adds #define NCCL_OP."""
@ -110,10 +53,6 @@ def _rdc_copts():
"-fcuda-rdc",
"-Xcuda-ptxas",
maxrregcount,
# Work around for clang bug (fixed in r348662), declaring
# '__device__ operator delete(void*, std::size_t)' non-inline.
# TODO(csigg): Only add this option for older clang versions.
"-std=gnu++11",
],
"//conditions:default": [],
})
@ -240,8 +179,7 @@ def _merge_archive_impl(ctx):
ctx.actions.run_shell(
inputs = ctx.files.srcs, # + ctx.files._crosstool,
outputs = [ctx.outputs.out],
command = ("printf \"%s\" " % mri_script +
"| %s -M" % cc_toolchain.ar_executable),
command = "printf \"%s\" | %s -M" % (mri_script, cc_toolchain.ar_executable),
)
_merge_archive = rule(