Rolling forward CLs 252574722 and 252855085 with a fix.

Update NCCL from 2.3.5 to 2.4.7.

PiperOrigin-RevId: 253953922
This commit is contained in:
A. Unique TensorFlower 2019-06-19 01:50:16 -07:00 committed by TensorFlower Gardener
parent 8ec9355084
commit 77659d9c93
5 changed files with 136 additions and 137 deletions

View File

@ -192,6 +192,7 @@ tensorflow/third_party/nccl/LICENSE
tensorflow/third_party/nccl/system.BUILD.tpl tensorflow/third_party/nccl/system.BUILD.tpl
tensorflow/third_party/nccl/nccl_configure.bzl tensorflow/third_party/nccl/nccl_configure.bzl
tensorflow/third_party/nccl/build_defs.bzl.tpl tensorflow/third_party/nccl/build_defs.bzl.tpl
tensorflow/third_party/nccl/archive.patch
tensorflow/third_party/nccl/BUILD tensorflow/third_party/nccl/BUILD
tensorflow/third_party/boringssl/BUILD tensorflow/third_party/boringssl/BUILD
tensorflow/third_party/mpi/.gitignore tensorflow/third_party/mpi/.gitignore

View File

@ -602,11 +602,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
tf_http_archive( tf_http_archive(
name = "nccl_archive", name = "nccl_archive",
build_file = clean_dep("//third_party:nccl/archive.BUILD"), build_file = clean_dep("//third_party:nccl/archive.BUILD"),
sha256 = "19132b5127fa8e02d95a09795866923f04064c8f1e0770b2b42ab551408882a4", patch_file = clean_dep("//third_party/nccl:archive.patch"),
strip_prefix = "nccl-f93fe9bfd94884cec2ba711897222e0df5569a53", sha256 = "9a7633e224982e2b60fa6b397d895d20d6b7498e3e02f46f98a5a4e187c5a44c",
strip_prefix = "nccl-0ceaec9cee96ae7658aa45686853286651f36384",
urls = [ urls = [
"http://mirror.tensorflow.org/github.com/nvidia/nccl/archive/f93fe9bfd94884cec2ba711897222e0df5569a53.tar.gz", "http://mirror.tensorflow.org/github.com/nvidia/nccl/archive/0ceaec9cee96ae7658aa45686853286651f36384.tar.gz",
"https://github.com/nvidia/nccl/archive/f93fe9bfd94884cec2ba711897222e0df5569a53.tar.gz", "https://github.com/nvidia/nccl/archive/0ceaec9cee96ae7658aa45686853286651f36384.tar.gz",
], ],
) )

View File

@ -9,99 +9,89 @@ load(
"@local_config_nccl//:build_defs.bzl", "@local_config_nccl//:build_defs.bzl",
"cuda_rdc_library", "cuda_rdc_library",
"gen_device_srcs", "gen_device_srcs",
"process_srcs",
) )
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
cc_library( cc_library(
name = "src_hdrs", name = "src_hdrs",
hdrs = process_srcs([ hdrs = [
"src/collectives.h",
"src/collectives/collectives.h", "src/collectives/collectives.h",
"src/nccl.h.in", "src/nccl.h",
]), ],
strip_include_prefix = "src",
) )
cc_library( cc_library(
name = "include_hdrs", name = "include_hdrs",
hdrs = process_srcs(glob(["src/include/*.h"])), hdrs = glob(["src/include/*.h"]),
strip_include_prefix = "include", strip_include_prefix = "src/include",
deps = ["@local_config_cuda//cuda:cuda_headers"], deps = ["@local_config_cuda//cuda:cuda_headers"],
) )
device_srcs = process_srcs([ cc_library(
"src/collectives/device/all_gather.cu", name = "device_hdrs",
"src/collectives/device/all_reduce.cu", hdrs = glob(["src/collectives/device/*.h"]),
"src/collectives/device/broadcast.cu", strip_include_prefix = "src/collectives/device",
"src/collectives/device/reduce.cu", )
"src/collectives/device/reduce_scatter.cu",
])
# NCCL compiles the same source files with different NCCL_OP defines. RDC # NCCL compiles the same source files with different NCCL_OP/NCCL_TYPE defines.
# compilation requires that each compiled module has a unique ID. Clang derives # RDC compilation requires that each compiled module has a unique ID. Clang
# the module ID from the path only so we need to rename the files to get # derives the module ID from the path only so we need to copy the files to get
# different IDs for different parts of compilation. NVCC does not have that # different IDs for different parts of compilation. NVCC does not have that
# problem because it generates IDs based on preprocessed content. # problem because it generates IDs based on preprocessed content.
gen_device_srcs( gen_device_srcs(
name = "sum", name = "device_srcs",
srcs = device_srcs, srcs = [
NCCL_OP = 0, "src/collectives/device/all_gather.cu.cc",
) "src/collectives/device/all_reduce.cu.cc",
"src/collectives/device/broadcast.cu.cc",
gen_device_srcs( "src/collectives/device/reduce.cu.cc",
name = "prod", "src/collectives/device/reduce_scatter.cu.cc",
srcs = device_srcs, ],
NCCL_OP = 1,
)
gen_device_srcs(
name = "min",
srcs = device_srcs,
NCCL_OP = 2,
)
gen_device_srcs(
name = "max",
srcs = device_srcs,
NCCL_OP = 3,
) )
cuda_rdc_library( cuda_rdc_library(
name = "device", name = "device",
srcs = [ srcs = [
":max", "src/collectives/device/functions.cu.cc",
":min", ":device_srcs",
":prod", ] + glob([
":sum", # Required for header inclusion checking, see below for details.
] + process_srcs(glob([
"src/collectives/device/*.h", "src/collectives/device/*.h",
"src/collectives/device/functions.cu", "src/nccl.h",
])), ]),
deps = [ deps = [
":device_hdrs",
":include_hdrs", ":include_hdrs",
":src_hdrs", ":src_hdrs",
"@local_config_cuda//cuda:cuda_headers",
], ],
) )
# Primary NCCL target. # Primary NCCL target.
cc_library( cc_library(
name = "nccl", name = "nccl",
srcs = process_srcs(glob( srcs = glob(
include = ["src/**/*.cu"], include = ["src/**/*.cc"],
# Exclude device-library code. # Exclude device-library code.
exclude = ["src/collectives/device/**"], exclude = ["src/collectives/device/**"],
)) + [ ) + [
# Required for header inclusion checking (see # Required for header inclusion checking (see
# http://docs.bazel.build/versions/master/be/c-cpp.html#hdrs). # http://docs.bazel.build/versions/master/be/c-cpp.html#hdrs).
"nccl.h", # Files in src/ which #include "nccl.h" load it from there rather than
"collectives/collectives.h", # from the virtual includes directory.
"src/collectives.h",
"src/collectives/collectives.h",
"src/nccl.h",
], ],
hdrs = ["nccl.h"], hdrs = ["src/nccl.h"],
copts = cuda_default_copts() + ["-Wno-vla"],
include_prefix = "third_party/nccl", include_prefix = "third_party/nccl",
strip_include_prefix = "src",
visibility = ["//visibility:public"], visibility = ["//visibility:public"],
deps = [ deps = [
":device", ":device",
":include_hdrs", ":include_hdrs",
":src_hdrs",
"@local_config_cuda//cuda:cudart_static", "@local_config_cuda//cuda:cudart_static",
], ],
) )

69
third_party/nccl/archive.patch vendored Normal file
View File

@ -0,0 +1,69 @@
diff --git a/src/collectives.h b/src/collectives.h
new file mode 100644
index 0000000..7d04b16
--- /dev/null
+++ b/src/collectives.h
@@ -0,0 +1 @@
+#include "collectives/collectives.h"
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu.cc
similarity index 100%
rename from src/collectives/device/all_gather.cu
rename to src/collectives/device/all_gather.cu.cc
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu.cc
similarity index 100%
rename from src/collectives/device/all_reduce.cu
rename to src/collectives/device/all_reduce.cu.cc
diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu.cc
similarity index 100%
rename from src/collectives/device/broadcast.cu
rename to src/collectives/device/broadcast.cu.cc
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index 8c336bf..2eef3ae 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -7,7 +7,7 @@
#ifndef NCCL_DEVICE_COMMON_H_
#define NCCL_DEVICE_COMMON_H_
-#include "../collectives.h"
+#include "collectives.h"
#include "devcomm.h"
#include "nccl.h"
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu.cc
similarity index 100%
rename from src/collectives/device/functions.cu
rename to src/collectives/device/functions.cu.cc
diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu.cc
similarity index 100%
rename from src/collectives/device/reduce.cu
rename to src/collectives/device/reduce.cu.cc
diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu.cc
similarity index 100%
rename from src/collectives/device/reduce_scatter.cu
rename to src/collectives/device/reduce_scatter.cu.cc
diff --git a/src/nccl.h.in b/src/nccl.h
similarity index 98%
rename from src/nccl.h.in
rename to src/nccl.h
index 985274e..7ebb1e1 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h
@@ -10,12 +10,12 @@
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#define NCCL_MAJOR ${nccl:Major}
-#define NCCL_MINOR ${nccl:Minor}
-#define NCCL_PATCH ${nccl:Patch}
-#define NCCL_SUFFIX "${nccl:Suffix}"
+#define NCCL_MAJOR 2
+#define NCCL_MINOR 4
+#define NCCL_PATCH 7
+#define NCCL_SUFFIX ""
-#define NCCL_VERSION_CODE ${nccl:Version}
+#define NCCL_VERSION_CODE 2407
#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
#ifdef __cplusplus

View File

@ -3,90 +3,33 @@
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts") load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain") load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
def _process_src_impl(ctx):
"""Applies various patches to the NCCL source."""
substitutions = {
"\"collectives.h": "\"collectives/collectives.h",
"\"../collectives.h": "\"collectives/collectives.h",
# Clang does not define __CUDACC_VER_*__, use CUDA_VERSION instead.
# TODO(csigg): Apply substitutions upstream and remove here.
"#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)": "#if CUDART_VERSION >= 9200",
"#if __CUDACC_VER_MAJOR__ >= 10": "#if CUDART_VERSION >= 10000",
"#if __CUDACC_VER_MAJOR__ >= 9": "#if CUDART_VERSION >= 9000",
"#if __CUDACC_VER_MAJOR__ < 9": "#if CUDART_VERSION < 9000",
"nullptr_t": "std::nullptr_t",
}
if ctx.file.src.basename == "nccl.h.in":
substitutions.update({
"${nccl:Major}": "2",
"${nccl:Minor}": "3",
"${nccl:Patch}": "5",
"${nccl:Suffix}": "",
"${nccl:Version}": "2305",
})
if ctx.file.src.basename == "function.cu":
substitutions.update({
# Don't try to initialize the host shadow copy of this device-side
# global variable. There is no host pointer to a device-side
# function, which confuses clang.
# TODO(csigg): remove when fixed in clang.
"NCCL_FUNCS2B(ncclBroadcast),": "#if __CUDA_ARCH__\nNCCL_FUNCS2B(ncclBroadcast),",
"NCCL_FUNCS2A(ncclAllReduce)": "NCCL_FUNCS2A(ncclAllReduce)\n#endif",
})
ctx.actions.expand_template(
output = ctx.outputs.out,
template = ctx.file.src,
substitutions = substitutions,
)
_process_src = rule(
implementation = _process_src_impl,
attrs = {
"src": attr.label(allow_single_file = True),
"out": attr.output(),
},
)
"""Processes one NCCL source file so it can be compiled with bazel and clang."""
def _out(src):
if not src.startswith("src/"):
fail("Source file not under src/...:", src)
src = src[4:] # Strip 'src/'
if src == "nccl.h.in":
return "nccl.h"
if src.endswith(".cu"):
return src + ".cc"
return src
def process_srcs(srcs):
"""Processes files under src/ and copies them to the parent directory."""
[_process_src(
name = "_" + src,
src = src,
out = _out(src),
) for src in srcs]
return ["_" + src for src in srcs]
def _gen_device_srcs_impl(ctx): def _gen_device_srcs_impl(ctx):
ops = ["sum", "prod", "min", "max"]
types = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]
hdr_tail = "****************************************/"
defines = "\n\n#define NCCL_OP %d\n#define NCCL_TYPE %d"
files = [] files = []
for src in ctx.files.srcs: for NCCL_OP, op in enumerate(ops):
name = "%s_%s" % (ctx.attr.name, src.basename) for NCCL_TYPE, dt in enumerate(types):
file = ctx.actions.declare_file(name, sibling = src)
ctx.actions.expand_template(
output = file,
template = src,
substitutions = { substitutions = {
"#define UNROLL 4": "#define UNROLL 4\n#define NCCL_OP %d" % ctx.attr.NCCL_OP, hdr_tail: hdr_tail + defines % (NCCL_OP, NCCL_TYPE),
}, }
) for src in ctx.files.srcs:
files.append(file) name = "%s_%s_%s" % (op, dt, src.basename)
file = ctx.actions.declare_file(name, sibling = src)
ctx.actions.expand_template(
output = file,
template = src,
substitutions = substitutions,
)
files.append(file)
return [DefaultInfo(files = depset(files))] return [DefaultInfo(files = depset(files))]
gen_device_srcs = rule( gen_device_srcs = rule(
implementation = _gen_device_srcs_impl, implementation = _gen_device_srcs_impl,
attrs = { attrs = {
"srcs": attr.label_list(allow_files = True), "srcs": attr.label_list(allow_files = True),
"NCCL_OP": attr.int(),
}, },
) )
"""Adds prefix to each file name in srcs and adds #define NCCL_OP.""" """Adds prefix to each file name in srcs and adds #define NCCL_OP."""
@ -110,10 +53,6 @@ def _rdc_copts():
"-fcuda-rdc", "-fcuda-rdc",
"-Xcuda-ptxas", "-Xcuda-ptxas",
maxrregcount, maxrregcount,
# Work around for clang bug (fixed in r348662), declaring
# '__device__ operator delete(void*, std::size_t)' non-inline.
# TODO(csigg): Only add this option for older clang versions.
"-std=gnu++11",
], ],
"//conditions:default": [], "//conditions:default": [],
}) })
@ -240,8 +179,7 @@ def _merge_archive_impl(ctx):
ctx.actions.run_shell( ctx.actions.run_shell(
inputs = ctx.files.srcs, # + ctx.files._crosstool, inputs = ctx.files.srcs, # + ctx.files._crosstool,
outputs = [ctx.outputs.out], outputs = [ctx.outputs.out],
command = ("printf \"%s\" " % mri_script + command = "printf \"%s\" | %s -M" % (mri_script, cc_toolchain.ar_executable),
"| %s -M" % cc_toolchain.ar_executable),
) )
_merge_archive = rule( _merge_archive = rule(