Rolling forward CL 252574722 and 252855085 with a fix.
Update NCCL from 2.3.5 to 2.4.7. PiperOrigin-RevId: 253953922
This commit is contained in:
parent
8ec9355084
commit
77659d9c93
tensorflow
third_party/nccl
@ -192,6 +192,7 @@ tensorflow/third_party/nccl/LICENSE
|
||||
tensorflow/third_party/nccl/system.BUILD.tpl
|
||||
tensorflow/third_party/nccl/nccl_configure.bzl
|
||||
tensorflow/third_party/nccl/build_defs.bzl.tpl
|
||||
tensorflow/third_party/nccl/archive.patch
|
||||
tensorflow/third_party/nccl/BUILD
|
||||
tensorflow/third_party/boringssl/BUILD
|
||||
tensorflow/third_party/mpi/.gitignore
|
||||
|
@ -602,11 +602,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
|
||||
tf_http_archive(
|
||||
name = "nccl_archive",
|
||||
build_file = clean_dep("//third_party:nccl/archive.BUILD"),
|
||||
sha256 = "19132b5127fa8e02d95a09795866923f04064c8f1e0770b2b42ab551408882a4",
|
||||
strip_prefix = "nccl-f93fe9bfd94884cec2ba711897222e0df5569a53",
|
||||
patch_file = clean_dep("//third_party/nccl:archive.patch"),
|
||||
sha256 = "9a7633e224982e2b60fa6b397d895d20d6b7498e3e02f46f98a5a4e187c5a44c",
|
||||
strip_prefix = "nccl-0ceaec9cee96ae7658aa45686853286651f36384",
|
||||
urls = [
|
||||
"http://mirror.tensorflow.org/github.com/nvidia/nccl/archive/f93fe9bfd94884cec2ba711897222e0df5569a53.tar.gz",
|
||||
"https://github.com/nvidia/nccl/archive/f93fe9bfd94884cec2ba711897222e0df5569a53.tar.gz",
|
||||
"http://mirror.tensorflow.org/github.com/nvidia/nccl/archive/0ceaec9cee96ae7658aa45686853286651f36384.tar.gz",
|
||||
"https://github.com/nvidia/nccl/archive/0ceaec9cee96ae7658aa45686853286651f36384.tar.gz",
|
||||
],
|
||||
)
|
||||
|
||||
|
94
third_party/nccl/archive.BUILD
vendored
94
third_party/nccl/archive.BUILD
vendored
@ -9,99 +9,89 @@ load(
|
||||
"@local_config_nccl//:build_defs.bzl",
|
||||
"cuda_rdc_library",
|
||||
"gen_device_srcs",
|
||||
"process_srcs",
|
||||
)
|
||||
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
|
||||
|
||||
cc_library(
|
||||
name = "src_hdrs",
|
||||
hdrs = process_srcs([
|
||||
hdrs = [
|
||||
"src/collectives.h",
|
||||
"src/collectives/collectives.h",
|
||||
"src/nccl.h.in",
|
||||
]),
|
||||
"src/nccl.h",
|
||||
],
|
||||
strip_include_prefix = "src",
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "include_hdrs",
|
||||
hdrs = process_srcs(glob(["src/include/*.h"])),
|
||||
strip_include_prefix = "include",
|
||||
hdrs = glob(["src/include/*.h"]),
|
||||
strip_include_prefix = "src/include",
|
||||
deps = ["@local_config_cuda//cuda:cuda_headers"],
|
||||
)
|
||||
|
||||
device_srcs = process_srcs([
|
||||
"src/collectives/device/all_gather.cu",
|
||||
"src/collectives/device/all_reduce.cu",
|
||||
"src/collectives/device/broadcast.cu",
|
||||
"src/collectives/device/reduce.cu",
|
||||
"src/collectives/device/reduce_scatter.cu",
|
||||
])
|
||||
cc_library(
|
||||
name = "device_hdrs",
|
||||
hdrs = glob(["src/collectives/device/*.h"]),
|
||||
strip_include_prefix = "src/collectives/device",
|
||||
)
|
||||
|
||||
# NCCL compiles the same source files with different NCCL_OP defines. RDC
|
||||
# compilation requires that each compiled module has a unique ID. Clang derives
|
||||
# the module ID from the path only so we need to rename the files to get
|
||||
# NCCL compiles the same source files with different NCCL_OP/NCCL_TYPE defines.
|
||||
# RDC compilation requires that each compiled module has a unique ID. Clang
|
||||
# derives the module ID from the path only so we need to copy the files to get
|
||||
# different IDs for different parts of compilation. NVCC does not have that
|
||||
# problem because it generates IDs based on preprocessed content.
|
||||
gen_device_srcs(
|
||||
name = "sum",
|
||||
srcs = device_srcs,
|
||||
NCCL_OP = 0,
|
||||
)
|
||||
|
||||
gen_device_srcs(
|
||||
name = "prod",
|
||||
srcs = device_srcs,
|
||||
NCCL_OP = 1,
|
||||
)
|
||||
|
||||
gen_device_srcs(
|
||||
name = "min",
|
||||
srcs = device_srcs,
|
||||
NCCL_OP = 2,
|
||||
)
|
||||
|
||||
gen_device_srcs(
|
||||
name = "max",
|
||||
srcs = device_srcs,
|
||||
NCCL_OP = 3,
|
||||
name = "device_srcs",
|
||||
srcs = [
|
||||
"src/collectives/device/all_gather.cu.cc",
|
||||
"src/collectives/device/all_reduce.cu.cc",
|
||||
"src/collectives/device/broadcast.cu.cc",
|
||||
"src/collectives/device/reduce.cu.cc",
|
||||
"src/collectives/device/reduce_scatter.cu.cc",
|
||||
],
|
||||
)
|
||||
|
||||
cuda_rdc_library(
|
||||
name = "device",
|
||||
srcs = [
|
||||
":max",
|
||||
":min",
|
||||
":prod",
|
||||
":sum",
|
||||
] + process_srcs(glob([
|
||||
"src/collectives/device/functions.cu.cc",
|
||||
":device_srcs",
|
||||
] + glob([
|
||||
# Required for header inclusion checking, see below for details.
|
||||
"src/collectives/device/*.h",
|
||||
"src/collectives/device/functions.cu",
|
||||
])),
|
||||
"src/nccl.h",
|
||||
]),
|
||||
deps = [
|
||||
":device_hdrs",
|
||||
":include_hdrs",
|
||||
":src_hdrs",
|
||||
"@local_config_cuda//cuda:cuda_headers",
|
||||
],
|
||||
)
|
||||
|
||||
# Primary NCCL target.
|
||||
cc_library(
|
||||
name = "nccl",
|
||||
srcs = process_srcs(glob(
|
||||
include = ["src/**/*.cu"],
|
||||
srcs = glob(
|
||||
include = ["src/**/*.cc"],
|
||||
# Exclude device-library code.
|
||||
exclude = ["src/collectives/device/**"],
|
||||
)) + [
|
||||
) + [
|
||||
# Required for header inclusion checking (see
|
||||
# http://docs.bazel.build/versions/master/be/c-cpp.html#hdrs).
|
||||
"nccl.h",
|
||||
"collectives/collectives.h",
|
||||
# Files in src/ which #include "nccl.h" load it from there rather than
|
||||
# from the virtual includes directory.
|
||||
"src/collectives.h",
|
||||
"src/collectives/collectives.h",
|
||||
"src/nccl.h",
|
||||
],
|
||||
hdrs = ["nccl.h"],
|
||||
copts = cuda_default_copts() + ["-Wno-vla"],
|
||||
hdrs = ["src/nccl.h"],
|
||||
include_prefix = "third_party/nccl",
|
||||
strip_include_prefix = "src",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":device",
|
||||
":include_hdrs",
|
||||
":src_hdrs",
|
||||
"@local_config_cuda//cuda:cudart_static",
|
||||
],
|
||||
)
|
||||
|
69
third_party/nccl/archive.patch
vendored
Normal file
69
third_party/nccl/archive.patch
vendored
Normal file
@ -0,0 +1,69 @@
|
||||
diff --git a/src/collectives.h b/src/collectives.h
|
||||
new file mode 100644
|
||||
index 0000000..7d04b16
|
||||
--- /dev/null
|
||||
+++ b/src/collectives.h
|
||||
@@ -0,0 +1 @@
|
||||
+#include "collectives/collectives.h"
|
||||
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu.cc
|
||||
similarity index 100%
|
||||
rename from src/collectives/device/all_gather.cu
|
||||
rename to src/collectives/device/all_gather.cu.cc
|
||||
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu.cc
|
||||
similarity index 100%
|
||||
rename from src/collectives/device/all_reduce.cu
|
||||
rename to src/collectives/device/all_reduce.cu.cc
|
||||
diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu.cc
|
||||
similarity index 100%
|
||||
rename from src/collectives/device/broadcast.cu
|
||||
rename to src/collectives/device/broadcast.cu.cc
|
||||
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
|
||||
index 8c336bf..2eef3ae 100644
|
||||
--- a/src/collectives/device/common.h
|
||||
+++ b/src/collectives/device/common.h
|
||||
@@ -7,7 +7,7 @@
|
||||
#ifndef NCCL_DEVICE_COMMON_H_
|
||||
#define NCCL_DEVICE_COMMON_H_
|
||||
|
||||
-#include "../collectives.h"
|
||||
+#include "collectives.h"
|
||||
#include "devcomm.h"
|
||||
#include "nccl.h"
|
||||
|
||||
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu.cc
|
||||
similarity index 100%
|
||||
rename from src/collectives/device/functions.cu
|
||||
rename to src/collectives/device/functions.cu.cc
|
||||
diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu.cc
|
||||
similarity index 100%
|
||||
rename from src/collectives/device/reduce.cu
|
||||
rename to src/collectives/device/reduce.cu.cc
|
||||
diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu.cc
|
||||
similarity index 100%
|
||||
rename from src/collectives/device/reduce_scatter.cu
|
||||
rename to src/collectives/device/reduce_scatter.cu.cc
|
||||
diff --git a/src/nccl.h.in b/src/nccl.h
|
||||
similarity index 98%
|
||||
rename from src/nccl.h.in
|
||||
rename to src/nccl.h
|
||||
index 985274e..7ebb1e1 100644
|
||||
--- a/src/nccl.h.in
|
||||
+++ b/src/nccl.h
|
||||
@@ -10,12 +10,12 @@
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
-#define NCCL_MAJOR ${nccl:Major}
|
||||
-#define NCCL_MINOR ${nccl:Minor}
|
||||
-#define NCCL_PATCH ${nccl:Patch}
|
||||
-#define NCCL_SUFFIX "${nccl:Suffix}"
|
||||
+#define NCCL_MAJOR 2
|
||||
+#define NCCL_MINOR 4
|
||||
+#define NCCL_PATCH 7
|
||||
+#define NCCL_SUFFIX ""
|
||||
|
||||
-#define NCCL_VERSION_CODE ${nccl:Version}
|
||||
+#define NCCL_VERSION_CODE 2407
|
||||
#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
|
||||
|
||||
#ifdef __cplusplus
|
100
third_party/nccl/build_defs.bzl.tpl
vendored
100
third_party/nccl/build_defs.bzl.tpl
vendored
@ -3,90 +3,33 @@
|
||||
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
|
||||
load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
|
||||
|
||||
def _process_src_impl(ctx):
|
||||
"""Applies various patches to the NCCL source."""
|
||||
substitutions = {
|
||||
"\"collectives.h": "\"collectives/collectives.h",
|
||||
"\"../collectives.h": "\"collectives/collectives.h",
|
||||
# Clang does not define __CUDACC_VER_*__, use CUDA_VERSION instead.
|
||||
# TODO(csigg): Apply substitutions upstream and remove here.
|
||||
"#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)": "#if CUDART_VERSION >= 9200",
|
||||
"#if __CUDACC_VER_MAJOR__ >= 10": "#if CUDART_VERSION >= 10000",
|
||||
"#if __CUDACC_VER_MAJOR__ >= 9": "#if CUDART_VERSION >= 9000",
|
||||
"#if __CUDACC_VER_MAJOR__ < 9": "#if CUDART_VERSION < 9000",
|
||||
"nullptr_t": "std::nullptr_t",
|
||||
}
|
||||
if ctx.file.src.basename == "nccl.h.in":
|
||||
substitutions.update({
|
||||
"${nccl:Major}": "2",
|
||||
"${nccl:Minor}": "3",
|
||||
"${nccl:Patch}": "5",
|
||||
"${nccl:Suffix}": "",
|
||||
"${nccl:Version}": "2305",
|
||||
})
|
||||
if ctx.file.src.basename == "function.cu":
|
||||
substitutions.update({
|
||||
# Don't try to initialize the host shadow copy of this device-side
|
||||
# global variable. There is no host pointer to a device-side
|
||||
# function, which confuses clang.
|
||||
# TODO(csigg): remove when fixed in clang.
|
||||
"NCCL_FUNCS2B(ncclBroadcast),": "#if __CUDA_ARCH__\nNCCL_FUNCS2B(ncclBroadcast),",
|
||||
"NCCL_FUNCS2A(ncclAllReduce)": "NCCL_FUNCS2A(ncclAllReduce)\n#endif",
|
||||
})
|
||||
ctx.actions.expand_template(
|
||||
output = ctx.outputs.out,
|
||||
template = ctx.file.src,
|
||||
substitutions = substitutions,
|
||||
)
|
||||
|
||||
_process_src = rule(
|
||||
implementation = _process_src_impl,
|
||||
attrs = {
|
||||
"src": attr.label(allow_single_file = True),
|
||||
"out": attr.output(),
|
||||
},
|
||||
)
|
||||
"""Processes one NCCL source file so it can be compiled with bazel and clang."""
|
||||
|
||||
def _out(src):
|
||||
if not src.startswith("src/"):
|
||||
fail("Source file not under src/...:", src)
|
||||
src = src[4:] # Strip 'src/'
|
||||
if src == "nccl.h.in":
|
||||
return "nccl.h"
|
||||
if src.endswith(".cu"):
|
||||
return src + ".cc"
|
||||
return src
|
||||
|
||||
def process_srcs(srcs):
|
||||
"""Processes files under src/ and copies them to the parent directory."""
|
||||
[_process_src(
|
||||
name = "_" + src,
|
||||
src = src,
|
||||
out = _out(src),
|
||||
) for src in srcs]
|
||||
return ["_" + src for src in srcs]
|
||||
|
||||
def _gen_device_srcs_impl(ctx):
|
||||
ops = ["sum", "prod", "min", "max"]
|
||||
types = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]
|
||||
hdr_tail = "****************************************/"
|
||||
defines = "\n\n#define NCCL_OP %d\n#define NCCL_TYPE %d"
|
||||
|
||||
files = []
|
||||
for src in ctx.files.srcs:
|
||||
name = "%s_%s" % (ctx.attr.name, src.basename)
|
||||
file = ctx.actions.declare_file(name, sibling = src)
|
||||
ctx.actions.expand_template(
|
||||
output = file,
|
||||
template = src,
|
||||
for NCCL_OP, op in enumerate(ops):
|
||||
for NCCL_TYPE, dt in enumerate(types):
|
||||
substitutions = {
|
||||
"#define UNROLL 4": "#define UNROLL 4\n#define NCCL_OP %d" % ctx.attr.NCCL_OP,
|
||||
},
|
||||
)
|
||||
files.append(file)
|
||||
hdr_tail: hdr_tail + defines % (NCCL_OP, NCCL_TYPE),
|
||||
}
|
||||
for src in ctx.files.srcs:
|
||||
name = "%s_%s_%s" % (op, dt, src.basename)
|
||||
file = ctx.actions.declare_file(name, sibling = src)
|
||||
ctx.actions.expand_template(
|
||||
output = file,
|
||||
template = src,
|
||||
substitutions = substitutions,
|
||||
)
|
||||
files.append(file)
|
||||
return [DefaultInfo(files = depset(files))]
|
||||
|
||||
gen_device_srcs = rule(
|
||||
implementation = _gen_device_srcs_impl,
|
||||
attrs = {
|
||||
"srcs": attr.label_list(allow_files = True),
|
||||
"NCCL_OP": attr.int(),
|
||||
},
|
||||
)
|
||||
"""Adds prefix to each file name in srcs and adds #define NCCL_OP."""
|
||||
@ -110,10 +53,6 @@ def _rdc_copts():
|
||||
"-fcuda-rdc",
|
||||
"-Xcuda-ptxas",
|
||||
maxrregcount,
|
||||
# Work around for clang bug (fixed in r348662), declaring
|
||||
# '__device__ operator delete(void*, std::size_t)' non-inline.
|
||||
# TODO(csigg): Only add this option for older clang versions.
|
||||
"-std=gnu++11",
|
||||
],
|
||||
"//conditions:default": [],
|
||||
})
|
||||
@ -240,8 +179,7 @@ def _merge_archive_impl(ctx):
|
||||
ctx.actions.run_shell(
|
||||
inputs = ctx.files.srcs, # + ctx.files._crosstool,
|
||||
outputs = [ctx.outputs.out],
|
||||
command = ("printf \"%s\" " % mri_script +
|
||||
"| %s -M" % cc_toolchain.ar_executable),
|
||||
command = "printf \"%s\" | %s -M" % (mri_script, cc_toolchain.ar_executable),
|
||||
)
|
||||
|
||||
_merge_archive = rule(
|
||||
|
Loading…
Reference in New Issue
Block a user