Rolling forward CL 252574722 and 252855085 with a fix.
Update NCCL from 2.3.5 to 2.4.7. PiperOrigin-RevId: 253953922
This commit is contained in:
parent
8ec9355084
commit
77659d9c93
@ -192,6 +192,7 @@ tensorflow/third_party/nccl/LICENSE
|
|||||||
tensorflow/third_party/nccl/system.BUILD.tpl
|
tensorflow/third_party/nccl/system.BUILD.tpl
|
||||||
tensorflow/third_party/nccl/nccl_configure.bzl
|
tensorflow/third_party/nccl/nccl_configure.bzl
|
||||||
tensorflow/third_party/nccl/build_defs.bzl.tpl
|
tensorflow/third_party/nccl/build_defs.bzl.tpl
|
||||||
|
tensorflow/third_party/nccl/archive.patch
|
||||||
tensorflow/third_party/nccl/BUILD
|
tensorflow/third_party/nccl/BUILD
|
||||||
tensorflow/third_party/boringssl/BUILD
|
tensorflow/third_party/boringssl/BUILD
|
||||||
tensorflow/third_party/mpi/.gitignore
|
tensorflow/third_party/mpi/.gitignore
|
||||||
|
@ -602,11 +602,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
|
|||||||
tf_http_archive(
|
tf_http_archive(
|
||||||
name = "nccl_archive",
|
name = "nccl_archive",
|
||||||
build_file = clean_dep("//third_party:nccl/archive.BUILD"),
|
build_file = clean_dep("//third_party:nccl/archive.BUILD"),
|
||||||
sha256 = "19132b5127fa8e02d95a09795866923f04064c8f1e0770b2b42ab551408882a4",
|
patch_file = clean_dep("//third_party/nccl:archive.patch"),
|
||||||
strip_prefix = "nccl-f93fe9bfd94884cec2ba711897222e0df5569a53",
|
sha256 = "9a7633e224982e2b60fa6b397d895d20d6b7498e3e02f46f98a5a4e187c5a44c",
|
||||||
|
strip_prefix = "nccl-0ceaec9cee96ae7658aa45686853286651f36384",
|
||||||
urls = [
|
urls = [
|
||||||
"http://mirror.tensorflow.org/github.com/nvidia/nccl/archive/f93fe9bfd94884cec2ba711897222e0df5569a53.tar.gz",
|
"http://mirror.tensorflow.org/github.com/nvidia/nccl/archive/0ceaec9cee96ae7658aa45686853286651f36384.tar.gz",
|
||||||
"https://github.com/nvidia/nccl/archive/f93fe9bfd94884cec2ba711897222e0df5569a53.tar.gz",
|
"https://github.com/nvidia/nccl/archive/0ceaec9cee96ae7658aa45686853286651f36384.tar.gz",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
94
third_party/nccl/archive.BUILD
vendored
94
third_party/nccl/archive.BUILD
vendored
@ -9,99 +9,89 @@ load(
|
|||||||
"@local_config_nccl//:build_defs.bzl",
|
"@local_config_nccl//:build_defs.bzl",
|
||||||
"cuda_rdc_library",
|
"cuda_rdc_library",
|
||||||
"gen_device_srcs",
|
"gen_device_srcs",
|
||||||
"process_srcs",
|
|
||||||
)
|
)
|
||||||
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
|
|
||||||
|
|
||||||
cc_library(
|
cc_library(
|
||||||
name = "src_hdrs",
|
name = "src_hdrs",
|
||||||
hdrs = process_srcs([
|
hdrs = [
|
||||||
|
"src/collectives.h",
|
||||||
"src/collectives/collectives.h",
|
"src/collectives/collectives.h",
|
||||||
"src/nccl.h.in",
|
"src/nccl.h",
|
||||||
]),
|
],
|
||||||
|
strip_include_prefix = "src",
|
||||||
)
|
)
|
||||||
|
|
||||||
cc_library(
|
cc_library(
|
||||||
name = "include_hdrs",
|
name = "include_hdrs",
|
||||||
hdrs = process_srcs(glob(["src/include/*.h"])),
|
hdrs = glob(["src/include/*.h"]),
|
||||||
strip_include_prefix = "include",
|
strip_include_prefix = "src/include",
|
||||||
deps = ["@local_config_cuda//cuda:cuda_headers"],
|
deps = ["@local_config_cuda//cuda:cuda_headers"],
|
||||||
)
|
)
|
||||||
|
|
||||||
device_srcs = process_srcs([
|
cc_library(
|
||||||
"src/collectives/device/all_gather.cu",
|
name = "device_hdrs",
|
||||||
"src/collectives/device/all_reduce.cu",
|
hdrs = glob(["src/collectives/device/*.h"]),
|
||||||
"src/collectives/device/broadcast.cu",
|
strip_include_prefix = "src/collectives/device",
|
||||||
"src/collectives/device/reduce.cu",
|
)
|
||||||
"src/collectives/device/reduce_scatter.cu",
|
|
||||||
])
|
|
||||||
|
|
||||||
# NCCL compiles the same source files with different NCCL_OP defines. RDC
|
# NCCL compiles the same source files with different NCCL_OP/NCCL_TYPE defines.
|
||||||
# compilation requires that each compiled module has a unique ID. Clang derives
|
# RDC compilation requires that each compiled module has a unique ID. Clang
|
||||||
# the module ID from the path only so we need to rename the files to get
|
# derives the module ID from the path only so we need to copy the files to get
|
||||||
# different IDs for different parts of compilation. NVCC does not have that
|
# different IDs for different parts of compilation. NVCC does not have that
|
||||||
# problem because it generates IDs based on preprocessed content.
|
# problem because it generates IDs based on preprocessed content.
|
||||||
gen_device_srcs(
|
gen_device_srcs(
|
||||||
name = "sum",
|
name = "device_srcs",
|
||||||
srcs = device_srcs,
|
srcs = [
|
||||||
NCCL_OP = 0,
|
"src/collectives/device/all_gather.cu.cc",
|
||||||
)
|
"src/collectives/device/all_reduce.cu.cc",
|
||||||
|
"src/collectives/device/broadcast.cu.cc",
|
||||||
gen_device_srcs(
|
"src/collectives/device/reduce.cu.cc",
|
||||||
name = "prod",
|
"src/collectives/device/reduce_scatter.cu.cc",
|
||||||
srcs = device_srcs,
|
],
|
||||||
NCCL_OP = 1,
|
|
||||||
)
|
|
||||||
|
|
||||||
gen_device_srcs(
|
|
||||||
name = "min",
|
|
||||||
srcs = device_srcs,
|
|
||||||
NCCL_OP = 2,
|
|
||||||
)
|
|
||||||
|
|
||||||
gen_device_srcs(
|
|
||||||
name = "max",
|
|
||||||
srcs = device_srcs,
|
|
||||||
NCCL_OP = 3,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
cuda_rdc_library(
|
cuda_rdc_library(
|
||||||
name = "device",
|
name = "device",
|
||||||
srcs = [
|
srcs = [
|
||||||
":max",
|
"src/collectives/device/functions.cu.cc",
|
||||||
":min",
|
":device_srcs",
|
||||||
":prod",
|
] + glob([
|
||||||
":sum",
|
# Required for header inclusion checking, see below for details.
|
||||||
] + process_srcs(glob([
|
|
||||||
"src/collectives/device/*.h",
|
"src/collectives/device/*.h",
|
||||||
"src/collectives/device/functions.cu",
|
"src/nccl.h",
|
||||||
])),
|
]),
|
||||||
deps = [
|
deps = [
|
||||||
|
":device_hdrs",
|
||||||
":include_hdrs",
|
":include_hdrs",
|
||||||
":src_hdrs",
|
":src_hdrs",
|
||||||
|
"@local_config_cuda//cuda:cuda_headers",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
# Primary NCCL target.
|
# Primary NCCL target.
|
||||||
cc_library(
|
cc_library(
|
||||||
name = "nccl",
|
name = "nccl",
|
||||||
srcs = process_srcs(glob(
|
srcs = glob(
|
||||||
include = ["src/**/*.cu"],
|
include = ["src/**/*.cc"],
|
||||||
# Exclude device-library code.
|
# Exclude device-library code.
|
||||||
exclude = ["src/collectives/device/**"],
|
exclude = ["src/collectives/device/**"],
|
||||||
)) + [
|
) + [
|
||||||
# Required for header inclusion checking (see
|
# Required for header inclusion checking (see
|
||||||
# http://docs.bazel.build/versions/master/be/c-cpp.html#hdrs).
|
# http://docs.bazel.build/versions/master/be/c-cpp.html#hdrs).
|
||||||
"nccl.h",
|
# Files in src/ which #include "nccl.h" load it from there rather than
|
||||||
"collectives/collectives.h",
|
# from the virtual includes directory.
|
||||||
|
"src/collectives.h",
|
||||||
|
"src/collectives/collectives.h",
|
||||||
|
"src/nccl.h",
|
||||||
],
|
],
|
||||||
hdrs = ["nccl.h"],
|
hdrs = ["src/nccl.h"],
|
||||||
copts = cuda_default_copts() + ["-Wno-vla"],
|
|
||||||
include_prefix = "third_party/nccl",
|
include_prefix = "third_party/nccl",
|
||||||
|
strip_include_prefix = "src",
|
||||||
visibility = ["//visibility:public"],
|
visibility = ["//visibility:public"],
|
||||||
deps = [
|
deps = [
|
||||||
":device",
|
":device",
|
||||||
":include_hdrs",
|
":include_hdrs",
|
||||||
|
":src_hdrs",
|
||||||
"@local_config_cuda//cuda:cudart_static",
|
"@local_config_cuda//cuda:cudart_static",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
69
third_party/nccl/archive.patch
vendored
Normal file
69
third_party/nccl/archive.patch
vendored
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
diff --git a/src/collectives.h b/src/collectives.h
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000..7d04b16
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/src/collectives.h
|
||||||
|
@@ -0,0 +1 @@
|
||||||
|
+#include "collectives/collectives.h"
|
||||||
|
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu.cc
|
||||||
|
similarity index 100%
|
||||||
|
rename from src/collectives/device/all_gather.cu
|
||||||
|
rename to src/collectives/device/all_gather.cu.cc
|
||||||
|
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu.cc
|
||||||
|
similarity index 100%
|
||||||
|
rename from src/collectives/device/all_reduce.cu
|
||||||
|
rename to src/collectives/device/all_reduce.cu.cc
|
||||||
|
diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu.cc
|
||||||
|
similarity index 100%
|
||||||
|
rename from src/collectives/device/broadcast.cu
|
||||||
|
rename to src/collectives/device/broadcast.cu.cc
|
||||||
|
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
|
||||||
|
index 8c336bf..2eef3ae 100644
|
||||||
|
--- a/src/collectives/device/common.h
|
||||||
|
+++ b/src/collectives/device/common.h
|
||||||
|
@@ -7,7 +7,7 @@
|
||||||
|
#ifndef NCCL_DEVICE_COMMON_H_
|
||||||
|
#define NCCL_DEVICE_COMMON_H_
|
||||||
|
|
||||||
|
-#include "../collectives.h"
|
||||||
|
+#include "collectives.h"
|
||||||
|
#include "devcomm.h"
|
||||||
|
#include "nccl.h"
|
||||||
|
|
||||||
|
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu.cc
|
||||||
|
similarity index 100%
|
||||||
|
rename from src/collectives/device/functions.cu
|
||||||
|
rename to src/collectives/device/functions.cu.cc
|
||||||
|
diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu.cc
|
||||||
|
similarity index 100%
|
||||||
|
rename from src/collectives/device/reduce.cu
|
||||||
|
rename to src/collectives/device/reduce.cu.cc
|
||||||
|
diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu.cc
|
||||||
|
similarity index 100%
|
||||||
|
rename from src/collectives/device/reduce_scatter.cu
|
||||||
|
rename to src/collectives/device/reduce_scatter.cu.cc
|
||||||
|
diff --git a/src/nccl.h.in b/src/nccl.h
|
||||||
|
similarity index 98%
|
||||||
|
rename from src/nccl.h.in
|
||||||
|
rename to src/nccl.h
|
||||||
|
index 985274e..7ebb1e1 100644
|
||||||
|
--- a/src/nccl.h.in
|
||||||
|
+++ b/src/nccl.h
|
||||||
|
@@ -10,12 +10,12 @@
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
|
||||||
|
-#define NCCL_MAJOR ${nccl:Major}
|
||||||
|
-#define NCCL_MINOR ${nccl:Minor}
|
||||||
|
-#define NCCL_PATCH ${nccl:Patch}
|
||||||
|
-#define NCCL_SUFFIX "${nccl:Suffix}"
|
||||||
|
+#define NCCL_MAJOR 2
|
||||||
|
+#define NCCL_MINOR 4
|
||||||
|
+#define NCCL_PATCH 7
|
||||||
|
+#define NCCL_SUFFIX ""
|
||||||
|
|
||||||
|
-#define NCCL_VERSION_CODE ${nccl:Version}
|
||||||
|
+#define NCCL_VERSION_CODE 2407
|
||||||
|
#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
100
third_party/nccl/build_defs.bzl.tpl
vendored
100
third_party/nccl/build_defs.bzl.tpl
vendored
@ -3,90 +3,33 @@
|
|||||||
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
|
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
|
||||||
load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
|
load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
|
||||||
|
|
||||||
def _process_src_impl(ctx):
|
|
||||||
"""Applies various patches to the NCCL source."""
|
|
||||||
substitutions = {
|
|
||||||
"\"collectives.h": "\"collectives/collectives.h",
|
|
||||||
"\"../collectives.h": "\"collectives/collectives.h",
|
|
||||||
# Clang does not define __CUDACC_VER_*__, use CUDA_VERSION instead.
|
|
||||||
# TODO(csigg): Apply substitutions upstream and remove here.
|
|
||||||
"#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)": "#if CUDART_VERSION >= 9200",
|
|
||||||
"#if __CUDACC_VER_MAJOR__ >= 10": "#if CUDART_VERSION >= 10000",
|
|
||||||
"#if __CUDACC_VER_MAJOR__ >= 9": "#if CUDART_VERSION >= 9000",
|
|
||||||
"#if __CUDACC_VER_MAJOR__ < 9": "#if CUDART_VERSION < 9000",
|
|
||||||
"nullptr_t": "std::nullptr_t",
|
|
||||||
}
|
|
||||||
if ctx.file.src.basename == "nccl.h.in":
|
|
||||||
substitutions.update({
|
|
||||||
"${nccl:Major}": "2",
|
|
||||||
"${nccl:Minor}": "3",
|
|
||||||
"${nccl:Patch}": "5",
|
|
||||||
"${nccl:Suffix}": "",
|
|
||||||
"${nccl:Version}": "2305",
|
|
||||||
})
|
|
||||||
if ctx.file.src.basename == "function.cu":
|
|
||||||
substitutions.update({
|
|
||||||
# Don't try to initialize the host shadow copy of this device-side
|
|
||||||
# global variable. There is no host pointer to a device-side
|
|
||||||
# function, which confuses clang.
|
|
||||||
# TODO(csigg): remove when fixed in clang.
|
|
||||||
"NCCL_FUNCS2B(ncclBroadcast),": "#if __CUDA_ARCH__\nNCCL_FUNCS2B(ncclBroadcast),",
|
|
||||||
"NCCL_FUNCS2A(ncclAllReduce)": "NCCL_FUNCS2A(ncclAllReduce)\n#endif",
|
|
||||||
})
|
|
||||||
ctx.actions.expand_template(
|
|
||||||
output = ctx.outputs.out,
|
|
||||||
template = ctx.file.src,
|
|
||||||
substitutions = substitutions,
|
|
||||||
)
|
|
||||||
|
|
||||||
_process_src = rule(
|
|
||||||
implementation = _process_src_impl,
|
|
||||||
attrs = {
|
|
||||||
"src": attr.label(allow_single_file = True),
|
|
||||||
"out": attr.output(),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
"""Processes one NCCL source file so it can be compiled with bazel and clang."""
|
|
||||||
|
|
||||||
def _out(src):
|
|
||||||
if not src.startswith("src/"):
|
|
||||||
fail("Source file not under src/...:", src)
|
|
||||||
src = src[4:] # Strip 'src/'
|
|
||||||
if src == "nccl.h.in":
|
|
||||||
return "nccl.h"
|
|
||||||
if src.endswith(".cu"):
|
|
||||||
return src + ".cc"
|
|
||||||
return src
|
|
||||||
|
|
||||||
def process_srcs(srcs):
|
|
||||||
"""Processes files under src/ and copies them to the parent directory."""
|
|
||||||
[_process_src(
|
|
||||||
name = "_" + src,
|
|
||||||
src = src,
|
|
||||||
out = _out(src),
|
|
||||||
) for src in srcs]
|
|
||||||
return ["_" + src for src in srcs]
|
|
||||||
|
|
||||||
def _gen_device_srcs_impl(ctx):
|
def _gen_device_srcs_impl(ctx):
|
||||||
|
ops = ["sum", "prod", "min", "max"]
|
||||||
|
types = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]
|
||||||
|
hdr_tail = "****************************************/"
|
||||||
|
defines = "\n\n#define NCCL_OP %d\n#define NCCL_TYPE %d"
|
||||||
|
|
||||||
files = []
|
files = []
|
||||||
for src in ctx.files.srcs:
|
for NCCL_OP, op in enumerate(ops):
|
||||||
name = "%s_%s" % (ctx.attr.name, src.basename)
|
for NCCL_TYPE, dt in enumerate(types):
|
||||||
file = ctx.actions.declare_file(name, sibling = src)
|
|
||||||
ctx.actions.expand_template(
|
|
||||||
output = file,
|
|
||||||
template = src,
|
|
||||||
substitutions = {
|
substitutions = {
|
||||||
"#define UNROLL 4": "#define UNROLL 4\n#define NCCL_OP %d" % ctx.attr.NCCL_OP,
|
hdr_tail: hdr_tail + defines % (NCCL_OP, NCCL_TYPE),
|
||||||
},
|
}
|
||||||
)
|
for src in ctx.files.srcs:
|
||||||
files.append(file)
|
name = "%s_%s_%s" % (op, dt, src.basename)
|
||||||
|
file = ctx.actions.declare_file(name, sibling = src)
|
||||||
|
ctx.actions.expand_template(
|
||||||
|
output = file,
|
||||||
|
template = src,
|
||||||
|
substitutions = substitutions,
|
||||||
|
)
|
||||||
|
files.append(file)
|
||||||
return [DefaultInfo(files = depset(files))]
|
return [DefaultInfo(files = depset(files))]
|
||||||
|
|
||||||
gen_device_srcs = rule(
|
gen_device_srcs = rule(
|
||||||
implementation = _gen_device_srcs_impl,
|
implementation = _gen_device_srcs_impl,
|
||||||
attrs = {
|
attrs = {
|
||||||
"srcs": attr.label_list(allow_files = True),
|
"srcs": attr.label_list(allow_files = True),
|
||||||
"NCCL_OP": attr.int(),
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
"""Adds prefix to each file name in srcs and adds #define NCCL_OP."""
|
"""Copies each file in srcs once per (op, type) pair, prefixing the file name with '<op>_<type>_' and adding #define NCCL_OP and #define NCCL_TYPE."""
|
||||||
@ -110,10 +53,6 @@ def _rdc_copts():
|
|||||||
"-fcuda-rdc",
|
"-fcuda-rdc",
|
||||||
"-Xcuda-ptxas",
|
"-Xcuda-ptxas",
|
||||||
maxrregcount,
|
maxrregcount,
|
||||||
# Work around for clang bug (fixed in r348662), declaring
|
|
||||||
# '__device__ operator delete(void*, std::size_t)' non-inline.
|
|
||||||
# TODO(csigg): Only add this option for older clang versions.
|
|
||||||
"-std=gnu++11",
|
|
||||||
],
|
],
|
||||||
"//conditions:default": [],
|
"//conditions:default": [],
|
||||||
})
|
})
|
||||||
@ -240,8 +179,7 @@ def _merge_archive_impl(ctx):
|
|||||||
ctx.actions.run_shell(
|
ctx.actions.run_shell(
|
||||||
inputs = ctx.files.srcs, # + ctx.files._crosstool,
|
inputs = ctx.files.srcs, # + ctx.files._crosstool,
|
||||||
outputs = [ctx.outputs.out],
|
outputs = [ctx.outputs.out],
|
||||||
command = ("printf \"%s\" " % mri_script +
|
command = "printf \"%s\" | %s -M" % (mri_script, cc_toolchain.ar_executable),
|
||||||
"| %s -M" % cc_toolchain.ar_executable),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
_merge_archive = rule(
|
_merge_archive = rule(
|
||||||
|
Loading…
Reference in New Issue
Block a user