Improve build rules to compile NCCL from source, in particular for clang.
PiperOrigin-RevId: 225051897
This commit is contained in:
parent
e3d751c2a8
commit
d6a4685035
156
third_party/nccl/archive.BUILD
vendored
156
third_party/nccl/archive.BUILD
vendored
@ -1,157 +1,110 @@
|
|||||||
# NVIDIA NCCL 2
|
# NVIDIA NCCL 2
|
||||||
# A package of optimized primitives for collective multi-GPU communication.
|
# A package of optimized primitives for collective multi-GPU communication.
|
||||||
|
|
||||||
licenses(["restricted"])
|
licenses(["notice"])
|
||||||
|
|
||||||
exports_files(["LICENSE.txt"])
|
exports_files(["LICENSE.txt"])
|
||||||
|
|
||||||
load(
|
load(
|
||||||
"@local_config_nccl//:build_defs.bzl",
|
"@local_config_nccl//:build_defs.bzl",
|
||||||
"gen_nccl_h",
|
"cuda_rdc_library",
|
||||||
"nccl_library",
|
"gen_device_srcs",
|
||||||
"rdc_copts",
|
"process_srcs",
|
||||||
"rdc_library",
|
|
||||||
)
|
)
|
||||||
load(
|
load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cuda_library")
|
||||||
"@local_config_cuda//cuda:build_defs.bzl",
|
|
||||||
"cuda_default_copts",
|
process_srcs(
|
||||||
|
name = "process_srcs",
|
||||||
|
srcs = glob([
|
||||||
|
"**/*.cc",
|
||||||
|
"**/*.h",
|
||||||
|
]),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate the nccl.h header file.
|
cc_library(
|
||||||
gen_nccl_h(
|
|
||||||
name = "nccl_h",
|
|
||||||
output = "src/nccl.h",
|
|
||||||
template = "src/nccl.h.in",
|
|
||||||
)
|
|
||||||
|
|
||||||
nccl_library(
|
|
||||||
name = "src_hdrs",
|
name = "src_hdrs",
|
||||||
hdrs = [
|
hdrs = [
|
||||||
"src/nccl.h",
|
|
||||||
# src/include/common_coll.h #includes "collectives/collectives.h".
|
|
||||||
# All other #includes of collectives.h are patched in process_srcs.
|
|
||||||
"src/collectives/collectives.h",
|
"src/collectives/collectives.h",
|
||||||
|
"src/nccl.h",
|
||||||
],
|
],
|
||||||
|
data = [":process_srcs"],
|
||||||
strip_include_prefix = "src",
|
strip_include_prefix = "src",
|
||||||
)
|
)
|
||||||
|
|
||||||
nccl_library(
|
cc_library(
|
||||||
name = "include_hdrs",
|
name = "include_hdrs",
|
||||||
hdrs = glob(["src/include/*.h"]),
|
hdrs = glob(["src/include/*.h"]),
|
||||||
|
data = [":process_srcs"],
|
||||||
strip_include_prefix = "src/include",
|
strip_include_prefix = "src/include",
|
||||||
)
|
)
|
||||||
|
|
||||||
filegroup(
|
cc_library(
|
||||||
name = "device_hdrs",
|
name = "device_hdrs",
|
||||||
srcs = glob(["src/collectives/device/*.h"]),
|
hdrs = glob(["src/collectives/device/*.h"]),
|
||||||
|
strip_include_prefix = "src/collectives/device",
|
||||||
)
|
)
|
||||||
|
|
||||||
filegroup(
|
filegroup(
|
||||||
name = "device_srcs",
|
name = "device_srcs",
|
||||||
srcs = [
|
srcs = [
|
||||||
"src/collectives/device/all_gather.cu",
|
"src/collectives/device/all_gather.cu.cc",
|
||||||
"src/collectives/device/all_reduce.cu",
|
"src/collectives/device/all_reduce.cu.cc",
|
||||||
"src/collectives/device/broadcast.cu",
|
"src/collectives/device/broadcast.cu.cc",
|
||||||
"src/collectives/device/reduce.cu",
|
"src/collectives/device/reduce.cu.cc",
|
||||||
"src/collectives/device/reduce_scatter.cu",
|
"src/collectives/device/reduce_scatter.cu.cc",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
nccl_library(
|
# NCCL compiles the same source files with different NCCL_OP defines. RDC
|
||||||
|
# compilation requires that each compiled module has a unique ID. Clang derives
|
||||||
|
# the module ID from the path only so we need to rename the files to get
|
||||||
|
# different IDs for different parts of compilation. NVCC does not have that
|
||||||
|
# problem because it generates IDs based on preprocessed content.
|
||||||
|
gen_device_srcs(
|
||||||
name = "sum",
|
name = "sum",
|
||||||
srcs = [
|
srcs = [":device_srcs"],
|
||||||
":device_hdrs",
|
NCCL_OP = 0,
|
||||||
":device_srcs",
|
|
||||||
],
|
|
||||||
copts = ["-DNCCL_OP=0"] + rdc_copts(),
|
|
||||||
linkstatic = True,
|
|
||||||
prefix = "sum_",
|
|
||||||
deps = [
|
|
||||||
":include_hdrs",
|
|
||||||
":src_hdrs",
|
|
||||||
"@local_config_cuda//cuda:cuda_headers",
|
|
||||||
],
|
|
||||||
)
|
)
|
||||||
|
|
||||||
nccl_library(
|
gen_device_srcs(
|
||||||
name = "prod",
|
name = "prod",
|
||||||
srcs = [
|
srcs = [":device_srcs"],
|
||||||
":device_hdrs",
|
NCCL_OP = 1,
|
||||||
":device_srcs",
|
|
||||||
],
|
|
||||||
copts = ["-DNCCL_OP=1"] + rdc_copts(),
|
|
||||||
linkstatic = True,
|
|
||||||
prefix = "_prod",
|
|
||||||
deps = [
|
|
||||||
":include_hdrs",
|
|
||||||
":src_hdrs",
|
|
||||||
"@local_config_cuda//cuda:cuda_headers",
|
|
||||||
],
|
|
||||||
)
|
)
|
||||||
|
|
||||||
nccl_library(
|
gen_device_srcs(
|
||||||
name = "min",
|
name = "min",
|
||||||
srcs = [
|
srcs = [":device_srcs"],
|
||||||
":device_hdrs",
|
NCCL_OP = 2,
|
||||||
":device_srcs",
|
|
||||||
],
|
|
||||||
copts = ["-DNCCL_OP=2"] + rdc_copts(),
|
|
||||||
linkstatic = True,
|
|
||||||
prefix = "min_",
|
|
||||||
deps = [
|
|
||||||
":include_hdrs",
|
|
||||||
":src_hdrs",
|
|
||||||
"@local_config_cuda//cuda:cuda_headers",
|
|
||||||
],
|
|
||||||
)
|
)
|
||||||
|
|
||||||
nccl_library(
|
gen_device_srcs(
|
||||||
name = "max",
|
name = "max",
|
||||||
srcs = [
|
srcs = [":device_srcs"],
|
||||||
":device_hdrs",
|
NCCL_OP = 3,
|
||||||
":device_srcs",
|
|
||||||
],
|
|
||||||
copts = ["-DNCCL_OP=3"] + rdc_copts(),
|
|
||||||
linkstatic = True,
|
|
||||||
prefix = "max_",
|
|
||||||
deps = [
|
|
||||||
":include_hdrs",
|
|
||||||
":src_hdrs",
|
|
||||||
"@local_config_cuda//cuda:cuda_headers",
|
|
||||||
],
|
|
||||||
)
|
)
|
||||||
|
|
||||||
nccl_library(
|
cuda_rdc_library(
|
||||||
name = "functions",
|
name = "device",
|
||||||
srcs = [
|
srcs = [
|
||||||
"src/collectives/device/functions.cu",
|
"src/collectives/device/functions.cu.cc",
|
||||||
":device_hdrs",
|
|
||||||
],
|
|
||||||
copts = rdc_copts(),
|
|
||||||
linkstatic = True,
|
|
||||||
deps = [
|
|
||||||
":include_hdrs",
|
|
||||||
":src_hdrs",
|
|
||||||
"@local_config_cuda//cuda:cuda_headers",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
rdc_library(
|
|
||||||
name = "device_code",
|
|
||||||
deps = [
|
|
||||||
":functions",
|
|
||||||
":max",
|
":max",
|
||||||
":min",
|
":min",
|
||||||
":prod",
|
":prod",
|
||||||
":sum",
|
":sum",
|
||||||
],
|
],
|
||||||
|
deps = [
|
||||||
|
":device_hdrs",
|
||||||
|
":include_hdrs",
|
||||||
|
":src_hdrs",
|
||||||
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
# Primary NCCL target.
|
# Primary NCCL target.
|
||||||
nccl_library(
|
tf_cuda_library(
|
||||||
name = "nccl",
|
name = "nccl",
|
||||||
srcs = glob(
|
srcs = glob(
|
||||||
include = ["src/**/*.cu"],
|
include = ["src/**/*.cu.cc"],
|
||||||
# Exclude device-library code.
|
# Exclude device-library code.
|
||||||
exclude = ["src/collectives/device/**"],
|
exclude = ["src/collectives/device/**"],
|
||||||
) + [
|
) + [
|
||||||
@ -162,13 +115,14 @@ nccl_library(
|
|||||||
"src/nccl.h",
|
"src/nccl.h",
|
||||||
],
|
],
|
||||||
hdrs = ["src/nccl.h"],
|
hdrs = ["src/nccl.h"],
|
||||||
copts = cuda_default_copts(),
|
copts = ["-Wno-vla"],
|
||||||
include_prefix = "third_party/nccl",
|
include_prefix = "third_party/nccl",
|
||||||
strip_include_prefix = "src",
|
strip_include_prefix = "src",
|
||||||
visibility = ["//visibility:public"],
|
visibility = ["//visibility:public"],
|
||||||
deps = [
|
deps = [
|
||||||
":device_code",
|
":device",
|
||||||
":include_hdrs",
|
":include_hdrs",
|
||||||
":src_hdrs",
|
":src_hdrs",
|
||||||
|
"@local_config_cuda//cuda:cudart_static",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
475
third_party/nccl/build_defs.bzl.tpl
vendored
475
third_party/nccl/build_defs.bzl.tpl
vendored
@ -1,87 +1,86 @@
|
|||||||
"""Repository rule for NCCL."""
|
"""Repository rule for NCCL."""
|
||||||
|
|
||||||
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
|
load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cuda_library")
|
||||||
|
load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
|
||||||
def _gen_nccl_h_impl(ctx):
|
|
||||||
"""Creates nccl.h from a template."""
|
|
||||||
ctx.actions.expand_template(
|
|
||||||
output = ctx.outputs.output,
|
|
||||||
template = ctx.file.template,
|
|
||||||
substitutions = {
|
|
||||||
"${nccl:Major}": "2",
|
|
||||||
"${nccl:Minor}": "3",
|
|
||||||
"${nccl:Patch}": "5",
|
|
||||||
"${nccl:Suffix}": "",
|
|
||||||
"${nccl:Version}": "2305",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
gen_nccl_h = rule(
|
|
||||||
implementation = _gen_nccl_h_impl,
|
|
||||||
attrs = {
|
|
||||||
"template": attr.label(allow_single_file = True),
|
|
||||||
"output": attr.output(),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
"""Creates the NCCL header file."""
|
|
||||||
|
|
||||||
def _process_srcs_impl(ctx):
|
def _process_srcs_impl(ctx):
|
||||||
"""Appends .cc to .cu files, patches include directives."""
|
"""Appends .cc to .cu files, patches include directives."""
|
||||||
files = []
|
files = []
|
||||||
for src in ctx.files.srcs:
|
for src in ctx.files.srcs:
|
||||||
if not src.is_source:
|
substitutions = {
|
||||||
# Process only once, specifically "src/nccl.h".
|
"\"collectives.h": "\"collectives/collectives.h",
|
||||||
files.append(src)
|
"\"../collectives.h": "\"collectives/collectives.h",
|
||||||
continue
|
# Clang does not define __CUDACC_VER_*__, use CUDA_VERSION instead.
|
||||||
|
# TODO(csigg): Apply substitutions upstream and remove here.
|
||||||
|
"#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)": "#if CUDA_VERSION >= 9200",
|
||||||
|
"#if __CUDACC_VER_MAJOR__ >= 10": "#if CUDA_VERSION >= 10000",
|
||||||
|
"#if __CUDACC_VER_MAJOR__ >= 9": "#if CUDA_VERSION >= 9000",
|
||||||
|
"#if __CUDACC_VER_MAJOR__ < 9": "#if CUDA_VERSION < 9000",
|
||||||
|
"nullptr_t": "std::nullptr_t",
|
||||||
|
}
|
||||||
name = src.basename
|
name = src.basename
|
||||||
|
if name == "nccl.in.h":
|
||||||
|
name = "nccl.h"
|
||||||
|
substitutions.update({
|
||||||
|
"${nccl:Major}": "2",
|
||||||
|
"${nccl:Minor}": "3",
|
||||||
|
"${nccl:Patch}": "5",
|
||||||
|
"${nccl:Suffix}": "",
|
||||||
|
"${nccl:Version}": "2305",
|
||||||
|
})
|
||||||
|
if name == "functions.cu":
|
||||||
|
# Don't try to initialize the host shadow copy of this device-side
|
||||||
|
# global variable. There is no host pointer to a device-side
|
||||||
|
# function, which confuses clang.
|
||||||
|
# TODO(csigg): remove when fixed in clang.
|
||||||
|
substitutions.update({
|
||||||
|
"NCCL_FUNCS2B(ncclBroadcast),": "#if __CUDA_ARCH__\nNCCL_FUNCS2B(ncclBroadcast),",
|
||||||
|
"NCCL_FUNCS2A(ncclAllReduce)": "NCCL_FUNCS2A(ncclAllReduce)\n#endif",
|
||||||
|
})
|
||||||
if src.extension == "cu":
|
if src.extension == "cu":
|
||||||
name = ctx.attr.prefix + name + ".cc"
|
name += ".cc"
|
||||||
|
file = ctx.actions.declare_file(name, sibling = src)
|
||||||
|
ctx.actions.expand_template(
|
||||||
|
output = file,
|
||||||
|
template = src,
|
||||||
|
substitutions = substitutions,
|
||||||
|
)
|
||||||
|
files.append(file)
|
||||||
|
return [DefaultInfo(files = depset(files))]
|
||||||
|
|
||||||
|
process_srcs = rule(
|
||||||
|
implementation = _process_srcs_impl,
|
||||||
|
attrs = {
|
||||||
|
"srcs": attr.label_list(allow_files = True),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
"""Processes the NCCL srcs so they can be compiled with bazel and clang."""
|
||||||
|
|
||||||
|
def _gen_device_srcs_impl(ctx):
|
||||||
|
files = []
|
||||||
|
for src in ctx.files.srcs:
|
||||||
|
name = "%s_%s" % (ctx.attr.name, src.basename)
|
||||||
file = ctx.actions.declare_file(name, sibling = src)
|
file = ctx.actions.declare_file(name, sibling = src)
|
||||||
ctx.actions.expand_template(
|
ctx.actions.expand_template(
|
||||||
output = file,
|
output = file,
|
||||||
template = src,
|
template = src,
|
||||||
substitutions = {
|
substitutions = {
|
||||||
"\"collectives.h": "\"collectives/collectives.h",
|
"#define UNROLL 4": "#define UNROLL 4\n#define NCCL_OP %d" % ctx.attr.NCCL_OP,
|
||||||
"\"../collectives.h": "\"collectives/collectives.h",
|
|
||||||
"#if __CUDACC_VER_MAJOR__": "#if defined __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__",
|
|
||||||
# Substitutions are applied in order.
|
|
||||||
"std::nullptr_t": "nullptr_t",
|
|
||||||
"nullptr_t": "std::nullptr_t",
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
files.append(file)
|
files.append(file)
|
||||||
return [DefaultInfo(files = depset(files))]
|
return [DefaultInfo(files = depset(files))]
|
||||||
|
|
||||||
_process_srcs = rule(
|
gen_device_srcs = rule(
|
||||||
implementation = _process_srcs_impl,
|
implementation = _gen_device_srcs_impl,
|
||||||
attrs = {
|
attrs = {
|
||||||
"srcs": attr.label_list(allow_files = True),
|
"srcs": attr.label_list(allow_files = True),
|
||||||
"prefix": attr.string(default = ""),
|
"NCCL_OP": attr.int(),
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
"""Processes the NCCL srcs so they can be compiled with bazel and clang."""
|
"""Adds prefix to each file name in srcs and adds #define NCCL_OP."""
|
||||||
|
|
||||||
def nccl_library(name, srcs = None, hdrs = None, prefix = None, **kwargs):
|
def _rdc_copts():
|
||||||
"""Processes the srcs and hdrs and creates a cc_library."""
|
|
||||||
|
|
||||||
_process_srcs(
|
|
||||||
name = name + "_srcs",
|
|
||||||
srcs = srcs,
|
|
||||||
prefix = prefix,
|
|
||||||
)
|
|
||||||
_process_srcs(
|
|
||||||
name = name + "_hdrs",
|
|
||||||
srcs = hdrs,
|
|
||||||
)
|
|
||||||
|
|
||||||
native.cc_library(
|
|
||||||
name = name,
|
|
||||||
srcs = [name + "_srcs"] if srcs else [],
|
|
||||||
hdrs = [name + "_hdrs"] if hdrs else [],
|
|
||||||
**kwargs
|
|
||||||
)
|
|
||||||
|
|
||||||
def rdc_copts():
|
|
||||||
"""Returns copts for compiling relocatable device code."""
|
"""Returns copts for compiling relocatable device code."""
|
||||||
|
|
||||||
# The global functions can not have a lower register count than the
|
# The global functions can not have a lower register count than the
|
||||||
@ -89,7 +88,7 @@ def rdc_copts():
|
|||||||
# https://github.com/NVIDIA/nccl/blob/f93fe9bfd94884cec2ba711897222e0df5569a53/makefiles/common.mk#L48
|
# https://github.com/NVIDIA/nccl/blob/f93fe9bfd94884cec2ba711897222e0df5569a53/makefiles/common.mk#L48
|
||||||
maxrregcount = "-maxrregcount=96"
|
maxrregcount = "-maxrregcount=96"
|
||||||
|
|
||||||
return cuda_default_copts() + select({
|
return select({
|
||||||
"@local_config_cuda//cuda:using_nvcc": [
|
"@local_config_cuda//cuda:using_nvcc": [
|
||||||
"-nvcc_options",
|
"-nvcc_options",
|
||||||
"relocatable-device-code=true",
|
"relocatable-device-code=true",
|
||||||
@ -100,118 +99,255 @@ def rdc_copts():
|
|||||||
"-fcuda-rdc",
|
"-fcuda-rdc",
|
||||||
"-Xcuda-ptxas",
|
"-Xcuda-ptxas",
|
||||||
maxrregcount,
|
maxrregcount,
|
||||||
|
# Work around for clang bug (fixed in r348662), declaring
|
||||||
|
# '__device__ operator delete(void*, std::size_t)' non-inline.
|
||||||
|
# TODO(csigg): Only add this option for older clang versions.
|
||||||
|
"-std=gnu++11",
|
||||||
],
|
],
|
||||||
"//conditions:default": [],
|
"//conditions:default": [],
|
||||||
}) + ["-fvisibility=hidden"]
|
})
|
||||||
|
|
||||||
def _filter_impl(ctx):
|
def _lookup_file(filegroup, path):
|
||||||
suffix = ctx.attr.suffix
|
"""Extracts file at (relative) path in filegroup."""
|
||||||
files = [src for src in ctx.files.srcs if src.path.endswith(suffix)]
|
for file in filegroup.files:
|
||||||
return [DefaultInfo(files = depset(files))]
|
if file.path.endswith(path):
|
||||||
|
return file
|
||||||
|
return None
|
||||||
|
|
||||||
_filter = rule(
|
def _pic_only(files):
|
||||||
implementation = _filter_impl,
|
"""Returns the PIC files if there are any in 'files', otherwise 'files'."""
|
||||||
attrs = {
|
pic_only = [f for f in files if f.basename.find(".pic.") >= 0]
|
||||||
"srcs": attr.label_list(allow_files = True),
|
return pic_only if pic_only else files
|
||||||
"suffix": attr.string(),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
"""Filters the srcs to the ones ending with suffix."""
|
|
||||||
|
|
||||||
def _gen_link_src_impl(ctx):
|
def _device_link_impl(ctx):
|
||||||
ctx.actions.expand_template(
|
if not ctx.attr.gpu_archs:
|
||||||
output = ctx.outputs.output,
|
fail("No GPU architecture specified. NCCL requires --config=cuda or similar.")
|
||||||
template = ctx.file.template,
|
|
||||||
substitutions = {
|
|
||||||
"REGISTERLINKBINARYFILE": '"%s"' % ctx.file.register_hdr.short_path,
|
|
||||||
"FATBINFILE": '"%s"' % ctx.file.fatbin_hdr.short_path,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
_gen_link_src = rule(
|
inputs = []
|
||||||
implementation = _gen_link_src_impl,
|
for dep in ctx.attr.deps:
|
||||||
attrs = {
|
inputs += dep.files.to_list()
|
||||||
"register_hdr": attr.label(allow_single_file = True),
|
inputs = _pic_only(inputs)
|
||||||
"fatbin_hdr": attr.label(allow_single_file = True),
|
|
||||||
"template": attr.label(allow_single_file = True),
|
|
||||||
"output": attr.output(),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
"""Patches the include directives for the link.stub file."""
|
|
||||||
|
|
||||||
def rdc_library(name, deps):
|
|
||||||
"""Produces a cc_library from deps containing relocatable device code."""
|
|
||||||
|
|
||||||
# From .a and .pic.a archives, just use the latter. Otherwise we get
|
|
||||||
# multiply defined symbols.
|
|
||||||
# TODO(csigg): C++ Sandwich once available should allow passing this target
|
|
||||||
# to a cc_library dependency, which would avoid the linking order issue.
|
|
||||||
_filter(
|
|
||||||
name = name + "_deps_a",
|
|
||||||
srcs = deps,
|
|
||||||
suffix = ".pic.a",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Device-link to cubins for each architecture.
|
# Device-link to cubins for each architecture.
|
||||||
images = []
|
name = ctx.attr.name
|
||||||
|
register_h = None
|
||||||
cubins = []
|
cubins = []
|
||||||
for arch in %{gpu_architectures}:
|
images = []
|
||||||
cubin = "%s_%s.cubin" % (name, arch)
|
for arch in ctx.attr.gpu_archs:
|
||||||
register_hdr = "%s_%s.h" % (name, arch)
|
cubin = ctx.actions.declare_file("%s_%s.cubin" % (name, arch))
|
||||||
nvlink = "@local_config_nccl//:nvlink"
|
register_h = ctx.actions.declare_file("%s_register_%s.h" % (name, arch))
|
||||||
cmd = ("$(location %s) " % nvlink +
|
ctx.actions.run(
|
||||||
select({
|
outputs = [register_h, cubin],
|
||||||
# NCCL is only supported on Linux.
|
inputs = inputs,
|
||||||
"@org_tensorflow//tensorflow:linux_x86_64": "--cpu-arch=X86_64 ",
|
executable = ctx.file._nvlink,
|
||||||
"@org_tensorflow//tensorflow:linux_ppc64le": "--cpu-arch=PPC64LE ",
|
arguments = ctx.attr.nvlink_args + [
|
||||||
"//conditions:default": "",
|
"--arch=%s" % arch,
|
||||||
}) +
|
"--register-link-binaries=%s" % register_h.path,
|
||||||
"--arch=%s $(SRCS) " % arch +
|
"--output-file=%s" % cubin.path,
|
||||||
"--register-link-binaries=$(location %s) " % register_hdr +
|
] + [file.path for file in inputs],
|
||||||
"--output-file=$(location %s)" % cubin)
|
mnemonic = "nvlink",
|
||||||
native.genrule(
|
|
||||||
name = "%s_%s" % (name, arch),
|
|
||||||
outs = [register_hdr, cubin],
|
|
||||||
srcs = [name + "_deps_a"],
|
|
||||||
cmd = cmd,
|
|
||||||
tools = [nvlink],
|
|
||||||
)
|
)
|
||||||
images.append("--image=profile=%s,file=$(location %s)" % (arch, cubin))
|
|
||||||
cubins.append(cubin)
|
cubins.append(cubin)
|
||||||
|
images.append("--image=profile=%s,file=%s" % (arch, cubin.path))
|
||||||
|
|
||||||
# Generate fatbin header from all cubins.
|
# Generate fatbin header from all cubins.
|
||||||
fatbin_hdr = name + ".fatbin.h"
|
tmp_fatbin = ctx.actions.declare_file("%s.fatbin" % name)
|
||||||
fatbinary = "@local_config_nccl//:cuda/bin/fatbinary"
|
fatbin_h = ctx.actions.declare_file("%s_fatbin.h" % name)
|
||||||
bin2c = "@local_config_nccl//:cuda/bin/bin2c"
|
bin2c = ctx.file._bin2c
|
||||||
cmd = ("$(location %s) -64 --cmdline=--compile-only " % fatbinary +
|
ctx.actions.run(
|
||||||
"--link --bin2c-path $$(dirname $(location %s)) " % bin2c +
|
outputs = [tmp_fatbin, fatbin_h],
|
||||||
"--compress-all %s --create=%%{name}.fatbin " % " ".join(images) +
|
inputs = cubins,
|
||||||
"--embedded-fatbin=$@")
|
executable = ctx.file._fatbinary,
|
||||||
native.genrule(
|
arguments = [
|
||||||
name = name + "_fatbin_h",
|
"-64",
|
||||||
outs = [fatbin_hdr],
|
"--cmdline=--compile-only",
|
||||||
srcs = cubins,
|
"--link",
|
||||||
cmd = cmd,
|
"--compress-all",
|
||||||
tools = [fatbinary, bin2c],
|
"--bin2c-path=%s" % bin2c.dirname,
|
||||||
|
"--create=%s" % tmp_fatbin.path,
|
||||||
|
"--embedded-fatbin=%s" % fatbin_h.path,
|
||||||
|
] + images,
|
||||||
|
tools = [bin2c],
|
||||||
|
mnemonic = "fatbinary",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate the source file #including the headers generated above.
|
# Generate the source file #including the headers generated above.
|
||||||
_gen_link_src(
|
ctx.actions.expand_template(
|
||||||
name = name + "_dlink_src",
|
output = ctx.outputs.out,
|
||||||
# Include just the last one, they are equivalent.
|
template = ctx.file._link_stub,
|
||||||
register_hdr = register_hdr,
|
substitutions = {
|
||||||
fatbin_hdr = fatbin_hdr,
|
"REGISTERLINKBINARYFILE": '"%s"' % register_h.short_path,
|
||||||
template = "@local_config_nccl//:cuda/bin/crt/link.stub",
|
"FATBINFILE": '"%s"' % fatbin_h.short_path,
|
||||||
output = name + ".cc",
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Compile the source file into the cc_library.
|
return [DefaultInfo(files = depset([register_h, fatbin_h]))]
|
||||||
|
|
||||||
|
_device_link = rule(
|
||||||
|
implementation = _device_link_impl,
|
||||||
|
attrs = {
|
||||||
|
"deps": attr.label_list(),
|
||||||
|
"out": attr.output(mandatory = True),
|
||||||
|
"gpu_archs": attr.string_list(),
|
||||||
|
"nvlink_args": attr.string_list(),
|
||||||
|
"_nvlink": attr.label(
|
||||||
|
default = Label("@local_config_nccl//:nvlink"),
|
||||||
|
allow_single_file = True,
|
||||||
|
executable = True,
|
||||||
|
cfg = "host",
|
||||||
|
),
|
||||||
|
"_fatbinary": attr.label(
|
||||||
|
default = Label("@local_config_nccl//:cuda/bin/fatbinary"),
|
||||||
|
allow_single_file = True,
|
||||||
|
executable = True,
|
||||||
|
cfg = "host",
|
||||||
|
),
|
||||||
|
"_bin2c": attr.label(
|
||||||
|
default = Label("@local_config_nccl//:cuda/bin/bin2c"),
|
||||||
|
allow_single_file = True,
|
||||||
|
executable = True,
|
||||||
|
cfg = "host",
|
||||||
|
),
|
||||||
|
"_link_stub": attr.label(
|
||||||
|
default = Label("@local_config_nccl//:cuda/bin/crt/link.stub"),
|
||||||
|
allow_single_file = True,
|
||||||
|
),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
"""Links device code and generates source code for kernel registration."""
|
||||||
|
|
||||||
|
def _merge_archive_impl(ctx):
|
||||||
|
# Generate an mri script to the merge archives in srcs and pass it to 'ar'.
|
||||||
|
# See https://stackoverflow.com/a/23621751.
|
||||||
|
files = _pic_only(ctx.files.srcs)
|
||||||
|
mri_script = "create " + ctx.outputs.out.path
|
||||||
|
for f in files:
|
||||||
|
mri_script += "\\naddlib " + f.path
|
||||||
|
mri_script += "\\nsave\\nend"
|
||||||
|
|
||||||
|
cc_toolchain = find_cpp_toolchain(ctx)
|
||||||
|
ctx.actions.run_shell(
|
||||||
|
inputs = ctx.files.srcs, # + ctx.files._crosstool,
|
||||||
|
outputs = [ctx.outputs.out],
|
||||||
|
command = ("printf \"%s\" " % mri_script +
|
||||||
|
"| %s -M" % cc_toolchain.ar_executable),
|
||||||
|
)
|
||||||
|
|
||||||
|
_merge_archive = rule(
|
||||||
|
implementation = _merge_archive_impl,
|
||||||
|
attrs = {
|
||||||
|
"srcs": attr.label_list(mandatory = True, allow_files = True),
|
||||||
|
"_cc_toolchain": attr.label(default = "@bazel_tools//tools/cpp:current_cc_toolchain"),
|
||||||
|
# "_crosstool": attr.label_list(cfg = "host", default = ["@bazel_tools//tools/cpp:crosstool"]),
|
||||||
|
},
|
||||||
|
outputs = {"out": "lib%{name}.a"},
|
||||||
|
)
|
||||||
|
"""Merges srcs into a single archive."""
|
||||||
|
|
||||||
|
def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwargs):
|
||||||
|
"""Produces a cuda_library using separate compilation and linking.
|
||||||
|
|
||||||
|
CUDA separate compilation and linking allows device function calls across
|
||||||
|
translation units. This is different from the normal whole program
|
||||||
|
compilation where each translation unit contains all device code. For more
|
||||||
|
background, see
|
||||||
|
https://devblogs.nvidia.com/separate-compilation-linking-cuda-device-code/,
|
||||||
|
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-options-for-separate-compilation
|
||||||
|
|
||||||
|
During separate compilation, the different CUDA source files are compiled
|
||||||
|
to 'relocatable device code' (RDC) and embedded in the host object files.
|
||||||
|
When using nvcc, linking the device code for each supported GPU
|
||||||
|
architecture and generating kernel registration code for the CUDA runtime
|
||||||
|
is handled automatically. Clang supports generating relocatable device
|
||||||
|
code, but it can't link it. We therefore rely on tools provided by the CUDA
|
||||||
|
SDK to link the device code and generate the host code to register the
|
||||||
|
kernels.
|
||||||
|
|
||||||
|
The nvlink tool extracts the RDC code from the object files and links it
|
||||||
|
into cubin files, one per GPU architecture. It also produces a header file
|
||||||
|
with a list of kernel names to register. The cubins are merged into a
|
||||||
|
binary blob using the fatbinary tool, and converted to a C header file with
|
||||||
|
the help of the bin2c tool. The registration header file, the fatbinary
|
||||||
|
header file, and the link.stub file (shipped with the CUDA SDK) are
|
||||||
|
compiled as ordinary host code.
|
||||||
|
|
||||||
|
Here is a diagram of the CUDA separate compilation trajectory:
|
||||||
|
|
||||||
|
x.cu.cc y.cu.cc
|
||||||
|
\ / cc_library (compile RDC and archive)
|
||||||
|
xy.a
|
||||||
|
/ \ * nvlink
|
||||||
|
register.h xy.cubin
|
||||||
|
: | * fatbinary and bin2c
|
||||||
|
: xy.fatbin.h
|
||||||
|
: : * #include
|
||||||
|
dlink.cc * Expanded from crt/dlink.stub template
|
||||||
|
| cc_library (host compile and archive)
|
||||||
|
dlink.a
|
||||||
|
|
||||||
|
The steps marked with '*' are implemented in the _device_link rule.
|
||||||
|
|
||||||
|
The object files in both xy.a and dlink.a reference symbols defined in the
|
||||||
|
other archive. The separate archives are a side effect of using two
|
||||||
|
cc_library targets to implement a single compilation trajectory. We could
|
||||||
|
fix this once bazel supports C++ sandwich. For now, we just merge the two
|
||||||
|
archives to avoid unresolved symbols:
|
||||||
|
|
||||||
|
xy.a dlink.a
|
||||||
|
\ / merge archive
|
||||||
|
xy_dlink.a
|
||||||
|
| cc_library (or alternatively, cc_import)
|
||||||
|
final target
|
||||||
|
|
||||||
|
Another complication is that cc_library produces (depending on the
|
||||||
|
configuration) both PIC and non-PIC archives, but the distinction
|
||||||
|
is hidden from Starlark until C++ sandwich becomes available. We work
|
||||||
|
around this by dropping the non-PIC files if PIC files are available.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name: Target name.
|
||||||
|
hdrs: Header files.
|
||||||
|
copts: Compiler options.
|
||||||
|
linkstatic: Must be true.
|
||||||
|
**kwargs: Any other arguments.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if not hdrs:
|
||||||
|
hdrs = []
|
||||||
|
if not copts:
|
||||||
|
copts = []
|
||||||
|
|
||||||
|
# Compile host and device code into library.
|
||||||
|
lib = name + "_lib"
|
||||||
|
tf_cuda_library(
|
||||||
|
name = lib,
|
||||||
|
hdrs = hdrs,
|
||||||
|
copts = _rdc_copts() + copts,
|
||||||
|
linkstatic = linkstatic,
|
||||||
|
**kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate source file containing linked device code.
|
||||||
|
dlink_hdrs = name + "_dlink_hdrs"
|
||||||
|
dlink_cc = name + "_dlink.cc"
|
||||||
|
_device_link(
|
||||||
|
name = dlink_hdrs,
|
||||||
|
deps = [lib],
|
||||||
|
out = dlink_cc,
|
||||||
|
gpu_archs = %{gpu_architectures},
|
||||||
|
nvlink_args = select({
|
||||||
|
"@org_tensorflow//tensorflow:linux_x86_64": ["--cpu-arch=X86_64"],
|
||||||
|
"@org_tensorflow//tensorflow:linux_ppc64le": ["--cpu-arch=PPC64LE"],
|
||||||
|
"//conditions:default": [],
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Compile the source file into a library.
|
||||||
|
dlink = name + "_dlink"
|
||||||
native.cc_library(
|
native.cc_library(
|
||||||
name = name + "_dlink_a",
|
name = dlink,
|
||||||
srcs = [
|
srcs = [dlink_cc],
|
||||||
name + "_dlink_src",
|
textual_hdrs = [dlink_hdrs],
|
||||||
],
|
|
||||||
textual_hdrs = [register_hdr, fatbin_hdr],
|
|
||||||
deps = [
|
deps = [
|
||||||
"@local_config_cuda//cuda:cuda_headers",
|
"@local_config_cuda//cuda:cuda_headers",
|
||||||
],
|
],
|
||||||
@ -222,31 +358,22 @@ def rdc_library(name, deps):
|
|||||||
"__NV_EXTRA_INITIALIZATION=",
|
"__NV_EXTRA_INITIALIZATION=",
|
||||||
"__NV_EXTRA_FINALIZATION=",
|
"__NV_EXTRA_FINALIZATION=",
|
||||||
],
|
],
|
||||||
linkstatic = True,
|
linkstatic = linkstatic,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Repackage deps into a single archive. This avoid unresolved symbols when
|
# Repackage the two libs into a single archive. This is required because
|
||||||
# the archives happen to be linked in the wrong order. For more details, see
|
# both libs reference symbols defined in the other one. For details, see
|
||||||
# https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking
|
# https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking
|
||||||
native.genrule(
|
archive = name + "_a"
|
||||||
name = name + "_a",
|
_merge_archive(
|
||||||
srcs = [
|
name = archive,
|
||||||
name + "_deps_a",
|
srcs = [lib, dlink],
|
||||||
name + "_dlink_a",
|
|
||||||
],
|
|
||||||
outs = [name + ".a"],
|
|
||||||
# See https://stackoverflow.com/a/23621751
|
|
||||||
cmd = """
|
|
||||||
addlibs=$$(echo $(SRCS) | sed "s/[^ ]* */\\naddlib &/g")
|
|
||||||
printf "create $@$${addlibs}\\nsave\\nend" | $(AR) -M
|
|
||||||
""",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Create cc target from archive.
|
||||||
native.cc_library(
|
native.cc_library(
|
||||||
name = name,
|
name = name,
|
||||||
srcs = [name + "_a"],
|
srcs = [archive],
|
||||||
deps = [
|
hdrs = hdrs,
|
||||||
"@local_config_cuda//cuda:cudart_static",
|
linkstatic = linkstatic,
|
||||||
],
|
|
||||||
linkstatic = True,
|
|
||||||
)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user