Enable stripping of NCCL's relocatable device code with nvprune support coming in CUDA 11.3.
PiperOrigin-RevId: 353687332 Change-Id: I08aba126b7ff134cbdfe138aaf7a55bd0f0f7b55
This commit is contained in:
parent
5a251d2975
commit
e07069218c
18
third_party/gpus/cuda_configure.bzl
vendored
18
third_party/gpus/cuda_configure.bzl
vendored
@ -1087,21 +1087,15 @@ def _create_local_cuda_repository(repository_ctx):
|
|||||||
|
|
||||||
# copy files mentioned in third_party/nccl/build_defs.bzl.tpl
|
# copy files mentioned in third_party/nccl/build_defs.bzl.tpl
|
||||||
file_ext = ".exe" if is_windows(repository_ctx) else ""
|
file_ext = ".exe" if is_windows(repository_ctx) else ""
|
||||||
|
bin_files = (
|
||||||
|
["crt/link.stub"] +
|
||||||
|
[f + file_ext for f in ["bin2c", "fatbinary", "nvlink", "nvprune"]]
|
||||||
|
)
|
||||||
copy_rules.append(make_copy_files_rule(
|
copy_rules.append(make_copy_files_rule(
|
||||||
repository_ctx,
|
repository_ctx,
|
||||||
name = "cuda-bin",
|
name = "cuda-bin",
|
||||||
srcs = [
|
srcs = [cuda_config.cuda_toolkit_path + "/bin/" + f for f in bin_files],
|
||||||
cuda_config.cuda_toolkit_path + "/bin/" + "crt/link.stub",
|
outs = ["cuda/bin/" + f for f in bin_files],
|
||||||
cuda_config.cuda_toolkit_path + "/bin/" + "nvlink" + file_ext,
|
|
||||||
cuda_config.cuda_toolkit_path + "/bin/" + "fatbinary" + file_ext,
|
|
||||||
cuda_config.cuda_toolkit_path + "/bin/" + "bin2c" + file_ext,
|
|
||||||
],
|
|
||||||
outs = [
|
|
||||||
"cuda/bin/" + "crt/link.stub",
|
|
||||||
"cuda/bin/" + "nvlink" + file_ext,
|
|
||||||
"cuda/bin/" + "fatbinary" + file_ext,
|
|
||||||
"cuda/bin/" + "bin2c" + file_ext,
|
|
||||||
],
|
|
||||||
))
|
))
|
||||||
|
|
||||||
# Select the headers based on the cuDNN version (strip '64_' for Windows).
|
# Select the headers based on the cuDNN version (strip '64_' for Windows).
|
||||||
|
90
third_party/nccl/build_defs.bzl.tpl
vendored
90
third_party/nccl/build_defs.bzl.tpl
vendored
@ -3,6 +3,9 @@
|
|||||||
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "cuda_gpu_architectures")
|
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "cuda_gpu_architectures")
|
||||||
load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
|
load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
|
||||||
|
|
||||||
|
# CUDA toolkit version as tuple (e.g. '(11, 1)').
|
||||||
|
_cuda_version = %{cuda_version}
|
||||||
|
|
||||||
def _gen_device_srcs_impl(ctx):
|
def _gen_device_srcs_impl(ctx):
|
||||||
ops = ["sum", "prod", "min", "max"]
|
ops = ["sum", "prod", "min", "max"]
|
||||||
types = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]
|
types = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]
|
||||||
@ -113,7 +116,7 @@ def _device_link_impl(ctx):
|
|||||||
"--create=%s" % tmp_fatbin.path,
|
"--create=%s" % tmp_fatbin.path,
|
||||||
"--embedded-fatbin=%s" % fatbin_h.path,
|
"--embedded-fatbin=%s" % fatbin_h.path,
|
||||||
]
|
]
|
||||||
if %{use_bin2c_path}:
|
if _cuda_version <= (10, 1):
|
||||||
arguments_list.append("--bin2c-path=%s" % bin2c.dirname)
|
arguments_list.append("--bin2c-path=%s" % bin2c.dirname)
|
||||||
ctx.actions.run(
|
ctx.actions.run(
|
||||||
outputs = [tmp_fatbin, fatbin_h],
|
outputs = [tmp_fatbin, fatbin_h],
|
||||||
@ -171,55 +174,51 @@ _device_link = rule(
|
|||||||
|
|
||||||
def _prune_relocatable_code_impl(ctx):
|
def _prune_relocatable_code_impl(ctx):
|
||||||
"""Clears __nv_relfatbin section containing relocatable device code."""
|
"""Clears __nv_relfatbin section containing relocatable device code."""
|
||||||
empty_file = ctx.actions.declare_file(ctx.attr.name + "__nv_relfatbin")
|
|
||||||
ctx.actions.write(empty_file, "")
|
|
||||||
|
|
||||||
# Parse 'objcopy --version' and update section if it's at least v2.26.
|
if _cuda_version < (11, 3):
|
||||||
# Otherwise, simply copy the file without changing it.
|
# -no-relocatable-elf not supported, return unpruned input.
|
||||||
# TODO(csigg): version parsing is brittle, can we do better?
|
return ctx.attr.input[DefaultInfo]
|
||||||
command = r"""
|
|
||||||
objcopy=$1 \
|
# nvcc --generate-code options for the active set of cuda architectures.
|
||||||
section=$2 \
|
gencodes = []
|
||||||
input=$3 \
|
for code in ctx.attr.gpu_archs:
|
||||||
output=$4 \
|
arch = code.replace("compute_", "sm_")
|
||||||
args="" \
|
if code != arch:
|
||||||
pattern='([0-9])\.([0-9]+)'; \
|
gencodes.append((arch, arch))
|
||||||
if [[ $($objcopy --version) =~ $pattern ]] && { \
|
gencodes.append((arch, code))
|
||||||
[ ${BASH_REMATCH[1]} -gt 2 ] || \
|
|
||||||
[ ${BASH_REMATCH[2]} -ge 26 ]; }; then \
|
|
||||||
args="--update-section __nv_relfatbin=$section"; \
|
|
||||||
fi; \
|
|
||||||
$objcopy $args $input $output
|
|
||||||
"""
|
|
||||||
cc_toolchain = find_cpp_toolchain(ctx)
|
|
||||||
outputs = []
|
outputs = []
|
||||||
for src in ctx.files.srcs:
|
for input in ctx.files.input:
|
||||||
out = ctx.actions.declare_file("pruned_" + src.basename, sibling = src)
|
output = ctx.actions.declare_file(
|
||||||
ctx.actions.run_shell(
|
"pruned_" + input.basename,
|
||||||
inputs = [empty_file] + ctx.files.srcs, # + ctx.files._crosstool,
|
sibling = input,
|
||||||
outputs = [out],
|
|
||||||
arguments = [
|
|
||||||
cc_toolchain.objcopy_executable,
|
|
||||||
empty_file.path,
|
|
||||||
src.path,
|
|
||||||
out.path,
|
|
||||||
],
|
|
||||||
command = command,
|
|
||||||
)
|
)
|
||||||
outputs.append(out)
|
arguments = (
|
||||||
|
["--generate-code=arch=%s,code=%s" % code for code in gencodes] +
|
||||||
|
["-no-relocatable-elf", "--output-file=%s" % output.path, str(input.path)]
|
||||||
|
)
|
||||||
|
ctx.actions.run(
|
||||||
|
outputs = [output],
|
||||||
|
inputs = [input],
|
||||||
|
executable = ctx.file._nvprune,
|
||||||
|
arguments = arguments,
|
||||||
|
mnemonic = "nvprune",
|
||||||
|
)
|
||||||
|
# Collect the pruned file in the result list returned via DefaultInfo
# (receiver/argument were swapped: a File has no append()).
outputs.append(output)
|
||||||
|
|
||||||
return DefaultInfo(files = depset(outputs))
|
return DefaultInfo(files = depset(outputs))
|
||||||
|
|
||||||
_prune_relocatable_code = rule(
|
_prune_relocatable_code = rule(
|
||||||
implementation = _prune_relocatable_code_impl,
|
implementation = _prune_relocatable_code_impl,
|
||||||
attrs = {
|
attrs = {
|
||||||
"srcs": attr.label_list(mandatory = True, allow_files = True),
|
"input": attr.label(mandatory = True, allow_files = True),
|
||||||
"_cc_toolchain": attr.label(
|
"gpu_archs": attr.string_list(),
|
||||||
default = "@bazel_tools//tools/cpp:current_cc_toolchain",
|
"_nvprune": attr.label(
|
||||||
|
default = Label("@local_config_cuda//cuda:cuda/bin/nvprune"),
|
||||||
|
allow_single_file = True,
|
||||||
|
executable = True,
|
||||||
|
cfg = "host",
|
||||||
),
|
),
|
||||||
# "_crosstool": attr.label_list(
|
|
||||||
# cfg = "host",
|
|
||||||
# default = ["@bazel_tools//tools/cpp:crosstool"]
|
|
||||||
# ),
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -383,7 +382,8 @@ def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwarg
|
|||||||
pruned = name + "_pruned"
|
pruned = name + "_pruned"
|
||||||
_prune_relocatable_code(
|
_prune_relocatable_code(
|
||||||
name = pruned,
|
name = pruned,
|
||||||
srcs = [lib],
|
input = lib,
|
||||||
|
gpu_archs = cuda_gpu_architectures(),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Repackage the two libs into a single archive. This is required because
|
# Repackage the two libs into a single archive. This is required because
|
||||||
@ -392,11 +392,7 @@ def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwarg
|
|||||||
merged = name + "_merged"
|
merged = name + "_merged"
|
||||||
_merge_archive(
|
_merge_archive(
|
||||||
name = merged,
|
name = merged,
|
||||||
|
srcs = [pruned, dlink],
|
||||||
# TODO(b/166662245): We're deliberately not using `pruned` here.
|
|
||||||
# Pruning __nv_relfatbin also seems to prune out the PTX shipped with
|
|
||||||
# NCCL.
|
|
||||||
srcs = [lib, dlink],
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create cc target from archive.
|
# Create cc target from archive.
|
||||||
|
10
third_party/nccl/nccl_configure.bzl
vendored
10
third_party/nccl/nccl_configure.bzl
vendored
@ -76,23 +76,15 @@ def _create_local_nccl_repository(repository_ctx):
|
|||||||
|
|
||||||
cuda_config = find_cuda_config(repository_ctx, find_cuda_config_path, ["cuda"])
|
cuda_config = find_cuda_config(repository_ctx, find_cuda_config_path, ["cuda"])
|
||||||
cuda_version = cuda_config["cuda_version"].split(".")
|
cuda_version = cuda_config["cuda_version"].split(".")
|
||||||
cuda_major = cuda_version[0]
|
|
||||||
cuda_minor = cuda_version[1]
|
|
||||||
|
|
||||||
if nccl_version == "":
|
if nccl_version == "":
|
||||||
# Alias to open source build from @nccl_archive.
|
# Alias to open source build from @nccl_archive.
|
||||||
repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
|
repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
|
||||||
|
|
||||||
config_wrap = {
|
|
||||||
"%{use_bin2c_path}": "False",
|
|
||||||
}
|
|
||||||
if (int(cuda_major), int(cuda_minor)) <= (10, 1):
|
|
||||||
config_wrap["%{use_bin2c_path}"] = "True"
|
|
||||||
|
|
||||||
repository_ctx.template(
|
repository_ctx.template(
|
||||||
"build_defs.bzl",
|
"build_defs.bzl",
|
||||||
_label("build_defs.bzl.tpl"),
|
_label("build_defs.bzl.tpl"),
|
||||||
config_wrap,
|
# Only (major, minor) is substituted; slicing keeps the two-slot format
# valid even if the version string carries extra components (e.g. "x.y.z").
{"%{cuda_version}": "(%s, %s)" % tuple(cuda_version[:2])},
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Create target for locally installed NCCL.
|
# Create target for locally installed NCCL.
|
||||||
|
Loading…
Reference in New Issue
Block a user