From e07069218c39cbfc4bbad79fc50c83d64b0546af Mon Sep 17 00:00:00 2001
From: Christian Sigg <csigg@google.com>
Date: Mon, 25 Jan 2021 11:13:26 -0800
Subject: [PATCH] Enable stripping of NCCL's relocatable device code with
 nvprune support coming in CUDA 11.3.

PiperOrigin-RevId: 353687332
Change-Id: I08aba126b7ff134cbdfe138aaf7a55bd0f0f7b55
---
 third_party/gpus/cuda_configure.bzl |  18 ++---
 third_party/nccl/build_defs.bzl.tpl | 106 +++++++++++++---------------
 third_party/nccl/nccl_configure.bzl |  10 +--
 3 files changed, 58 insertions(+), 76 deletions(-)

diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 3ba34470b93..a017ab45843 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -1087,21 +1087,15 @@ def _create_local_cuda_repository(repository_ctx):
 
     # copy files mentioned in third_party/nccl/build_defs.bzl.tpl
     file_ext = ".exe" if is_windows(repository_ctx) else ""
+    bin_files = (
+        ["crt/link.stub"] +
+        [f + file_ext for f in ["bin2c", "fatbinary", "nvlink", "nvprune"]]
+    )
     copy_rules.append(make_copy_files_rule(
         repository_ctx,
         name = "cuda-bin",
-        srcs = [
-            cuda_config.cuda_toolkit_path + "/bin/" + "crt/link.stub",
-            cuda_config.cuda_toolkit_path + "/bin/" + "nvlink" + file_ext,
-            cuda_config.cuda_toolkit_path + "/bin/" + "fatbinary" + file_ext,
-            cuda_config.cuda_toolkit_path + "/bin/" + "bin2c" + file_ext,
-        ],
-        outs = [
-            "cuda/bin/" + "crt/link.stub",
-            "cuda/bin/" + "nvlink" + file_ext,
-            "cuda/bin/" + "fatbinary" + file_ext,
-            "cuda/bin/" + "bin2c" + file_ext,
-        ],
+        srcs = [cuda_config.cuda_toolkit_path + "/bin/" + f for f in bin_files],
+        outs = ["cuda/bin/" + f for f in bin_files],
     ))
 
     # Select the headers based on the cuDNN version (strip '64_' for Windows).
diff --git a/third_party/nccl/build_defs.bzl.tpl b/third_party/nccl/build_defs.bzl.tpl
index 7dd6ea58a2c..7e698f589a7 100644
--- a/third_party/nccl/build_defs.bzl.tpl
+++ b/third_party/nccl/build_defs.bzl.tpl
@@ -3,6 +3,9 @@
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "cuda_gpu_architectures")
 load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
 
+# CUDA toolkit version as tuple (e.g. '(11, 1)').
+_cuda_version = %{cuda_version}
+
 def _gen_device_srcs_impl(ctx):
     ops = ["sum", "prod", "min", "max"]
     types = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]
@@ -106,15 +109,15 @@ def _device_link_impl(ctx):
     fatbin_h = ctx.actions.declare_file("%s_fatbin.h" % name)
     bin2c = ctx.file._bin2c
     arguments_list = [
-            "-64",
-            "--cmdline=--compile-only",
-            "--link",
-            "--compress-all",
-            "--create=%s" % tmp_fatbin.path,
-            "--embedded-fatbin=%s" % fatbin_h.path,
-        ]
-    if %{use_bin2c_path}:
-           arguments_list.append("--bin2c-path=%s" % bin2c.dirname)
+        "-64",
+        "--cmdline=--compile-only",
+        "--link",
+        "--compress-all",
+        "--create=%s" % tmp_fatbin.path,
+        "--embedded-fatbin=%s" % fatbin_h.path,
+    ]
+    if _cuda_version <= (10, 1):
+        arguments_list.append("--bin2c-path=%s" % bin2c.dirname)
     ctx.actions.run(
         outputs = [tmp_fatbin, fatbin_h],
         inputs = cubins,
@@ -171,55 +174,51 @@ _device_link = rule(
 
 def _prune_relocatable_code_impl(ctx):
     """Clears __nv_relfatbin section containing relocatable device code."""
-    empty_file = ctx.actions.declare_file(ctx.attr.name + "__nv_relfatbin")
-    ctx.actions.write(empty_file, "")
 
-    # Parse 'objcopy --version' and update section if it's at least v2.26.
-    # Otherwise, simply copy the file without changing it.
-    # TODO(csigg): version parsing is brittle, can we do better?
-    command = r"""
-        objcopy=$1                                         \
-        section=$2                                         \
-        input=$3                                           \
-        output=$4                                          \
-        args=""                                            \
-        pattern='([0-9])\.([0-9]+)';                       \
-        if [[ $($objcopy --version) =~ $pattern ]] && {    \
-            [ ${BASH_REMATCH[1]} -gt 2 ] ||                \
-            [ ${BASH_REMATCH[2]} -ge 26 ]; }; then         \
-          args="--update-section __nv_relfatbin=$section"; \
-        fi;                                                \
-        $objcopy $args $input $output
-    """
-    cc_toolchain = find_cpp_toolchain(ctx)
+    if _cuda_version < (11, 3):
+        # -no-relocatable-elf not supported, return unpruned input.
+        return ctx.attr.input[DefaultInfo]
+
+    # nvcc --generate-code options for the active set of cuda architectures.
+    gencodes = []
+    for code in ctx.attr.gpu_archs:
+        arch = code.replace("compute_", "sm_")
+        if code != arch:
+            gencodes.append((arch, arch))
+        gencodes.append((arch, code))
+
     outputs = []
-    for src in ctx.files.srcs:
-        out = ctx.actions.declare_file("pruned_" + src.basename, sibling = src)
-        ctx.actions.run_shell(
-            inputs = [empty_file] + ctx.files.srcs,  # + ctx.files._crosstool,
-            outputs = [out],
-            arguments = [
-                cc_toolchain.objcopy_executable,
-                empty_file.path,
-                src.path,
-                out.path,
-            ],
-            command = command,
+    for input in ctx.files.input:
+        output = ctx.actions.declare_file(
+            "pruned_" + input.basename,
+            sibling = input,
         )
-        outputs.append(out)
+        arguments = (
+            ["--generate-code=arch=%s,code=%s" % code for code in gencodes] +
+            ["-no-relocatable-elf", "--output-file=%s" % output.path, str(input.path)]
+        )
+        ctx.actions.run(
+            outputs = [output],
+            inputs = [input],
+            executable = ctx.file._nvprune,
+            arguments = arguments,
+            mnemonic = "nvprune",
+        )
+        output.append(outputs)
+
     return DefaultInfo(files = depset(outputs))
 
 _prune_relocatable_code = rule(
     implementation = _prune_relocatable_code_impl,
     attrs = {
-        "srcs": attr.label_list(mandatory = True, allow_files = True),
-        "_cc_toolchain": attr.label(
-            default = "@bazel_tools//tools/cpp:current_cc_toolchain",
+        "input": attr.label(mandatory = True, allow_files = True),
+        "gpu_archs": attr.string_list(),
+        "_nvprune": attr.label(
+            default = Label("@local_config_cuda//cuda:cuda/bin/nvprune"),
+            allow_single_file = True,
+            executable = True,
+            cfg = "host",
         ),
-        # "_crosstool": attr.label_list(
-        #     cfg = "host",
-        #     default = ["@bazel_tools//tools/cpp:crosstool"]
-        # ),
     },
 )
 
@@ -383,7 +382,8 @@ def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwarg
     pruned = name + "_pruned"
     _prune_relocatable_code(
         name = pruned,
-        srcs = [lib],
+        input = lib,
+        gpu_archs = cuda_gpu_architectures(),
     )
 
     # Repackage the two libs into a single archive. This is required because
@@ -392,11 +392,7 @@ def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwarg
     merged = name + "_merged"
     _merge_archive(
         name = merged,
-
-        # TODO(b/166662245): We're deliberately not using `pruned` here.
-        # Pruning __nv_relfatbin also seems to prune out the PTX shipped with
-        # NCCL.
-        srcs = [lib, dlink],
+        srcs = [pruned, dlink],
     )
 
     # Create cc target from archive.
diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl
index d59e861d70b..5432c35fac0 100644
--- a/third_party/nccl/nccl_configure.bzl
+++ b/third_party/nccl/nccl_configure.bzl
@@ -76,23 +76,15 @@ def _create_local_nccl_repository(repository_ctx):
 
     cuda_config = find_cuda_config(repository_ctx, find_cuda_config_path, ["cuda"])
     cuda_version = cuda_config["cuda_version"].split(".")
-    cuda_major = cuda_version[0]
-    cuda_minor = cuda_version[1]
 
     if nccl_version == "":
         # Alias to open source build from @nccl_archive.
         repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
 
-        config_wrap = {
-            "%{use_bin2c_path}": "False",
-        }
-        if (int(cuda_major), int(cuda_minor)) <= (10, 1):
-            config_wrap["%{use_bin2c_path}"] = "True"
-
         repository_ctx.template(
             "build_defs.bzl",
             _label("build_defs.bzl.tpl"),
-            config_wrap,
+            {"%{cuda_version}": "(%s, %s)" % tuple(cuda_version)},
         )
     else:
         # Create target for locally installed NCCL.