Improve build rules to compile NCCL from source, in particular for clang.

PiperOrigin-RevId: 225051897
2018-12-11 12:39:12 -08:00 · 2018-12-11 12:39:12 -08:00 · d6a4685035
commit d6a4685035
parent e3d751c2a8
2 changed files with 356 additions and 275 deletions
--- a/third_party/nccl/archive.BUILD
+++ b/third_party/nccl/archive.BUILD
@ -1,157 +1,110 @@
 # NVIDIA NCCL 2
 # A package of optimized primitives for collective multi-GPU communication.
-licenses(["restricted"])
+licenses(["notice"])
 exports_files(["LICENSE.txt"])
 load(
    "@local_config_nccl//:build_defs.bzl",
-    "gen_nccl_h",
+    "cuda_rdc_library",
-    "nccl_library",
+    "gen_device_srcs",
-    "rdc_copts",
+    "process_srcs",
    "rdc_library",
 )
-load(
+load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cuda_library")
-    "@local_config_cuda//cuda:build_defs.bzl",
+
-    "cuda_default_copts",
+process_srcs(
    name = "process_srcs",
    srcs = glob([
        "**/*.cc",
        "**/*.h",
    ]),
 )
-# Generate the nccl.h header file.
+cc_library(
 gen_nccl_h(
    name = "nccl_h",
    output = "src/nccl.h",
    template = "src/nccl.h.in",
 )
 nccl_library(
    name = "src_hdrs",
    hdrs = [
        "src/nccl.h",
        # src/include/common_coll.h #includes "collectives/collectives.h".
        # All other #includes of collectives.h are patched in process_srcs.
        "src/collectives/collectives.h",
        "src/nccl.h",
    ],
    data = [":process_srcs"],
    strip_include_prefix = "src",
 )
-nccl_library(
+cc_library(
    name = "include_hdrs",
    hdrs = glob(["src/include/*.h"]),
    data = [":process_srcs"],
    strip_include_prefix = "src/include",
 )
-filegroup(
+cc_library(
    name = "device_hdrs",
-    srcs = glob(["src/collectives/device/*.h"]),
+    hdrs = glob(["src/collectives/device/*.h"]),
    strip_include_prefix = "src/collectives/device",
 )
 filegroup(
    name = "device_srcs",
    srcs = [
-        "src/collectives/device/all_gather.cu",
+        "src/collectives/device/all_gather.cu.cc",
-        "src/collectives/device/all_reduce.cu",
+        "src/collectives/device/all_reduce.cu.cc",
-        "src/collectives/device/broadcast.cu",
+        "src/collectives/device/broadcast.cu.cc",
-        "src/collectives/device/reduce.cu",
+        "src/collectives/device/reduce.cu.cc",
-        "src/collectives/device/reduce_scatter.cu",
+        "src/collectives/device/reduce_scatter.cu.cc",
    ],
 )
-nccl_library(
+# NCCL compiles the same source files with different NCCL_OP defines. RDC
 # compilation requires that each compiled module has a unique ID. Clang derives
 # the module ID from the path only so we need to rename the files to get
 # different IDs for different parts of compilation. NVCC does not have that
 # problem because it generates IDs based on preprocessed content.
 gen_device_srcs(
    name = "sum",
-    srcs = [
+    srcs = [":device_srcs"],
-        ":device_hdrs",
+    NCCL_OP = 0,
        ":device_srcs",
    ],
    copts = ["-DNCCL_OP=0"] + rdc_copts(),
    linkstatic = True,
    prefix = "sum_",
    deps = [
        ":include_hdrs",
        ":src_hdrs",
        "@local_config_cuda//cuda:cuda_headers",
    ],
 )
-nccl_library(
+gen_device_srcs(
    name = "prod",
-    srcs = [
+    srcs = [":device_srcs"],
-        ":device_hdrs",
+    NCCL_OP = 1,
        ":device_srcs",
    ],
    copts = ["-DNCCL_OP=1"] + rdc_copts(),
    linkstatic = True,
    prefix = "_prod",
    deps = [
        ":include_hdrs",
        ":src_hdrs",
        "@local_config_cuda//cuda:cuda_headers",
    ],
 )
-nccl_library(
+gen_device_srcs(
    name = "min",
-    srcs = [
+    srcs = [":device_srcs"],
-        ":device_hdrs",
+    NCCL_OP = 2,
        ":device_srcs",
    ],
    copts = ["-DNCCL_OP=2"] + rdc_copts(),
    linkstatic = True,
    prefix = "min_",
    deps = [
        ":include_hdrs",
        ":src_hdrs",
        "@local_config_cuda//cuda:cuda_headers",
    ],
 )
-nccl_library(
+gen_device_srcs(
    name = "max",
-    srcs = [
+    srcs = [":device_srcs"],
-        ":device_hdrs",
+    NCCL_OP = 3,
        ":device_srcs",
    ],
    copts = ["-DNCCL_OP=3"] + rdc_copts(),
    linkstatic = True,
    prefix = "max_",
    deps = [
        ":include_hdrs",
        ":src_hdrs",
        "@local_config_cuda//cuda:cuda_headers",
    ],
 )
-nccl_library(
+cuda_rdc_library(
-    name = "functions",
+    name = "device",
    srcs = [
-        "src/collectives/device/functions.cu",
+        "src/collectives/device/functions.cu.cc",
        ":device_hdrs",
    ],
    copts = rdc_copts(),
    linkstatic = True,
    deps = [
        ":include_hdrs",
        ":src_hdrs",
        "@local_config_cuda//cuda:cuda_headers",
    ],
 )
 rdc_library(
    name = "device_code",
    deps = [
        ":functions",
        ":max",
        ":min",
        ":prod",
        ":sum",
    ],
    deps = [
        ":device_hdrs",
        ":include_hdrs",
        ":src_hdrs",
    ],
 )
 # Primary NCCL target.
-nccl_library(
+tf_cuda_library(
    name = "nccl",
    srcs = glob(
-        include = ["src/**/*.cu"],
+        include = ["src/**/*.cu.cc"],
        # Exclude device-library code.
        exclude = ["src/collectives/device/**"],
    ) + [
@ -162,13 +115,14 @@ nccl_library(
        "src/nccl.h",
    ],
    hdrs = ["src/nccl.h"],
-    copts = cuda_default_copts(),
+    copts = ["-Wno-vla"],
    include_prefix = "third_party/nccl",
    strip_include_prefix = "src",
    visibility = ["//visibility:public"],
    deps = [
-        ":device_code",
+        ":device",
        ":include_hdrs",
        ":src_hdrs",
        "@local_config_cuda//cuda:cudart_static",
    ],
 )
--- a/third_party/nccl/build_defs.bzl.tpl
+++ b/third_party/nccl/build_defs.bzl.tpl
@ -1,87 +1,86 @@
 """Repository rule for NCCL."""
-load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
+load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cuda_library")
-
+load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
 def _gen_nccl_h_impl(ctx):
    """Creates nccl.h from a template."""
    ctx.actions.expand_template(
        output = ctx.outputs.output,
        template = ctx.file.template,
        substitutions = {
            "${nccl:Major}": "2",
            "${nccl:Minor}": "3",
            "${nccl:Patch}": "5",
            "${nccl:Suffix}": "",
            "${nccl:Version}": "2305",
        },
    )
 gen_nccl_h = rule(
    implementation = _gen_nccl_h_impl,
    attrs = {
        "template": attr.label(allow_single_file = True),
        "output": attr.output(),
    },
 )
 """Creates the NCCL header file."""
 def _process_srcs_impl(ctx):
    """Appends .cc to .cu files, patches include directives."""
    files = []
    for src in ctx.files.srcs:
-        if not src.is_source:
+        substitutions = {
-            # Process only once, specifically "src/nccl.h".
+            "\"collectives.h": "\"collectives/collectives.h",
-            files.append(src)
+            "\"../collectives.h": "\"collectives/collectives.h",
-            continue
+            # Clang does not define __CUDACC_VER_*__, use CUDA_VERSION instead.
            # TODO(csigg): Apply substitutions upstream and remove here.
            "#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)": "#if CUDA_VERSION >= 9200",
            "#if __CUDACC_VER_MAJOR__ >= 10": "#if CUDA_VERSION >= 10000",
            "#if __CUDACC_VER_MAJOR__ >= 9": "#if CUDA_VERSION >= 9000",
            "#if __CUDACC_VER_MAJOR__ < 9": "#if CUDA_VERSION < 9000",
            "nullptr_t": "std::nullptr_t",
        }
        name = src.basename
        if name == "nccl.in.h":
            name = "nccl.h"
            substitutions.update({
                "${nccl:Major}": "2",
                "${nccl:Minor}": "3",
                "${nccl:Patch}": "5",
                "${nccl:Suffix}": "",
                "${nccl:Version}": "2305",
            })
        if name == "functions.cu":
            # Don't try to initialize the host shadow copy of this device-side
            # global variable. There is no host pointer to a device-side
            # function, which confuses clang.
            # TODO(csigg): remove when fixed in clang.
            substitutions.update({
                "NCCL_FUNCS2B(ncclBroadcast),": "#if __CUDA_ARCH__\nNCCL_FUNCS2B(ncclBroadcast),",
                "NCCL_FUNCS2A(ncclAllReduce)": "NCCL_FUNCS2A(ncclAllReduce)\n#endif",
            })
        if src.extension == "cu":
-            name = ctx.attr.prefix + name + ".cc"
+            name += ".cc"
        file = ctx.actions.declare_file(name, sibling = src)
        ctx.actions.expand_template(
            output = file,
            template = src,
            substitutions = substitutions,
        )
        files.append(file)
    return [DefaultInfo(files = depset(files))]
 process_srcs = rule(
    implementation = _process_srcs_impl,
    attrs = {
        "srcs": attr.label_list(allow_files = True),
    },
 )
 """Processes the NCCL srcs so they can be compiled with bazel and clang."""
 def _gen_device_srcs_impl(ctx):
    files = []
    for src in ctx.files.srcs:
        name = "%s_%s" % (ctx.attr.name, src.basename)
        file = ctx.actions.declare_file(name, sibling = src)
        ctx.actions.expand_template(
            output = file,
            template = src,
            substitutions = {
-                "\"collectives.h": "\"collectives/collectives.h",
+                "#define UNROLL 4": "#define UNROLL 4\n#define NCCL_OP %d" % ctx.attr.NCCL_OP,
                "\"../collectives.h": "\"collectives/collectives.h",
                "#if __CUDACC_VER_MAJOR__": "#if defined __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__",
                # Substitutions are applied in order.
                "std::nullptr_t": "nullptr_t",
                "nullptr_t": "std::nullptr_t",
            },
        )
        files.append(file)
    return [DefaultInfo(files = depset(files))]
-_process_srcs = rule(
+gen_device_srcs = rule(
-    implementation = _process_srcs_impl,
+    implementation = _gen_device_srcs_impl,
    attrs = {
        "srcs": attr.label_list(allow_files = True),
-        "prefix": attr.string(default = ""),
+        "NCCL_OP": attr.int(),
    },
 )
-"""Processes the NCCL srcs so they can be compiled with bazel and clang."""
+"""Adds prefix to each file name in srcs and adds #define NCCL_OP."""
-def nccl_library(name, srcs = None, hdrs = None, prefix = None, **kwargs):
+def _rdc_copts():
    """Processes the srcs and hdrs and creates a cc_library."""
    _process_srcs(
        name = name + "_srcs",
        srcs = srcs,
        prefix = prefix,
    )
    _process_srcs(
        name = name + "_hdrs",
        srcs = hdrs,
    )
    native.cc_library(
        name = name,
        srcs = [name + "_srcs"] if srcs else [],
        hdrs = [name + "_hdrs"] if hdrs else [],
        **kwargs
    )
 def rdc_copts():
    """Returns copts for compiling relocatable device code."""
    # The global functions can not have a lower register count than the
@ -89,7 +88,7 @@ def rdc_copts():
    # https://github.com/NVIDIA/nccl/blob/f93fe9bfd94884cec2ba711897222e0df5569a53/makefiles/common.mk#L48
    maxrregcount = "-maxrregcount=96"
-    return cuda_default_copts() + select({
+    return select({
        "@local_config_cuda//cuda:using_nvcc": [
            "-nvcc_options",
            "relocatable-device-code=true",
@ -100,118 +99,255 @@ def rdc_copts():
            "-fcuda-rdc",
            "-Xcuda-ptxas",
            maxrregcount,
            # Work around for clang bug (fixed in r348662), declaring
            # '__device__ operator delete(void*, std::size_t)' non-inline.
            # TODO(csigg): Only add this option for older clang versions.
            "-std=gnu++11",
        ],
        "//conditions:default": [],
-    }) + ["-fvisibility=hidden"]
+    })
-def _filter_impl(ctx):
+def _lookup_file(filegroup, path):
-    suffix = ctx.attr.suffix
+    """Extracts file at (relative) path in filegroup."""
-    files = [src for src in ctx.files.srcs if src.path.endswith(suffix)]
+    for file in filegroup.files:
-    return [DefaultInfo(files = depset(files))]
+        if file.path.endswith(path):
            return file
    return None
-_filter = rule(
+def _pic_only(files):
-    implementation = _filter_impl,
+    """Returns the PIC files if there are any in 'files', otherwise 'files'."""
-    attrs = {
+    pic_only = [f for f in files if f.basename.find(".pic.") >= 0]
-        "srcs": attr.label_list(allow_files = True),
+    return pic_only if pic_only else files
        "suffix": attr.string(),
    },
 )
 """Filters the srcs to the ones ending with suffix."""
-def _gen_link_src_impl(ctx):
+def _device_link_impl(ctx):
-    ctx.actions.expand_template(
+    if not ctx.attr.gpu_archs:
-        output = ctx.outputs.output,
+        fail("No GPU architecture specified. NCCL requires --config=cuda or similar.")
        template = ctx.file.template,
        substitutions = {
            "REGISTERLINKBINARYFILE": '"%s"' % ctx.file.register_hdr.short_path,
            "FATBINFILE": '"%s"' % ctx.file.fatbin_hdr.short_path,
        },
    )
-_gen_link_src = rule(
+    inputs = []
-    implementation = _gen_link_src_impl,
+    for dep in ctx.attr.deps:
-    attrs = {
+        inputs += dep.files.to_list()
-        "register_hdr": attr.label(allow_single_file = True),
+    inputs = _pic_only(inputs)
        "fatbin_hdr": attr.label(allow_single_file = True),
        "template": attr.label(allow_single_file = True),
        "output": attr.output(),
    },
 )
 """Patches the include directives for the link.stub file."""
 def rdc_library(name, deps):
    """Produces a cc_library from deps containing relocatable device code."""
    # From .a and .pic.a archives, just use the latter. Otherwise we get
    # multiply defined symbols.
    # TODO(csigg): C++ Sandwich once available should allow passing this target
    # to a cc_library dependency, which would avoid the linking order issue.
    _filter(
        name = name + "_deps_a",
        srcs = deps,
        suffix = ".pic.a",
    )
    # Device-link to cubins for each architecture.
-    images = []
+    name = ctx.attr.name
    register_h = None
    cubins = []
-    for arch in %{gpu_architectures}:
+    images = []
-        cubin = "%s_%s.cubin" % (name, arch)
+    for arch in ctx.attr.gpu_archs:
-        register_hdr = "%s_%s.h" % (name, arch)
+        cubin = ctx.actions.declare_file("%s_%s.cubin" % (name, arch))
-        nvlink = "@local_config_nccl//:nvlink"
+        register_h = ctx.actions.declare_file("%s_register_%s.h" % (name, arch))
-        cmd = ("$(location %s) " % nvlink +
+        ctx.actions.run(
-               select({
+            outputs = [register_h, cubin],
-                   # NCCL is only supported on Linux.
+            inputs = inputs,
-                   "@org_tensorflow//tensorflow:linux_x86_64": "--cpu-arch=X86_64 ",
+            executable = ctx.file._nvlink,
-                   "@org_tensorflow//tensorflow:linux_ppc64le": "--cpu-arch=PPC64LE ",
+            arguments = ctx.attr.nvlink_args + [
-                   "//conditions:default": "",
+                "--arch=%s" % arch,
-               }) +
+                "--register-link-binaries=%s" % register_h.path,
-               "--arch=%s $(SRCS) " % arch +
+                "--output-file=%s" % cubin.path,
-               "--register-link-binaries=$(location %s) " % register_hdr +
+            ] + [file.path for file in inputs],
-               "--output-file=$(location %s)" % cubin)
+            mnemonic = "nvlink",
        native.genrule(
            name = "%s_%s" % (name, arch),
            outs = [register_hdr, cubin],
            srcs = [name + "_deps_a"],
            cmd = cmd,
            tools = [nvlink],
        )
        images.append("--image=profile=%s,file=$(location %s)" % (arch, cubin))
        cubins.append(cubin)
        images.append("--image=profile=%s,file=%s" % (arch, cubin.path))
    # Generate fatbin header from all cubins.
-    fatbin_hdr = name + ".fatbin.h"
+    tmp_fatbin = ctx.actions.declare_file("%s.fatbin" % name)
-    fatbinary = "@local_config_nccl//:cuda/bin/fatbinary"
+    fatbin_h = ctx.actions.declare_file("%s_fatbin.h" % name)
-    bin2c = "@local_config_nccl//:cuda/bin/bin2c"
+    bin2c = ctx.file._bin2c
-    cmd = ("$(location %s) -64 --cmdline=--compile-only " % fatbinary +
+    ctx.actions.run(
-           "--link --bin2c-path $$(dirname $(location %s)) " % bin2c +
+        outputs = [tmp_fatbin, fatbin_h],
-           "--compress-all %s --create=%%{name}.fatbin " % " ".join(images) +
+        inputs = cubins,
-           "--embedded-fatbin=$@")
+        executable = ctx.file._fatbinary,
-    native.genrule(
+        arguments = [
-        name = name + "_fatbin_h",
+            "-64",
-        outs = [fatbin_hdr],
+            "--cmdline=--compile-only",
-        srcs = cubins,
+            "--link",
-        cmd = cmd,
+            "--compress-all",
-        tools = [fatbinary, bin2c],
+            "--bin2c-path=%s" % bin2c.dirname,
            "--create=%s" % tmp_fatbin.path,
            "--embedded-fatbin=%s" % fatbin_h.path,
        ] + images,
        tools = [bin2c],
        mnemonic = "fatbinary",
    )
    # Generate the source file #including the headers generated above.
-    _gen_link_src(
+    ctx.actions.expand_template(
-        name = name + "_dlink_src",
+        output = ctx.outputs.out,
-        # Include just the last one, they are equivalent.
+        template = ctx.file._link_stub,
-        register_hdr = register_hdr,
+        substitutions = {
-        fatbin_hdr = fatbin_hdr,
+            "REGISTERLINKBINARYFILE": '"%s"' % register_h.short_path,
-        template = "@local_config_nccl//:cuda/bin/crt/link.stub",
+            "FATBINFILE": '"%s"' % fatbin_h.short_path,
-        output = name + ".cc",
+        },
    )
-    # Compile the source file into the cc_library.
+    return [DefaultInfo(files = depset([register_h, fatbin_h]))]
 _device_link = rule(
    implementation = _device_link_impl,
    attrs = {
        "deps": attr.label_list(),
        "out": attr.output(mandatory = True),
        "gpu_archs": attr.string_list(),
        "nvlink_args": attr.string_list(),
        "_nvlink": attr.label(
            default = Label("@local_config_nccl//:nvlink"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_fatbinary": attr.label(
            default = Label("@local_config_nccl//:cuda/bin/fatbinary"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_bin2c": attr.label(
            default = Label("@local_config_nccl//:cuda/bin/bin2c"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_link_stub": attr.label(
            default = Label("@local_config_nccl//:cuda/bin/crt/link.stub"),
            allow_single_file = True,
        ),
    },
 )
 """Links device code and generates source code for kernel registration."""
 def _merge_archive_impl(ctx):
    # Generate an mri script to the merge archives in srcs and pass it to 'ar'.
    # See https://stackoverflow.com/a/23621751.
    files = _pic_only(ctx.files.srcs)
    mri_script = "create " + ctx.outputs.out.path
    for f in files:
        mri_script += "\\naddlib " + f.path
    mri_script += "\\nsave\\nend"
    cc_toolchain = find_cpp_toolchain(ctx)
    ctx.actions.run_shell(
        inputs = ctx.files.srcs,  # + ctx.files._crosstool,
        outputs = [ctx.outputs.out],
        command = ("printf \"%s\" " % mri_script +
                   "| %s -M" % cc_toolchain.ar_executable),
    )
 _merge_archive = rule(
    implementation = _merge_archive_impl,
    attrs = {
        "srcs": attr.label_list(mandatory = True, allow_files = True),
        "_cc_toolchain": attr.label(default = "@bazel_tools//tools/cpp:current_cc_toolchain"),
        # "_crosstool": attr.label_list(cfg = "host", default = ["@bazel_tools//tools/cpp:crosstool"]),
    },
    outputs = {"out": "lib%{name}.a"},
 )
 """Merges srcs into a single archive."""
 def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwargs):
    """Produces a cuda_library using separate compilation and linking.
    CUDA separate compilation and linking allows device function calls across
    translation units. This is different from the normal whole program
    compilation where each translation unit contains all device code. For more
    background, see
    https://devblogs.nvidia.com/separate-compilation-linking-cuda-device-code/,
    https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-options-for-separate-compilation
    During separate compilation, the different CUDA source files are compiled
    to 'relocatable device code' (RDC) and embedded in the host object files.
    When using nvcc, linking the device code for each supported GPU
    architecture and generating kernel registration code for the CUDA runtime
    is handled automatically. Clang supports generating relocatable device
    code, but it can't link it. We therefore rely on tools provided by the CUDA
    SDK to link the device code and generate the host code to register the
    kernels.
    The nvlink tool extracts the RDC code from the object files and links it
    into cubin files, one per GPU architecture. It also produces a header file
    with a list of kernel names to register. The cubins are merged into a
    binary blob using the fatbinary tool, and converted to a C header file with
    the help of the bin2c tool. The registration header file, the fatbinary
    header file, and the link.stub file (shipped with the CUDA SDK) are
    compiled as ordinary host code.
    Here is a diagram of the CUDA separate compilation trajectory:
     x.cu.cc    y.cu.cc
           \    /            cc_library (compile RDC and archive)
            xy.a
           /    \            * nvlink
    register.h  xy.cubin
          :      |           * fatbinary and bin2c
          :     xy.fatbin.h
          :      :           * #include
          dlink.cc           * Expanded from crt/dlink.stub template
             |               cc_library (host compile and archive)
          dlink.a
    The steps marked with '*' are implemented in the _device_link rule.
    The object files in both xy.a and dlink.a reference symbols defined in the
    other archive. The separate archives are a side effect of using two
    cc_library targets to implement a single compilation trajectory. We could
    fix this once bazel supports C++ sandwich. For now, we just merge the two
    archives to avoid unresolved symbols:
    xy.a      dlink.a
        \    /           merge archive
      xy_dlink.a
           |             cc_library (or alternatively, cc_import)
     final target
    Another complication is that cc_library produces (depending on the
    configuration) both PIC and non-PIC archives, but the distinction
    is hidden from Starlark until C++ sandwich becomes available. We work
    around this by dropping the non-PIC files if PIC files are available.
    Args:
      name: Target name.
      hdrs: Header files.
      copts: Compiler options.
      linkstatic: Must be true.
      **kwargs: Any other arguments.
    """
    if not hdrs:
        hdrs = []
    if not copts:
        copts = []
    # Compile host and device code into library.
    lib = name + "_lib"
    tf_cuda_library(
        name = lib,
        hdrs = hdrs,
        copts = _rdc_copts() + copts,
        linkstatic = linkstatic,
        **kwargs
    )
    # Generate source file containing linked device code.
    dlink_hdrs = name + "_dlink_hdrs"
    dlink_cc = name + "_dlink.cc"
    _device_link(
        name = dlink_hdrs,
        deps = [lib],
        out = dlink_cc,
        gpu_archs = %{gpu_architectures},
        nvlink_args = select({
            "@org_tensorflow//tensorflow:linux_x86_64": ["--cpu-arch=X86_64"],
            "@org_tensorflow//tensorflow:linux_ppc64le": ["--cpu-arch=PPC64LE"],
            "//conditions:default": [],
        }),
    )
    # Compile the source file into a library.
    dlink = name + "_dlink"
    native.cc_library(
-        name = name + "_dlink_a",
+        name = dlink,
-        srcs = [
+        srcs = [dlink_cc],
-            name + "_dlink_src",
+        textual_hdrs = [dlink_hdrs],
        ],
        textual_hdrs = [register_hdr, fatbin_hdr],
        deps = [
            "@local_config_cuda//cuda:cuda_headers",
        ],
@ -222,31 +358,22 @@ def rdc_library(name, deps):
            "__NV_EXTRA_INITIALIZATION=",
            "__NV_EXTRA_FINALIZATION=",
        ],
-        linkstatic = True,
+        linkstatic = linkstatic,
    )
-    # Repackage deps into a single archive. This avoid unresolved symbols when
+    # Repackage the two libs into a single archive. This is required because
-    # the archives happen to be linked in the wrong order. For more details, see
+    # both libs reference symbols defined in the other one. For details, see
    # https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking
-    native.genrule(
+    archive = name + "_a"
-        name = name + "_a",
+    _merge_archive(
-        srcs = [
+        name = archive,
-            name + "_deps_a",
+        srcs = [lib, dlink],
            name + "_dlink_a",
        ],
        outs = [name + ".a"],
        # See https://stackoverflow.com/a/23621751
        cmd = """
 addlibs=$$(echo $(SRCS) | sed "s/[^ ]* */\\naddlib &/g")
 printf "create $@$${addlibs}\\nsave\\nend" | $(AR) -M
 """,
    )
    # Create cc target from archive.
    native.cc_library(
        name = name,
-        srcs = [name + "_a"],
+        srcs = [archive],
-        deps = [
+        hdrs = hdrs,
-            "@local_config_cuda//cuda:cudart_static",
+        linkstatic = linkstatic,
        ],
        linkstatic = True,
    )