diff --git a/third_party/nccl/archive.BUILD b/third_party/nccl/archive.BUILD index 22b97280179..7a08f97ef32 100644 --- a/third_party/nccl/archive.BUILD +++ b/third_party/nccl/archive.BUILD @@ -1,110 +1,157 @@ # NVIDIA NCCL 2 # A package of optimized primitives for collective multi-GPU communication. -licenses(["notice"]) +licenses(["restricted"]) exports_files(["LICENSE.txt"]) load( "@local_config_nccl//:build_defs.bzl", - "cuda_rdc_library", - "gen_device_srcs", - "process_srcs", + "gen_nccl_h", + "nccl_library", + "rdc_copts", + "rdc_library", ) -load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cuda_library") - -process_srcs( - name = "process_srcs", - srcs = glob([ - "**/*.cc", - "**/*.h", - ]), +load( + "@local_config_cuda//cuda:build_defs.bzl", + "cuda_default_copts", ) -cc_library( +# Generate the nccl.h header file. +gen_nccl_h( + name = "nccl_h", + output = "src/nccl.h", + template = "src/nccl.h.in", +) + +nccl_library( name = "src_hdrs", hdrs = [ - "src/collectives/collectives.h", "src/nccl.h", + # src/include/common_coll.h #includes "collectives/collectives.h". + # All other #includes of collectives.h are patched in process_srcs. + "src/collectives/collectives.h", ], - data = [":process_srcs"], strip_include_prefix = "src", ) -cc_library( +nccl_library( name = "include_hdrs", hdrs = glob(["src/include/*.h"]), - data = [":process_srcs"], strip_include_prefix = "src/include", ) -cc_library( +filegroup( name = "device_hdrs", - hdrs = glob(["src/collectives/device/*.h"]), - strip_include_prefix = "src/collectives/device", + srcs = glob(["src/collectives/device/*.h"]), ) filegroup( name = "device_srcs", srcs = [ - "src/collectives/device/all_gather.cu.cc", - "src/collectives/device/all_reduce.cu.cc", - "src/collectives/device/broadcast.cu.cc", - "src/collectives/device/reduce.cu.cc", - "src/collectives/device/reduce_scatter.cu.cc", + "src/collectives/device/all_gather.cu", + "src/collectives/device/all_reduce.cu", + "src/collectives/device/broadcast.cu", + "src/collectives/device/reduce.cu", + "src/collectives/device/reduce_scatter.cu", ], ) -# NCCL compiles the same source files with different NCCL_OP defines. RDC -# compilation requires that each compiled module has a unique ID. Clang derives -# the module ID from the path only so we need to rename the files to get -# different IDs for different parts of compilation. NVCC does not have that -# problem because it generates IDs based on preprocessed content. 
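-#
-# As a concrete illustration (a sketch; the exact names come from the
-# _gen_device_srcs_impl rule in build_defs.bzl): all_reduce.cu.cc is expanded
-# into four copies, each with a distinct path-derived module ID and its own
-# NCCL_OP value injected right after "#define UNROLL 4":
-#
-#   sum_all_reduce.cu.cc   # gets "#define NCCL_OP 0"
-#   prod_all_reduce.cu.cc  # gets "#define NCCL_OP 1"
-#   min_all_reduce.cu.cc   # gets "#define NCCL_OP 2"
-#   max_all_reduce.cu.cc   # gets "#define NCCL_OP 3"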
-gen_device_srcs(
+nccl_library(
     name = "sum",
-    srcs = [":device_srcs"],
-    NCCL_OP = 0,
-)
-
-gen_device_srcs(
-    name = "prod",
-    srcs = [":device_srcs"],
-    NCCL_OP = 1,
-)
-
-gen_device_srcs(
-    name = "min",
-    srcs = [":device_srcs"],
-    NCCL_OP = 2,
-)
-
-gen_device_srcs(
-    name = "max",
-    srcs = [":device_srcs"],
-    NCCL_OP = 3,
-)
-
-cuda_rdc_library(
-    name = "device",
     srcs = [
-        "src/collectives/device/functions.cu.cc",
+        ":device_hdrs",
+        ":device_srcs",
+    ],
+    copts = ["-DNCCL_OP=0"] + rdc_copts(),
+    linkstatic = True,
+    prefix = "sum_",
+    deps = [
+        ":include_hdrs",
+        ":src_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
+nccl_library(
+    name = "prod",
+    srcs = [
+        ":device_hdrs",
+        ":device_srcs",
+    ],
+    copts = ["-DNCCL_OP=1"] + rdc_copts(),
+    linkstatic = True,
+    prefix = "prod_",
+    deps = [
+        ":include_hdrs",
+        ":src_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
+nccl_library(
+    name = "min",
+    srcs = [
+        ":device_hdrs",
+        ":device_srcs",
+    ],
+    copts = ["-DNCCL_OP=2"] + rdc_copts(),
+    linkstatic = True,
+    prefix = "min_",
+    deps = [
+        ":include_hdrs",
+        ":src_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
+nccl_library(
+    name = "max",
+    srcs = [
+        ":device_hdrs",
+        ":device_srcs",
+    ],
+    copts = ["-DNCCL_OP=3"] + rdc_copts(),
+    linkstatic = True,
+    prefix = "max_",
+    deps = [
+        ":include_hdrs",
+        ":src_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
+nccl_library(
+    name = "functions",
+    srcs = [
+        "src/collectives/device/functions.cu",
+        ":device_hdrs",
+    ],
+    copts = rdc_copts(),
+    linkstatic = True,
+    deps = [
+        ":include_hdrs",
+        ":src_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
+rdc_library(
+    name = "device_code",
+    deps = [
+        ":functions",
         ":max",
         ":min",
         ":prod",
         ":sum",
     ],
-    deps = [
-        ":device_hdrs",
-        ":include_hdrs",
-        ":src_hdrs",
-    ],
 )
 
 # Primary NCCL target.
-tf_cuda_library(
+nccl_library(
     name = "nccl",
     srcs = glob(
-        include = ["src/**/*.cu.cc"],
+        include = ["src/**/*.cu"],
         # Exclude device-library code.
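+        # Illustrative examples only (the actual file list depends on the
+        # NCCL archive): the include/exclude pair keeps host-side files such
+        # as src/init.cu and src/collectives/all_reduce.cu, while device
+        # kernels like src/collectives/device/all_reduce.cu are compiled
+        # separately through :device_code.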
exclude = ["src/collectives/device/**"], ) + [ @@ -115,14 +162,13 @@ tf_cuda_library( "src/nccl.h", ], hdrs = ["src/nccl.h"], - copts = ["-Wno-vla"], + copts = cuda_default_copts(), include_prefix = "third_party/nccl", strip_include_prefix = "src", visibility = ["//visibility:public"], deps = [ - ":device", + ":device_code", ":include_hdrs", ":src_hdrs", - "@local_config_cuda//cuda:cudart_static", ], ) diff --git a/third_party/nccl/build_defs.bzl.tpl b/third_party/nccl/build_defs.bzl.tpl index fe16f10432f..42de79c411c 100644 --- a/third_party/nccl/build_defs.bzl.tpl +++ b/third_party/nccl/build_defs.bzl.tpl @@ -1,86 +1,87 @@ """Repository rule for NCCL.""" -load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_cuda_library") -load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain") +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts") + +def _gen_nccl_h_impl(ctx): + """Creates nccl.h from a template.""" + ctx.actions.expand_template( + output = ctx.outputs.output, + template = ctx.file.template, + substitutions = { + "${nccl:Major}": "2", + "${nccl:Minor}": "3", + "${nccl:Patch}": "5", + "${nccl:Suffix}": "", + "${nccl:Version}": "2305", + }, + ) + +gen_nccl_h = rule( + implementation = _gen_nccl_h_impl, + attrs = { + "template": attr.label(allow_single_file = True), + "output": attr.output(), + }, +) +"""Creates the NCCL header file.""" def _process_srcs_impl(ctx): """Appends .cc to .cu files, patches include directives.""" files = [] for src in ctx.files.srcs: - substitutions = { - "\"collectives.h": "\"collectives/collectives.h", - "\"../collectives.h": "\"collectives/collectives.h", - # Clang does not define __CUDACC_VER_*__, use CUDA_VERSION instead. - # TODO(csigg): Apply substitutions upstream and remove here. - "#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)": "#if CUDA_VERSION >= 9200", - "#if __CUDACC_VER_MAJOR__ >= 10": "#if CUDA_VERSION >= 10000", - "#if __CUDACC_VER_MAJOR__ >= 9": "#if CUDA_VERSION >= 9000", - "#if __CUDACC_VER_MAJOR__ < 9": "#if CUDA_VERSION < 9000", - "nullptr_t": "std::nullptr_t", - } + if not src.is_source: + # Process only once, specifically "src/nccl.h". + files.append(src) + continue name = src.basename - if name == "nccl.in.h": - name = "nccl.h" - substitutions.update({ - "${nccl:Major}": "2", - "${nccl:Minor}": "3", - "${nccl:Patch}": "5", - "${nccl:Suffix}": "", - "${nccl:Version}": "2305", - }) - if name == "functions.cu": - # Don't try to initialize the host shadow copy of this device-side - # global variable. There is no host pointer to a device-side - # function, which confuses clang. - # TODO(csigg): remove when fixed in clang. 
-        substitutions.update({
-            "NCCL_FUNCS2B(ncclBroadcast),": "#if __CUDA_ARCH__\nNCCL_FUNCS2B(ncclBroadcast),",
-            "NCCL_FUNCS2A(ncclAllReduce)": "NCCL_FUNCS2A(ncclAllReduce)\n#endif",
-        })
         if src.extension == "cu":
-            name += ".cc"
-        file = ctx.actions.declare_file(name, sibling = src)
-        ctx.actions.expand_template(
-            output = file,
-            template = src,
-            substitutions = substitutions,
-        )
-        files.append(file)
-    return [DefaultInfo(files = depset(files))]
-
-process_srcs = rule(
-    implementation = _process_srcs_impl,
-    attrs = {
-        "srcs": attr.label_list(allow_files = True),
-    },
-)
-"""Processes the NCCL srcs so they can be compiled with bazel and clang."""
-
-def _gen_device_srcs_impl(ctx):
-    files = []
-    for src in ctx.files.srcs:
-        name = "%s_%s" % (ctx.attr.name, src.basename)
+            name = ctx.attr.prefix + name + ".cc"
         file = ctx.actions.declare_file(name, sibling = src)
         ctx.actions.expand_template(
             output = file,
             template = src,
             substitutions = {
-                "#define UNROLL 4": "#define UNROLL 4\n#define NCCL_OP %d" % ctx.attr.NCCL_OP,
+                "\"collectives.h": "\"collectives/collectives.h",
+                "\"../collectives.h": "\"collectives/collectives.h",
+                "#if __CUDACC_VER_MAJOR__": "#if defined __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__",
+                # Substitutions are applied in order.
+                "std::nullptr_t": "nullptr_t",
+                "nullptr_t": "std::nullptr_t",
             },
         )
         files.append(file)
     return [DefaultInfo(files = depset(files))]
 
-gen_device_srcs = rule(
-    implementation = _gen_device_srcs_impl,
+_process_srcs = rule(
+    implementation = _process_srcs_impl,
     attrs = {
         "srcs": attr.label_list(allow_files = True),
-        "NCCL_OP": attr.int(),
+        "prefix": attr.string(default = ""),
     },
 )
-"""Adds prefix to each file name in srcs and adds #define NCCL_OP."""
+"""Processes the NCCL srcs so they can be compiled with bazel and clang."""
 
-def _rdc_copts():
+def nccl_library(name, srcs = None, hdrs = None, prefix = None, **kwargs):
+    """Processes the srcs and hdrs and creates a cc_library."""
+
+    _process_srcs(
+        name = name + "_srcs",
+        srcs = srcs,
+        prefix = prefix,
+    )
+    _process_srcs(
+        name = name + "_hdrs",
+        srcs = hdrs,
+    )
+
+    native.cc_library(
+        name = name,
+        srcs = [name + "_srcs"] if srcs else [],
+        hdrs = [name + "_hdrs"] if hdrs else [],
+        **kwargs
+    )
+
+def rdc_copts():
     """Returns copts for compiling relocatable device code."""
 
     # The global functions can not have a lower register count than the
@@ -88,7 +89,7 @@ def _rdc_copts():
     # https://github.com/NVIDIA/nccl/blob/f93fe9bfd94884cec2ba711897222e0df5569a53/makefiles/common.mk#L48
     maxrregcount = "-maxrregcount=96"
 
-    return select({
+    return cuda_default_copts() + select({
         "@local_config_cuda//cuda:using_nvcc": [
             "-nvcc_options",
             "relocatable-device-code=true",
@@ -99,255 +100,118 @@ def _rdc_copts():
             "-fcuda-rdc",
             "-Xcuda-ptxas",
             maxrregcount,
-            # Workaround for a clang bug (fixed in r348662) that declares
-            # '__device__ operator delete(void*, std::size_t)' non-inline.
-            # TODO(csigg): Only add this option for older clang versions.
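-            # Sketch of the failure mode (our reading of the bug report, not
-            # verbatim clang output): under C++14 sized deallocation the
-            # compiler references
-            #   __device__ void operator delete(void*, std::size_t);
-            # and pre-r348662 clang emits it as a non-inline definition in
-            # every TU, so device linking sees duplicate symbols. Pinning
-            # gnu++11 disables sized deallocation entirely.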
- "-std=gnu++11", ], "//conditions:default": [], - }) + }) + ["-fvisibility=hidden"] -def _lookup_file(filegroup, path): - """Extracts file at (relative) path in filegroup.""" - for file in filegroup.files: - if file.path.endswith(path): - return file - return None +def _filter_impl(ctx): + suffix = ctx.attr.suffix + files = [src for src in ctx.files.srcs if src.path.endswith(suffix)] + return [DefaultInfo(files = depset(files))] -def _pic_only(files): - """Returns the PIC files if there are any in 'files', otherwise 'files'.""" - pic_only = [f for f in files if f.basename.find(".pic.") >= 0] - return pic_only if pic_only else files +_filter = rule( + implementation = _filter_impl, + attrs = { + "srcs": attr.label_list(allow_files = True), + "suffix": attr.string(), + }, +) +"""Filters the srcs to the ones ending with suffix.""" -def _device_link_impl(ctx): - if not ctx.attr.gpu_archs: - fail("No GPU architecture specified. NCCL requires --config=cuda or similar.") - - inputs = [] - for dep in ctx.attr.deps: - inputs += dep.files.to_list() - inputs = _pic_only(inputs) - - # Device-link to cubins for each architecture. - name = ctx.attr.name - register_h = None - cubins = [] - images = [] - for arch in ctx.attr.gpu_archs: - cubin = ctx.actions.declare_file("%s_%s.cubin" % (name, arch)) - register_h = ctx.actions.declare_file("%s_register_%s.h" % (name, arch)) - ctx.actions.run( - outputs = [register_h, cubin], - inputs = inputs, - executable = ctx.file._nvlink, - arguments = ctx.attr.nvlink_args + [ - "--arch=%s" % arch, - "--register-link-binaries=%s" % register_h.path, - "--output-file=%s" % cubin.path, - ] + [file.path for file in inputs], - mnemonic = "nvlink", - ) - cubins.append(cubin) - images.append("--image=profile=%s,file=%s" % (arch, cubin.path)) - - # Generate fatbin header from all cubins. - tmp_fatbin = ctx.actions.declare_file("%s.fatbin" % name) - fatbin_h = ctx.actions.declare_file("%s_fatbin.h" % name) - bin2c = ctx.file._bin2c - ctx.actions.run( - outputs = [tmp_fatbin, fatbin_h], - inputs = cubins, - executable = ctx.file._fatbinary, - arguments = [ - "-64", - "--cmdline=--compile-only", - "--link", - "--compress-all", - "--bin2c-path=%s" % bin2c.dirname, - "--create=%s" % tmp_fatbin.path, - "--embedded-fatbin=%s" % fatbin_h.path, - ] + images, - tools = [bin2c], - mnemonic = "fatbinary", - ) - - # Generate the source file #including the headers generated above. 
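-    # Conceptually, the expanded stub boils down to the following sketch
-    # (illustrative; the real crt/link.stub shipped with the CUDA SDK does
-    # considerably more):
-    #
-    #   #include "<name>_register_<arch>.h"  // from REGISTERLINKBINARYFILE
-    #   #include "<name>_fatbin.h"           // from FATBINFILE
-    #   // plus a static initializer that registers the embedded fatbin and
-    #   // every kernel listed in the register header with the CUDA runtime.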
+def _gen_link_src_impl(ctx):
     ctx.actions.expand_template(
-        output = ctx.outputs.out,
-        template = ctx.file._link_stub,
+        output = ctx.outputs.output,
+        template = ctx.file.template,
         substitutions = {
-            "REGISTERLINKBINARYFILE": '"%s"' % register_h.short_path,
-            "FATBINFILE": '"%s"' % fatbin_h.short_path,
+            "REGISTERLINKBINARYFILE": '"%s"' % ctx.file.register_hdr.short_path,
+            "FATBINFILE": '"%s"' % ctx.file.fatbin_hdr.short_path,
         },
     )
-    return [DefaultInfo(files = depset([register_h, fatbin_h]))]
-
-_device_link = rule(
-    implementation = _device_link_impl,
+_gen_link_src = rule(
+    implementation = _gen_link_src_impl,
     attrs = {
-        "deps": attr.label_list(),
-        "out": attr.output(mandatory = True),
-        "gpu_archs": attr.string_list(),
-        "nvlink_args": attr.string_list(),
-        "_nvlink": attr.label(
-            default = Label("@local_config_nccl//:nvlink"),
-            allow_single_file = True,
-            executable = True,
-            cfg = "host",
-        ),
-        "_fatbinary": attr.label(
-            default = Label("@local_config_nccl//:cuda/bin/fatbinary"),
-            allow_single_file = True,
-            executable = True,
-            cfg = "host",
-        ),
-        "_bin2c": attr.label(
-            default = Label("@local_config_nccl//:cuda/bin/bin2c"),
-            allow_single_file = True,
-            executable = True,
-            cfg = "host",
-        ),
-        "_link_stub": attr.label(
-            default = Label("@local_config_nccl//:cuda/bin/crt/link.stub"),
-            allow_single_file = True,
-        ),
+        "register_hdr": attr.label(allow_single_file = True),
+        "fatbin_hdr": attr.label(allow_single_file = True),
+        "template": attr.label(allow_single_file = True),
+        "output": attr.output(),
     },
 )
-"""Links device code and generates source code for kernel registration."""
+"""Patches the include directives for the link.stub file."""
 
-def _merge_archive_impl(ctx):
-    # Generate an MRI script to merge the archives in srcs and pass it to 'ar'.
-    # See https://stackoverflow.com/a/23621751.
-    files = _pic_only(ctx.files.srcs)
-    mri_script = "create " + ctx.outputs.out.path
-    for f in files:
-        mri_script += "\\naddlib " + f.path
-    mri_script += "\\nsave\\nend"
+def rdc_library(name, deps):
    """Produces a cc_library from deps containing relocatable device code."""

-    cc_toolchain = find_cpp_toolchain(ctx)
-    ctx.actions.run_shell(
-        inputs = ctx.files.srcs,  # + ctx.files._crosstool,
-        outputs = [ctx.outputs.out],
-        command = ("printf \"%s\" " % mri_script +
-                   "| %s -M" % cc_toolchain.ar_executable),
+    # From .a and .pic.a archives, just use the latter. Otherwise we get
+    # multiply defined symbols.
+    # TODO(csigg): Once C++ sandwich is available, it should allow passing this
+    # target to a cc_library dependency, which would avoid the linking order
+    # issue.
+    _filter(
+        name = name + "_deps_a",
+        srcs = deps,
+        suffix = ".pic.a",
     )

-_merge_archive = rule(
-    implementation = _merge_archive_impl,
-    attrs = {
-        "srcs": attr.label_list(mandatory = True, allow_files = True),
-        "_cc_toolchain": attr.label(default = "@bazel_tools//tools/cpp:current_cc_toolchain"),
-        # "_crosstool": attr.label_list(cfg = "host", default = ["@bazel_tools//tools/cpp:crosstool"]),
-    },
-    outputs = {"out": "lib%{name}.a"},
-)
-"""Merges srcs into a single archive."""

+    # Device-link to cubins for each architecture.
+    images = []
+    cubins = []
+    for arch in %{gpu_architectures}:
+        cubin = "%s_%s.cubin" % (name, arch)
+        register_hdr = "%s_%s.h" % (name, arch)
+        nvlink = "@local_config_nccl//:nvlink"
+        cmd = ("$(location %s) " % nvlink +
+               select({
+                   # NCCL is only supported on Linux.
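+                   # For name = "device_code" with a hypothetical
+                   # arch = "sm_70" on x86_64, cmd expands to roughly:
+                   #   nvlink --cpu-arch=X86_64 --arch=sm_70 <.pic.a files>
+                   #     --register-link-binaries=$(location device_code_sm_70.h)
+                   #     --output-file=$(location device_code_sm_70.cubin)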
+                   "@org_tensorflow//tensorflow:linux_x86_64": "--cpu-arch=X86_64 ",
+                   "@org_tensorflow//tensorflow:linux_ppc64le": "--cpu-arch=PPC64LE ",
+                   "//conditions:default": "",
+               }) +
+               "--arch=%s $(SRCS) " % arch +
+               "--register-link-binaries=$(location %s) " % register_hdr +
+               "--output-file=$(location %s)" % cubin)
+        native.genrule(
+            name = "%s_%s" % (name, arch),
+            outs = [register_hdr, cubin],
+            srcs = [name + "_deps_a"],
+            cmd = cmd,
+            tools = [nvlink],
+        )
+        images.append("--image=profile=%s,file=$(location %s)" % (arch, cubin))
+        cubins.append(cubin)
 
-def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwargs):
-    """Produces a cuda_library using separate compilation and linking.
-
-    CUDA separate compilation and linking allows device function calls across
-    translation units. This is different from the normal whole program
-    compilation where each translation unit contains all device code. For more
-    background, see
-    https://devblogs.nvidia.com/separate-compilation-linking-cuda-device-code/,
-    https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-options-for-separate-compilation
-
-    During separate compilation, the different CUDA source files are compiled
-    to 'relocatable device code' (RDC) and embedded in the host object files.
-    When using nvcc, linking the device code for each supported GPU
-    architecture and generating kernel registration code for the CUDA runtime
-    is handled automatically. Clang supports generating relocatable device
-    code, but it can't link it. We therefore rely on tools provided by the CUDA
-    SDK to link the device code and generate the host code to register the
-    kernels.
-
-    The nvlink tool extracts the RDC code from the object files and links it
-    into cubin files, one per GPU architecture. It also produces a header file
-    with a list of kernel names to register. The cubins are merged into a
-    binary blob using the fatbinary tool, and converted to a C header file with
-    the help of the bin2c tool. The registration header file, the fatbinary
-    header file, and the link.stub file (shipped with the CUDA SDK) are
-    compiled as ordinary host code.
-
-    Here is a diagram of the CUDA separate compilation trajectory:
-
-     x.cu.cc    y.cu.cc
-           \    /            cc_library (compile RDC and archive)
-            xy.a
-           /    \            * nvlink
-    register.h  xy.cubin
-            :    |           * fatbinary and bin2c
-            :  xy.fatbin.h
-            :    :           * #include
-            dlink.cc         * Expanded from crt/dlink.stub template
-               |             cc_library (host compile and archive)
-            dlink.a
-
-    The steps marked with '*' are implemented in the _device_link rule.
-
-    The object files in both xy.a and dlink.a reference symbols defined in the
-    other archive. The separate archives are a side effect of using two
-    cc_library targets to implement a single compilation trajectory. We could
-    fix this once bazel supports C++ sandwich. For now, we just merge the two
-    archives to avoid unresolved symbols:
-
-      xy.a     dlink.a
-         \     /             merge archive
-        xy_dlink.a
-            |                cc_library (or alternatively, cc_import)
-       final target
-
-    Another complication is that cc_library produces (depending on the
-    configuration) both PIC and non-PIC archives, but the distinction
-    is hidden from Starlark until C++ sandwich becomes available. We work
-    around this by dropping the non-PIC files if PIC files are available.
-
-    Args:
-      name: Target name.
-      hdrs: Header files.
-      copts: Compiler options.
-      linkstatic: Must be true.
-      **kwargs: Any other arguments.
-    """
-
-    if not hdrs:
-        hdrs = []
-    if not copts:
-        copts = []
-
-    # Compile host and device code into library.
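-    # For name = "device" (its instantiation in archive.BUILD), the steps
-    # below create, in order (a sketch; target names follow the code):
-    #   :device_lib      tf_cuda_library with host + RDC device objects
-    #   device_dlink.cc  source generated by _device_link from :device_lib
-    #   :device_dlink    host-compiled kernel registration code
-    #   :device_a        merged archive of the two libraries
-    #   :device          the final cc_library target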
-    lib = name + "_lib"
-    tf_cuda_library(
-        name = lib,
-        hdrs = hdrs,
-        copts = _rdc_copts() + copts,
-        linkstatic = linkstatic,
-        **kwargs
+    # Generate fatbin header from all cubins.
+    fatbin_hdr = name + ".fatbin.h"
+    fatbinary = "@local_config_nccl//:cuda/bin/fatbinary"
+    bin2c = "@local_config_nccl//:cuda/bin/bin2c"
+    cmd = ("$(location %s) -64 --cmdline=--compile-only " % fatbinary +
+           "--link --bin2c-path $$(dirname $(location %s)) " % bin2c +
+           "--compress-all %s --create=%%{name}.fatbin " % " ".join(images) +
+           "--embedded-fatbin=$@")
+    native.genrule(
+        name = name + "_fatbin_h",
+        outs = [fatbin_hdr],
+        srcs = cubins,
+        cmd = cmd,
+        tools = [fatbinary, bin2c],
     )
 
-    # Generate source file containing linked device code.
-    dlink_hdrs = name + "_dlink_hdrs"
-    dlink_cc = name + "_dlink.cc"
-    _device_link(
-        name = dlink_hdrs,
-        deps = [lib],
-        out = dlink_cc,
-        gpu_archs = %{gpu_architectures},
-        nvlink_args = select({
-            "@org_tensorflow//tensorflow:linux_x86_64": ["--cpu-arch=X86_64"],
-            "@org_tensorflow//tensorflow:linux_ppc64le": ["--cpu-arch=PPC64LE"],
-            "//conditions:default": [],
-        }),
+    # Generate the source file #including the headers generated above.
+    _gen_link_src(
+        name = name + "_dlink_src",
+        # Include just the last one; they are all equivalent.
+        register_hdr = register_hdr,
+        fatbin_hdr = fatbin_hdr,
+        template = "@local_config_nccl//:cuda/bin/crt/link.stub",
+        output = name + ".cc",
     )
 
-    # Compile the source file into a library.
-    dlink = name + "_dlink"
+    # Compile the source file into the cc_library.
     native.cc_library(
-        name = dlink,
-        srcs = [dlink_cc],
-        textual_hdrs = [dlink_hdrs],
+        name = name + "_dlink_a",
+        srcs = [
+            name + "_dlink_src",
+        ],
+        textual_hdrs = [register_hdr, fatbin_hdr],
         deps = [
             "@local_config_cuda//cuda:cuda_headers",
         ],
@@ -358,22 +222,31 @@ def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwarg
             "__NV_EXTRA_INITIALIZATION=",
             "__NV_EXTRA_FINALIZATION=",
         ],
-        linkstatic = linkstatic,
+        linkstatic = True,
     )
 
-    # Repackage the two libs into a single archive. This is required because
-    # both libs reference symbols defined in the other one. For details, see
+    # Repackage deps into a single archive. This avoids unresolved symbols when
+    # the archives happen to be linked in the wrong order. For more details, see
     # https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking
-    archive = name + "_a"
-    _merge_archive(
-        name = archive,
-        srcs = [lib, dlink],
+    native.genrule(
+        name = name + "_a",
+        srcs = [
+            name + "_deps_a",
+            name + "_dlink_a",
+        ],
+        outs = [name + ".a"],
+        # See https://stackoverflow.com/a/23621751
+        cmd = """
+addlibs=$$(echo $(SRCS) | sed "s/[^ ]* */\\naddlib &/g")
+printf "create $@$${addlibs}\\nsave\\nend" | $(AR) -M
+""",
    )

-    # Create cc target from archive.
     native.cc_library(
         name = name,
-        srcs = [archive],
-        hdrs = hdrs,
-        linkstatic = linkstatic,
+        srcs = [name + "_a"],
+        deps = [
+            "@local_config_cuda//cuda:cudart_static",
+        ],
+        linkstatic = True,
     )
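+
+# Usage sketch (hypothetical consumer target; assumes nccl_configure exposes
+# the library as @local_config_nccl//:nccl):
+#
+#   cc_library(
+#       name = "collective_comm",
+#       srcs = ["collective_comm.cc"],
+#       deps = ["@local_config_nccl//:nccl"],
+#   )
+#
+# built with a CUDA-enabled configuration, e.g.:
+#   bazel build --config=cuda //your/pkg:collective_comm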