Use CUB from the CUDA Toolkit starting with version 11.0.

PiperOrigin-RevId: 327096097
Change-Id: I444ec3ac3348f76728c931a4bb4aa1b7cbe1b673
Christian Sigg, 2020-08-17 14:12:02 -07:00 (committed by TensorFlower Gardener)
parent 4112865ad4
commit c6769e20bf
8 changed files with 36 additions and 67 deletions


@@ -490,7 +490,7 @@ cc_library(
     name = "gpu_prim_hdrs",
     hdrs = ["gpu_prim.h"],
     deps = if_cuda([
-        "@cub_archive//:cub",
+        "@local_config_cuda//cuda:cub_headers",
     ]) + if_rocm([
         "@local_config_rocm//rocm:rocprim",
     ]),
@@ -3896,7 +3896,7 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ] + if_cuda([
-        "@cub_archive//:cub",
+        "@local_config_cuda//cuda:cub_headers",
         "@local_config_cuda//cuda:cudnn_header",
     ]) + if_rocm([
         "@local_config_rocm//rocm:rocprim",
@@ -3986,7 +3986,7 @@ tf_kernel_library(
     ] + if_cuda_or_rocm([
         ":reduction_ops",
     ]) + if_cuda([
-        "@cub_archive//:cub",
+        "@local_config_cuda//cuda:cub_headers",
         "//tensorflow/core:stream_executor",
         "//tensorflow/stream_executor/cuda:cuda_stream",
     ]) + if_rocm([
@@ -4708,7 +4708,7 @@ tf_kernel_library(
     ] + if_cuda_or_rocm([
         ":reduction_ops",
     ]) + if_cuda([
-        "@cub_archive//:cub",
+        "@local_config_cuda//cuda:cub_headers",
     ]) + if_rocm([
         "@local_config_rocm//rocm:rocprim",
     ]),


@@ -15,19 +15,19 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_GPU_PRIM_H_
 
 #if GOOGLE_CUDA
-#include "third_party/cub/block/block_load.cuh"
-#include "third_party/cub/block/block_scan.cuh"
-#include "third_party/cub/block/block_store.cuh"
-#include "third_party/cub/device/device_histogram.cuh"
-#include "third_party/cub/device/device_radix_sort.cuh"
-#include "third_party/cub/device/device_reduce.cuh"
-#include "third_party/cub/device/device_segmented_radix_sort.cuh"
-#include "third_party/cub/device/device_segmented_reduce.cuh"
-#include "third_party/cub/device/device_select.cuh"
-#include "third_party/cub/iterator/counting_input_iterator.cuh"
-#include "third_party/cub/iterator/transform_input_iterator.cuh"
-#include "third_party/cub/thread/thread_operators.cuh"
-#include "third_party/cub/warp/warp_reduce.cuh"
+#include "cub/block/block_load.cuh"
+#include "cub/block/block_scan.cuh"
+#include "cub/block/block_store.cuh"
+#include "cub/device/device_histogram.cuh"
+#include "cub/device/device_radix_sort.cuh"
+#include "cub/device/device_reduce.cuh"
+#include "cub/device/device_segmented_radix_sort.cuh"
+#include "cub/device/device_segmented_reduce.cuh"
+#include "cub/device/device_select.cuh"
+#include "cub/iterator/counting_input_iterator.cuh"
+#include "cub/iterator/transform_input_iterator.cuh"
+#include "cub/thread/thread_operators.cuh"
+#include "cub/warp/warp_reduce.cuh"
 #include "third_party/gpus/cuda/include/cusparse.h"
 
 namespace gpuprim = ::cub;
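With the include prefix dropped, these includes are spelled the same whether they resolve to the external cub_archive or to the CUB bundled with the CUDA 11 toolkit, and kernels only ever reach CUB through the gpuprim alias. As a minimal, hedged sketch of that usage pattern (names are illustrative, error handling omitted; this program is not part of the commit), summing a device array through the alias:

// Hedged sketch: summing a device array via the gpuprim alias.
// Assumes a CUDA 11+ toolkit so that "cub/..." resolves to the bundled CUB.
#include <cstdio>
#include <cuda_runtime.h>
#include "cub/device/device_reduce.cuh"

namespace gpuprim = ::cub;  // same alias as in gpu_prim.h above

int main() {
  const int kN = 4;
  const int host_in[kN] = {1, 2, 3, 4};
  int *d_in = nullptr, *d_out = nullptr;
  cudaMalloc(&d_in, kN * sizeof(int));
  cudaMalloc(&d_out, sizeof(int));
  cudaMemcpy(d_in, host_in, kN * sizeof(int), cudaMemcpyHostToDevice);

  // CUB's two-phase pattern: the first call only computes the scratch
  // buffer size, the second call performs the reduction.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  gpuprim::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, kN);
  cudaMalloc(&d_temp, temp_bytes);
  gpuprim::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, kN);

  int sum = 0;
  cudaMemcpy(&sum, d_out, sizeof(int), cudaMemcpyDeviceToHost);
  printf("sum = %d\n", sum);  // expected: 10

  cudaFree(d_temp); cudaFree(d_in); cudaFree(d_out);
  return 0;
}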


@@ -626,7 +626,7 @@ tf_kernel_library(
         "//tensorflow/core:lib",
     ] + if_cuda([
         "//tensorflow/stream_executor/cuda:cusparse_lib",
-        "@cub_archive//:cub",
+        "@local_config_cuda//cuda:cub_headers",
     ]) + if_rocm([
         "@local_config_rocm//rocm:hipsparse",
     ]),


@@ -20,7 +20,6 @@ filegroup(
 cc_library(
     name = "cub",
     hdrs = if_cuda([":cub_header_files"]),
-    include_prefix = "third_party",
     deps = [
         "@local_config_cuda//cuda:cuda_headers",
     ],


@@ -1,48 +0,0 @@
From fd6e7a61a16a17fa155cbd717de0c79001af71e6 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra@google.com>
Date: Mon, 23 Sep 2019 11:18:56 -0700
Subject: [PATCH] Fix CUDA version detection in CUB

This fixes the problem with CUB using deprecated shfl/vote instructions when CUB
is compiled with clang (e.g. some TensorFlow builds).
---
 cub/util_arch.cuh | 3 ++-
 cub/util_type.cuh | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh
index 87c5ea2fb..9ad9d1cbb 100644
--- a/cub/util_arch.cuh
+++ b/cub/util_arch.cuh
@@ -44,7 +44,8 @@ namespace cub {
 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
 
-#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS)
+#if !defined(CUB_USE_COOPERATIVE_GROUPS) && \
+    (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000)
 #define CUB_USE_COOPERATIVE_GROUPS
 #endif
 
diff --git a/cub/util_type.cuh b/cub/util_type.cuh
index 0ba41e1ed..b2433d735 100644
--- a/cub/util_type.cuh
+++ b/cub/util_type.cuh
@@ -37,7 +37,7 @@
 #include <limits>
 #include <cfloat>
 
-#if (__CUDACC_VER_MAJOR__ >= 9)
+#if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000)
 #include <cuda_fp16.h>
 #endif
 
@@ -1063,7 +1063,7 @@ struct FpLimits<double>
 };
 
-#if (__CUDACC_VER_MAJOR__ >= 9)
+#if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000)
 template <>
 struct FpLimits<__half>
 {
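The patch above can be deleted because the CUB shipped with CUDA 11 already detects clang-driven builds, so CUB_USE_COOPERATIVE_GROUPS is set without it. That macro is what moves CUB's warp primitives onto the CUDA 9+ *_sync shuffle intrinsics instead of the deprecated ones. A hedged, self-contained illustration of the intrinsic itself (not code from this commit):

// Hedged illustration: warp-level sum with the CUDA 9+ __shfl_down_sync
// intrinsic, the replacement for the deprecated __shfl_down.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void WarpSum(const int* in, int* out) {
  int v = in[threadIdx.x];
  // Tree reduction across the 32 lanes of one warp; the full mask
  // 0xffffffff says every lane participates.
  for (int offset = 16; offset > 0; offset >>= 1)
    v += __shfl_down_sync(0xffffffffu, v, offset);
  if (threadIdx.x == 0) *out = v;  // lane 0 ends up with the total
}

int main() {
  int host[32], sum = 0;
  for (int i = 0; i < 32; ++i) host[i] = i + 1;  // 1 + 2 + ... + 32 = 528
  int *d_in = nullptr, *d_out = nullptr;
  cudaMalloc(&d_in, sizeof(host));
  cudaMalloc(&d_out, sizeof(int));
  cudaMemcpy(d_in, host, sizeof(host), cudaMemcpyHostToDevice);
  WarpSum<<<1, 32>>>(d_in, d_out);
  cudaMemcpy(&sum, d_out, sizeof(int), cudaMemcpyDeviceToHost);
  printf("warp sum = %d\n", sum);  // expected: 528
  cudaFree(d_in); cudaFree(d_out);
  return 0;
}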


@@ -176,6 +176,11 @@ cc_library(
     ],
 )
 
+alias(
+    name = "cub_headers",
+    actual = "%{cub_actual}"
+)
+
 cuda_header_library(
     name = "cupti_headers",
     hdrs = [":cuda-extras"],
@@ -224,3 +229,4 @@ py_library(
 )
 
 %{copy_rules}


@@ -171,6 +171,11 @@ cc_library(
     ],
 )
 
+alias(
+    name = "cub_headers",
+    actual = "%{cub_actual}"
+)
+
 cuda_header_library(
     name = "cupti_headers",
     hdrs = [":cuda-extras"],


@@ -692,6 +692,7 @@ def _get_cuda_config(repository_ctx, find_cuda_config_script):
     return struct(
         cuda_toolkit_path = toolkit_path,
         cuda_version = cuda_version,
+        cuda_version_major = cuda_major,
         cublas_version = cublas_version,
         cusolver_version = cusolver_version,
         curand_version = curand_version,
@@ -776,6 +777,7 @@ def _create_dummy_repository(repository_ctx):
         "%{curand_lib}": lib_name("curand", cpu_value),
         "%{cupti_lib}": lib_name("cupti", cpu_value),
         "%{cusparse_lib}": lib_name("cusparse", cpu_value),
+        "%{cub_actual}": ":cuda_headers",
         "%{copy_rules}": """
 filegroup(name="cuda-include")
 filegroup(name="cublas-include")
@@ -1122,6 +1124,10 @@ def _create_local_cuda_repository(repository_ctx):
         },
     )
 
+    cub_actual = "@cub_archive//:cub"
+    if int(cuda_config.cuda_version_major) >= 11:
+        cub_actual = ":cuda_headers"
+
     repository_ctx.template(
         "cuda/BUILD",
         tpl_paths["cuda:BUILD"],
@@ -1137,6 +1143,7 @@ def _create_local_cuda_repository(repository_ctx):
             "%{curand_lib}": _basename(repository_ctx, cuda_libs["curand"]),
             "%{cupti_lib}": _basename(repository_ctx, cuda_libs["cupti"]),
             "%{cusparse_lib}": _basename(repository_ctx, cuda_libs["cusparse"]),
+            "%{cub_actual}": cub_actual,
             "%{copy_rules}": "\n".join(copy_rules),
         },
     )
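Net effect of the repository rule: for CUDA 11 and later, cub_headers aliases the toolkit's own cuda_headers, and @cub_archive//:cub is only used for older toolkits. A hedged sketch of the same cutoff observed from the C++ side, via the toolkit's CUDA_VERSION macro (encoded as 1000 * major + 10 * minor, so 11.0 is 11000); illustrative only, not part of this commit:

// Hedged sketch: compile-time check that this toolkit bundles CUB,
// mirroring the cuda_version_major >= 11 test in the repository rule above.
#include <cstdio>
#include <cuda.h>  // defines CUDA_VERSION

#if CUDA_VERSION >= 11000
#include <cub/version.cuh>  // shipped in the CUDA 11+ toolkit include dir
#endif

int main() {
#if CUDA_VERSION >= 11000
  printf("CUDA %d bundles CUB %d; cub_headers -> :cuda_headers\n",
         CUDA_VERSION, CUB_VERSION);
#else
  printf("CUDA %d predates 11.0; cub_headers -> @cub_archive//:cub\n",
         CUDA_VERSION);
#endif
  return 0;
}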