PR #31485: [ROCm] add ROCm RCCL support

Imported from GitHub PR #31485 Copybara import of the project: - ba5748981bb02b9d0e91114cdc30eb64d1650a46 add ROCm RCCL support by Jeff Daily <jeff.daily@amd.com> - 6f887a19731f030be58495ae4fea98b3ad1f1cc3 run buildifier against tensorflow/core/nccl/BUILD by Jeff Daily <jeff.daily@amd.com> - 55ce583cf484953d90eb9b9310dc77cf63b4c0c9 Merge 6f887a19731f030be58495ae4fea98b3ad1f1cc3 into f9233... by Jeff Daily <jeff.daily@amd.com> PiperOrigin-RevId: 264892468
2019-08-22 12:22:14 -07:00 · 2019-08-22 12:22:14 -07:00 · b8f3b8d28b
commit b8f3b8d28b
parent b84c2a888f
16 changed files with 111 additions and 34 deletions
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@ -194,6 +194,17 @@ tf_cc_test(
    ],
 )

+# virtual targets since nested select statements not possible
+tf_kernel_library(
+    name = "virtual_nccl",
+    deps = if_cuda(["@local_config_nccl//:nccl"]),
+)
+
+tf_kernel_library(
+    name = "virtual_rccl",
+    deps = if_rocm(["@local_config_rocm//rocm:rccl"]),
+)
+
 tf_kernel_library(
    name = "collective_ops",
    srcs = if_nccl([
@ -213,7 +224,8 @@ tf_kernel_library(
        "//tensorflow/core:protos_all_cc",
        "//tensorflow/core/profiler/lib:traceme",
    ] + if_nccl([
-        "@local_config_nccl//:nccl",
+        ":virtual_nccl",
+        ":virtual_rccl",
        "//tensorflow/core/nccl:nccl_lib",
    ]),
 )
@ -382,11 +394,14 @@ cc_library(

 tf_kernel_library(
    name = "nccl_kernels",
-    srcs = if_cuda([
+    srcs = if_cuda_or_rocm([
        "nccl_ops.cc",
    ]),
    deps = if_cuda([
        "@local_config_nccl//:nccl",
+    ]) + if_rocm([
+        "@local_config_rocm//rocm:rccl",
+    ]) + if_cuda_or_rocm([
        "//tensorflow/core/nccl:nccl_lib",
        "//tensorflow/core:framework",
        "//tensorflow/core:gpu_headers_lib",
--- a/tensorflow/core/kernels/collective_nccl.cc
+++ b/tensorflow/core/kernels/collective_nccl.cc
@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/collective_nccl.h"

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #include "tensorflow/core/common_runtime/collective_util.h"
 #include "tensorflow/core/nccl/nccl_manager.h"
@ -79,4 +79,4 @@ const string NcclBase::NcclCollectiveKey(const string& exec_key, int step_id) {

 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/collective_nccl.h
+++ b/tensorflow/core/kernels/collective_nccl.h
@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/collective.h"

 namespace tensorflow {
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 class NcclBase : public CollectiveImplementationInterface {
 public:
@ -44,7 +44,7 @@ class NcclBase : public CollectiveImplementationInterface {
  const CollectiveParams* col_params_;  // Not owned
 };

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }  // namespace tensorflow

 #endif  // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_H_
--- a/tensorflow/core/kernels/collective_nccl_broadcaster.cc
+++ b/tensorflow/core/kernels/collective_nccl_broadcaster.cc
@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/collective_nccl_broadcaster.h"

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #include "tensorflow/core/common_runtime/collective_util.h"
 #include "tensorflow/core/nccl/nccl_manager.h"
@ -80,4 +80,4 @@ REGISTER_COLLECTIVE(NcclBroadcast, NcclBroadcaster);

 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/collective_nccl_broadcaster.h
+++ b/tensorflow/core/kernels/collective_nccl_broadcaster.h
@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/collective_nccl.h"

 namespace tensorflow {
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 class NcclBroadcaster : public NcclBase {
 public:
@ -29,7 +29,7 @@ class NcclBroadcaster : public NcclBase {
  void Run(StatusCallback done) override;
 };

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }  // namespace tensorflow

 #endif  // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_BROADCASTER_H_
--- a/tensorflow/core/kernels/collective_nccl_gatherer.cc
+++ b/tensorflow/core/kernels/collective_nccl_gatherer.cc
@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/collective_nccl_gatherer.h"

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #include "tensorflow/core/common_runtime/collective_util.h"
 #include "tensorflow/core/nccl/nccl_manager.h"
@ -70,4 +70,4 @@ REGISTER_COLLECTIVE(NcclGather, NcclGatherer);

 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/collective_nccl_gatherer.h
+++ b/tensorflow/core/kernels/collective_nccl_gatherer.h
@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/collective_nccl.h"

 namespace tensorflow {
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 class NcclGatherer : public NcclBase {
 public:
@ -29,7 +29,7 @@ class NcclGatherer : public NcclBase {
  void Run(StatusCallback done) override;
 };

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }  // namespace tensorflow

 #endif  // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_GATHERER_H_
--- a/tensorflow/core/kernels/collective_nccl_reducer.cc
+++ b/tensorflow/core/kernels/collective_nccl_reducer.cc
@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/collective_nccl_reducer.h"

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #include "tensorflow/core/common_runtime/collective_util.h"
 #include "tensorflow/core/nccl/nccl_manager.h"
@ -191,4 +191,4 @@ REGISTER_COLLECTIVE(NcclReduce, NcclReducer);

 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/collective_nccl_reducer.h
+++ b/tensorflow/core/kernels/collective_nccl_reducer.h
@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/collective_nccl.h"

 namespace tensorflow {
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 class NcclReducer : public NcclBase {
 public:
@ -29,7 +29,7 @@ class NcclReducer : public NcclBase {
  void Run(StatusCallback done) override;
 };

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }  // namespace tensorflow

 #endif  // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_REDUCER_H_
--- a/tensorflow/core/kernels/nccl_ops.cc
+++ b/tensorflow/core/kernels/nccl_ops.cc
@ -13,11 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #include <vector>

+#if GOOGLE_CUDA
 #include "third_party/nccl/nccl.h"
+#elif TENSORFLOW_USE_ROCM
+#include "rocm/include/rccl/rccl.h"
+#endif
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/nccl/nccl_manager.h"

@ -276,4 +280,4 @@ REGISTER_KERNEL_BUILDER(Name("NcclReduce").Device(DEVICE_GPU), NcclStubKernel);
 }  // namespace
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/nccl/BUILD
+++ b/tensorflow/core/nccl/BUILD
@ -5,6 +5,8 @@
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_copts")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
+load("//tensorflow:tensorflow.bzl", "if_cuda_or_rocm")
 load(
    "//tensorflow/core/platform:default/build_config_root.bzl",
    "tf_cuda_tests_tags",
@ -19,18 +21,21 @@ exports_files(["LICENSE"])

 cc_library(
    name = "nccl_lib",
-    srcs = if_cuda([
+    srcs = if_cuda_or_rocm([
        "nccl_manager.cc",
        "nccl_rewrite.cc",
    ]),
-    hdrs = if_cuda([
+    hdrs = if_cuda_or_rocm([
        "nccl_manager.h",
    ]),
    copts = tf_copts(),
    deps = if_cuda([
-        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_absl//absl/memory",
        "@local_config_nccl//:nccl",
+    ]) + if_rocm([
+        "@local_config_rocm//rocm:rccl",
+    ]) + if_cuda_or_rocm([
+        "@com_google_absl//absl/container:flat_hash_map",
        "//tensorflow/core:core_cpu",
        "//tensorflow/core:framework",
        "//tensorflow/core:gpu_headers_lib",
@ -51,9 +56,13 @@ tf_cuda_cc_test(
        "//tensorflow/core:test",
        "//tensorflow/core:test_main",
        "//tensorflow/core:testlib",
-    ] + if_cuda([
+    ] + if_cuda_or_rocm([
        ":nccl_lib",
+    ]) + if_cuda([
        "@local_config_nccl//:nccl",
        "//tensorflow/core:cuda",
+    ]) + if_rocm([
+        "@local_config_rocm//rocm:rccl",
+        "//tensorflow/core:rocm",
    ]),
 )
--- a/tensorflow/core/nccl/nccl_manager.cc
+++ b/tensorflow/core/nccl/nccl_manager.cc
@ -16,15 +16,32 @@ limitations under the License.

 #include <utility>

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/cuda.h"
 #include "tensorflow/core/platform/env.h"
+#if GOOGLE_CUDA
+#include "tensorflow/core/platform/cuda.h"
+#elif TENSORFLOW_USE_ROCM
+#include "tensorflow/core/platform/rocm.h"
+#endif

 namespace tensorflow {

+#if GOOGLE_CUDA
+using se::cuda::ScopedActivateExecutorContext;
+#elif TENSORFLOW_USE_ROCM
+using se::rocm::ScopedActivateExecutorContext;
+// Local hipify of cuda symbols
+#define cudaError_t hipError_t
+#define cudaStream_t hipStream_t
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetDevice hipGetDevice
+#define cudaSetDevice hipSetDevice
+#define cudaSuccess hipSuccess
+#endif
+
 #define NCCL_RETURN_IF_ERROR(...)                               \
  do {                                                          \
    ncclResult_t nccl_status = (__VA_ARGS__);                   \
@ -41,8 +58,6 @@ namespace tensorflow {
    }                                                           \
  } while (0)

-using se::cuda::ScopedActivateExecutorContext;
-
 // Contains data for a single stream used for nccl communication; this includes
 // a background thread that calls NcclManager::LoopKernelLaunches.
 struct NcclManager::NcclStream : public core::RefCounted {
@ -709,4 +724,4 @@ void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) {

 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/nccl/nccl_manager.h
+++ b/tensorflow/core/nccl/nccl_manager.h
@ -15,7 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_
 #define TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #include <vector>

@ -28,7 +28,11 @@ limitations under the License.

 #include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
+#if GOOGLE_CUDA
 #include "third_party/nccl/nccl.h"
+#elif TENSORFLOW_USE_ROCM
+#include "rocm/include/rccl/rccl.h"
+#endif
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/platform/mutex.h"
@ -245,6 +249,6 @@ class NcclManager {

 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #endif  // TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_
--- a/tensorflow/core/nccl/nccl_manager_test.cc
+++ b/tensorflow/core/nccl/nccl_manager_test.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #include "tensorflow/core/nccl/nccl_manager.h"

@ -802,4 +802,4 @@ TYPED_TEST(NcclManagerTest, BroadcastInconsistentSource) {

 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/third_party/gpus/rocm/BUILD.tpl
+++ b/third_party/gpus/rocm/BUILD.tpl
@ -84,6 +84,18 @@ cc_library(
    visibility = ["//visibility:public"],
 )

+cc_library(
+    name = "rccl",
+    srcs = ["rocm/lib/%{rccl_lib}"],
+    data = ["rocm/lib/%{rccl_lib}"],
+    includes = [
+        ".",
+        "rocm/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
 cc_library(
    name = "rocm",
    visibility = ["//visibility:public"],
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@ -201,6 +201,9 @@ def _rocm_include_path(repository_ctx, rocm_config):
    # Add MIOpen headers
    inc_dirs.append("/opt/rocm/miopen/include")

+    # Add RCCL headers
+    inc_dirs.append("/opt/rocm/rccl/include")
+
    # Add hcc headers
    inc_dirs.append("/opt/rocm/hcc/include")
    inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/7.0.0/include/")
@ -472,6 +475,12 @@ def _find_libs(repository_ctx, rocm_config):
            cpu_value,
            rocm_config.rocm_toolkit_path + "/miopen",
        ),
+        "rccl": _find_rocm_lib(
+            "rccl",
+            repository_ctx,
+            cpu_value,
+            rocm_config.rocm_toolkit_path + "/rccl",
+        ),
    }

 def _get_rocm_config(repository_ctx):
@ -554,6 +563,7 @@ def _create_dummy_repository(repository_ctx):
            "%{hip_lib}": _lib_name("hip", cpu_value),
            "%{rocblas_lib}": _lib_name("rocblas", cpu_value),
            "%{miopen_lib}": _lib_name("miopen", cpu_value),
+            "%{rccl_lib}": _lib_name("rccl", cpu_value),
            "%{rocfft_lib}": _lib_name("rocfft", cpu_value),
            "%{hiprand_lib}": _lib_name("hiprand", cpu_value),
            "%{copy_rules}": "",
@ -695,6 +705,12 @@ def _create_local_rocm_repository(repository_ctx):
            src_dir = rocm_toolkit_path + "/miopen/include",
            out_dir = "rocm/include/miopen",
        ),
+        make_copy_dir_rule(
+            repository_ctx,
+            name = "rccl-include",
+            src_dir = rocm_toolkit_path + "/rccl/include",
+            out_dir = "rocm/include/rccl",
+        ),
    ]

    rocm_libs = _find_libs(repository_ctx, rocm_config)
@ -731,11 +747,13 @@ def _create_local_rocm_repository(repository_ctx):
            "%{rocfft_lib}": rocm_libs["rocfft"].file_name,
            "%{hiprand_lib}": rocm_libs["hiprand"].file_name,
            "%{miopen_lib}": rocm_libs["miopen"].file_name,
+            "%{rccl_lib}": rocm_libs["rccl"].file_name,
            "%{copy_rules}": "\n".join(copy_rules),
            "%{rocm_headers}": ('":rocm-include",\n' +
                                '":rocfft-include",\n' +
                                '":rocblas-include",\n' +
-                                '":miopen-include",'),
+                                '":miopen-include",\n' +
+                                '":rccl-include",'),
        },
    )