PR #31485: [ROCm] add ROCm RCCL support

Imported from GitHub PR #31485

Copybara import of the project:

  - ba5748981bb02b9d0e91114cdc30eb64d1650a46 add ROCm RCCL support by Jeff Daily <jeff.daily@amd.com>
  - 6f887a19731f030be58495ae4fea98b3ad1f1cc3 run buildifier against tensorflow/core/nccl/BUILD by Jeff Daily <jeff.daily@amd.com>
  - 55ce583cf484953d90eb9b9310dc77cf63b4c0c9 Merge 6f887a19731f030be58495ae4fea98b3ad1f1cc3 into f9233... by Jeff Daily <jeff.daily@amd.com>

PiperOrigin-RevId: 264892468
This commit is contained in:
Christian Sigg 2019-08-22 12:22:14 -07:00 committed by TensorFlower Gardener
parent b84c2a888f
commit b8f3b8d28b
16 changed files with 111 additions and 34 deletions

View File

@@ -194,6 +194,17 @@ tf_cc_test(
],
)
# virtual targets since nested select statements not possible
tf_kernel_library(
name = "virtual_nccl",
deps = if_cuda(["@local_config_nccl//:nccl"]),
)
tf_kernel_library(
name = "virtual_rccl",
deps = if_rocm(["@local_config_rocm//rocm:rccl"]),
)
tf_kernel_library(
name = "collective_ops",
srcs = if_nccl([
@@ -213,7 +224,8 @@ tf_kernel_library(
"//tensorflow/core:protos_all_cc",
"//tensorflow/core/profiler/lib:traceme",
] + if_nccl([
"@local_config_nccl//:nccl",
":virtual_nccl",
":virtual_rccl",
"//tensorflow/core/nccl:nccl_lib",
]),
)
@@ -382,11 +394,14 @@ cc_library(
tf_kernel_library(
name = "nccl_kernels",
srcs = if_cuda([
srcs = if_cuda_or_rocm([
"nccl_ops.cc",
]),
deps = if_cuda([
"@local_config_nccl//:nccl",
]) + if_rocm([
"@local_config_rocm//rocm:rccl",
]) + if_cuda_or_rocm([
"//tensorflow/core/nccl:nccl_lib",
"//tensorflow/core:framework",
"//tensorflow/core:gpu_headers_lib",

View File

@@ -14,7 +14,7 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/core/kernels/collective_nccl.h"
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/collective_util.h"
#include "tensorflow/core/nccl/nccl_manager.h"
@@ -79,4 +79,4 @@ const string NcclBase::NcclCollectiveKey(const string& exec_key, int step_id) {
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/core/framework/collective.h"
namespace tensorflow {
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
class NcclBase : public CollectiveImplementationInterface {
public:
@@ -44,7 +44,7 @@ class NcclBase : public CollectiveImplementationInterface {
const CollectiveParams* col_params_; // Not owned
};
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow
#endif // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_H_

View File

@@ -14,7 +14,7 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/core/kernels/collective_nccl_broadcaster.h"
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/collective_util.h"
#include "tensorflow/core/nccl/nccl_manager.h"
@@ -80,4 +80,4 @@ REGISTER_COLLECTIVE(NcclBroadcast, NcclBroadcaster);
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/core/kernels/collective_nccl.h"
namespace tensorflow {
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
class NcclBroadcaster : public NcclBase {
public:
@@ -29,7 +29,7 @@ class NcclBroadcaster : public NcclBase {
void Run(StatusCallback done) override;
};
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow
#endif // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_BROADCASTER_H_

View File

@@ -14,7 +14,7 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/core/kernels/collective_nccl_gatherer.h"
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/collective_util.h"
#include "tensorflow/core/nccl/nccl_manager.h"
@@ -70,4 +70,4 @@ REGISTER_COLLECTIVE(NcclGather, NcclGatherer);
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/core/kernels/collective_nccl.h"
namespace tensorflow {
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
class NcclGatherer : public NcclBase {
public:
@@ -29,7 +29,7 @@ class NcclGatherer : public NcclBase {
void Run(StatusCallback done) override;
};
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow
#endif // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_GATHERER_H_

View File

@@ -14,7 +14,7 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/core/kernels/collective_nccl_reducer.h"
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/collective_util.h"
#include "tensorflow/core/nccl/nccl_manager.h"
@@ -191,4 +191,4 @@ REGISTER_COLLECTIVE(NcclReduce, NcclReducer);
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/core/kernels/collective_nccl.h"
namespace tensorflow {
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
class NcclReducer : public NcclBase {
public:
@@ -29,7 +29,7 @@ class NcclReducer : public NcclBase {
void Run(StatusCallback done) override;
};
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow
#endif // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_REDUCER_H_

View File

@@ -13,11 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include <vector>
#if GOOGLE_CUDA
#include "third_party/nccl/nccl.h"
#elif TENSORFLOW_USE_ROCM
#include "rocm/include/rccl/rccl.h"
#endif
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/nccl/nccl_manager.h"
@@ -276,4 +280,4 @@ REGISTER_KERNEL_BUILDER(Name("NcclReduce").Device(DEVICE_GPU), NcclStubKernel);
} // namespace
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@@ -5,6 +5,8 @@
load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
load("//tensorflow:tensorflow.bzl", "tf_copts")
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
load("//tensorflow:tensorflow.bzl", "if_cuda_or_rocm")
load(
"//tensorflow/core/platform:default/build_config_root.bzl",
"tf_cuda_tests_tags",
@@ -19,18 +21,21 @@ exports_files(["LICENSE"])
cc_library(
name = "nccl_lib",
srcs = if_cuda([
srcs = if_cuda_or_rocm([
"nccl_manager.cc",
"nccl_rewrite.cc",
]),
hdrs = if_cuda([
hdrs = if_cuda_or_rocm([
"nccl_manager.h",
]),
copts = tf_copts(),
deps = if_cuda([
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/memory",
"@local_config_nccl//:nccl",
]) + if_rocm([
"@local_config_rocm//rocm:rccl",
]) + if_cuda_or_rocm([
"@com_google_absl//absl/container:flat_hash_map",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:gpu_headers_lib",
@@ -51,9 +56,13 @@ tf_cuda_cc_test(
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
] + if_cuda([
] + if_cuda_or_rocm([
":nccl_lib",
]) + if_cuda([
"@local_config_nccl//:nccl",
"//tensorflow/core:cuda",
]) + if_rocm([
"@local_config_rocm//rocm:rccl",
"//tensorflow/core:rocm",
]),
)

View File

@@ -16,15 +16,32 @@ limitations under the License.
#include <utility>
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/lib/core/refcount.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/platform/cuda.h"
#include "tensorflow/core/platform/env.h"
#if GOOGLE_CUDA
#include "tensorflow/core/platform/cuda.h"
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/rocm.h"
#endif
namespace tensorflow {
#if GOOGLE_CUDA
using se::cuda::ScopedActivateExecutorContext;
#elif TENSORFLOW_USE_ROCM
using se::rocm::ScopedActivateExecutorContext;
// Local hipify of cuda symbols
#define cudaError_t hipError_t
#define cudaStream_t hipStream_t
#define cudaGetErrorString hipGetErrorString
#define cudaGetDevice hipGetDevice
#define cudaSetDevice hipSetDevice
#define cudaSuccess hipSuccess
#endif
#define NCCL_RETURN_IF_ERROR(...) \
do { \
ncclResult_t nccl_status = (__VA_ARGS__); \
@@ -41,8 +58,6 @@ namespace tensorflow {
} \
} while (0)
using se::cuda::ScopedActivateExecutorContext;
// Contains data for a single stream used for nccl communication; this includes
// a background thread that calls NcclManager::LoopKernelLaunches.
struct NcclManager::NcclStream : public core::RefCounted {
@@ -709,4 +724,4 @@ void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) {
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@@ -15,7 +15,7 @@ limitations under the License.
#ifndef TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_
#define TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include <vector>
@@ -28,7 +28,7 @@ limitations under the License.
#include "absl/container/flat_hash_map.h"
#include "absl/memory/memory.h"
#if GOOGLE_CUDA
#include "third_party/nccl/nccl.h"
#elif TENSORFLOW_USE_ROCM
#include "rocm/include/rccl/rccl.h"
#endif
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/platform/mutex.h"
@@ -245,6 +249,6 @@ class NcclManager {
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#endif // TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_

View File

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/nccl/nccl_manager.h"
@@ -802,4 +802,4 @@ TYPED_TEST(NcclManagerTest, BroadcastInconsistentSource) {
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@@ -84,6 +84,18 @@ cc_library(
visibility = ["//visibility:public"],
)
cc_library(
name = "rccl",
srcs = ["rocm/lib/%{rccl_lib}"],
data = ["rocm/lib/%{rccl_lib}"],
includes = [
".",
"rocm/include",
],
linkstatic = 1,
visibility = ["//visibility:public"],
)
cc_library(
name = "rocm",
visibility = ["//visibility:public"],

View File

@@ -201,6 +201,9 @@ def _rocm_include_path(repository_ctx, rocm_config):
# Add MIOpen headers
inc_dirs.append("/opt/rocm/miopen/include")
# Add RCCL headers
inc_dirs.append("/opt/rocm/rccl/include")
# Add hcc headers
inc_dirs.append("/opt/rocm/hcc/include")
inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/7.0.0/include/")
@@ -472,6 +475,12 @@ def _find_libs(repository_ctx, rocm_config):
cpu_value,
rocm_config.rocm_toolkit_path + "/miopen",
),
"rccl": _find_rocm_lib(
"rccl",
repository_ctx,
cpu_value,
rocm_config.rocm_toolkit_path + "/rccl",
),
}
def _get_rocm_config(repository_ctx):
@@ -554,6 +563,7 @@ def _create_dummy_repository(repository_ctx):
"%{hip_lib}": _lib_name("hip", cpu_value),
"%{rocblas_lib}": _lib_name("rocblas", cpu_value),
"%{miopen_lib}": _lib_name("miopen", cpu_value),
"%{rccl_lib}": _lib_name("rccl", cpu_value),
"%{rocfft_lib}": _lib_name("rocfft", cpu_value),
"%{hiprand_lib}": _lib_name("hiprand", cpu_value),
"%{copy_rules}": "",
@@ -695,6 +705,12 @@ def _create_local_rocm_repository(repository_ctx):
src_dir = rocm_toolkit_path + "/miopen/include",
out_dir = "rocm/include/miopen",
),
make_copy_dir_rule(
repository_ctx,
name = "rccl-include",
src_dir = rocm_toolkit_path + "/rccl/include",
out_dir = "rocm/include/rccl",
),
]
rocm_libs = _find_libs(repository_ctx, rocm_config)
@@ -731,11 +747,13 @@ def _create_local_rocm_repository(repository_ctx):
"%{rocfft_lib}": rocm_libs["rocfft"].file_name,
"%{hiprand_lib}": rocm_libs["hiprand"].file_name,
"%{miopen_lib}": rocm_libs["miopen"].file_name,
"%{rccl_lib}": rocm_libs["rccl"].file_name,
"%{copy_rules}": "\n".join(copy_rules),
"%{rocm_headers}": ('":rocm-include",\n' +
'":rocfft-include",\n' +
'":rocblas-include",\n' +
'":miopen-include",'),
'":miopen-include",\n' +
'":rccl-include",'),
},
)