PR #31485: [ROCm] add ROCm RCCL support
Imported from GitHub PR #31485 Copybara import of the project: - ba5748981bb02b9d0e91114cdc30eb64d1650a46 add ROCm RCCL support by Jeff Daily <jeff.daily@amd.com> - 6f887a19731f030be58495ae4fea98b3ad1f1cc3 run buildifier against tensorflow/core/nccl/BUILD by Jeff Daily <jeff.daily@amd.com> - 55ce583cf484953d90eb9b9310dc77cf63b4c0c9 Merge 6f887a19731f030be58495ae4fea98b3ad1f1cc3 into f9233... by Jeff Daily <jeff.daily@amd.com> PiperOrigin-RevId: 264892468
This commit is contained in:
parent
b84c2a888f
commit
b8f3b8d28b
@ -194,6 +194,17 @@ tf_cc_test(
|
||||
],
|
||||
)
|
||||
|
||||
# virtual targets since nested select statements not possible
|
||||
tf_kernel_library(
|
||||
name = "virtual_nccl",
|
||||
deps = if_cuda(["@local_config_nccl//:nccl"]),
|
||||
)
|
||||
|
||||
tf_kernel_library(
|
||||
name = "virtual_rccl",
|
||||
deps = if_rocm(["@local_config_rocm//rocm:rccl"]),
|
||||
)
|
||||
|
||||
tf_kernel_library(
|
||||
name = "collective_ops",
|
||||
srcs = if_nccl([
|
||||
@ -213,7 +224,8 @@ tf_kernel_library(
|
||||
"//tensorflow/core:protos_all_cc",
|
||||
"//tensorflow/core/profiler/lib:traceme",
|
||||
] + if_nccl([
|
||||
"@local_config_nccl//:nccl",
|
||||
":virtual_nccl",
|
||||
":virtual_rccl",
|
||||
"//tensorflow/core/nccl:nccl_lib",
|
||||
]),
|
||||
)
|
||||
@ -382,11 +394,14 @@ cc_library(
|
||||
|
||||
tf_kernel_library(
|
||||
name = "nccl_kernels",
|
||||
srcs = if_cuda([
|
||||
srcs = if_cuda_or_rocm([
|
||||
"nccl_ops.cc",
|
||||
]),
|
||||
deps = if_cuda([
|
||||
"@local_config_nccl//:nccl",
|
||||
]) + if_rocm([
|
||||
"@local_config_rocm//rocm:rccl",
|
||||
]) + if_cuda_or_rocm([
|
||||
"//tensorflow/core/nccl:nccl_lib",
|
||||
"//tensorflow/core:framework",
|
||||
"//tensorflow/core:gpu_headers_lib",
|
||||
|
@ -14,7 +14,7 @@ limitations under the License.
|
||||
==============================================================================*/
|
||||
#include "tensorflow/core/kernels/collective_nccl.h"
|
||||
|
||||
#ifdef GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#include "tensorflow/core/common_runtime/collective_util.h"
|
||||
#include "tensorflow/core/nccl/nccl_manager.h"
|
||||
@ -79,4 +79,4 @@ const string NcclBase::NcclCollectiveKey(const string& exec_key, int step_id) {
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -18,7 +18,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/framework/collective.h"
|
||||
|
||||
namespace tensorflow {
|
||||
#ifdef GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
class NcclBase : public CollectiveImplementationInterface {
|
||||
public:
|
||||
@ -44,7 +44,7 @@ class NcclBase : public CollectiveImplementationInterface {
|
||||
const CollectiveParams* col_params_; // Not owned
|
||||
};
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_H_
|
||||
|
@ -14,7 +14,7 @@ limitations under the License.
|
||||
==============================================================================*/
|
||||
#include "tensorflow/core/kernels/collective_nccl_broadcaster.h"
|
||||
|
||||
#ifdef GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#include "tensorflow/core/common_runtime/collective_util.h"
|
||||
#include "tensorflow/core/nccl/nccl_manager.h"
|
||||
@ -80,4 +80,4 @@ REGISTER_COLLECTIVE(NcclBroadcast, NcclBroadcaster);
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -18,7 +18,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/kernels/collective_nccl.h"
|
||||
|
||||
namespace tensorflow {
|
||||
#ifdef GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
class NcclBroadcaster : public NcclBase {
|
||||
public:
|
||||
@ -29,7 +29,7 @@ class NcclBroadcaster : public NcclBase {
|
||||
void Run(StatusCallback done) override;
|
||||
};
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_BROADCASTER_H_
|
||||
|
@ -14,7 +14,7 @@ limitations under the License.
|
||||
==============================================================================*/
|
||||
#include "tensorflow/core/kernels/collective_nccl_gatherer.h"
|
||||
|
||||
#ifdef GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#include "tensorflow/core/common_runtime/collective_util.h"
|
||||
#include "tensorflow/core/nccl/nccl_manager.h"
|
||||
@ -70,4 +70,4 @@ REGISTER_COLLECTIVE(NcclGather, NcclGatherer);
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -18,7 +18,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/kernels/collective_nccl.h"
|
||||
|
||||
namespace tensorflow {
|
||||
#ifdef GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
class NcclGatherer : public NcclBase {
|
||||
public:
|
||||
@ -29,7 +29,7 @@ class NcclGatherer : public NcclBase {
|
||||
void Run(StatusCallback done) override;
|
||||
};
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_GATHERER_H_
|
||||
|
@ -14,7 +14,7 @@ limitations under the License.
|
||||
==============================================================================*/
|
||||
#include "tensorflow/core/kernels/collective_nccl_reducer.h"
|
||||
|
||||
#ifdef GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#include "tensorflow/core/common_runtime/collective_util.h"
|
||||
#include "tensorflow/core/nccl/nccl_manager.h"
|
||||
@ -191,4 +191,4 @@ REGISTER_COLLECTIVE(NcclReduce, NcclReducer);
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -18,7 +18,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/kernels/collective_nccl.h"
|
||||
|
||||
namespace tensorflow {
|
||||
#ifdef GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
class NcclReducer : public NcclBase {
|
||||
public:
|
||||
@ -29,7 +29,7 @@ class NcclReducer : public NcclBase {
|
||||
void Run(StatusCallback done) override;
|
||||
};
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_REDUCER_H_
|
||||
|
@ -13,11 +13,15 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#include <vector>
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#include "third_party/nccl/nccl.h"
|
||||
#elif TENSORFLOW_USE_ROCM
|
||||
#include "rocm/include/rccl/rccl.h"
|
||||
#endif
|
||||
#include "tensorflow/core/framework/op_kernel.h"
|
||||
#include "tensorflow/core/nccl/nccl_manager.h"
|
||||
|
||||
@ -276,4 +280,4 @@ REGISTER_KERNEL_BUILDER(Name("NcclReduce").Device(DEVICE_GPU), NcclStubKernel);
|
||||
} // namespace
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -5,6 +5,8 @@
|
||||
load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
|
||||
load("//tensorflow:tensorflow.bzl", "tf_copts")
|
||||
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
|
||||
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
|
||||
load("//tensorflow:tensorflow.bzl", "if_cuda_or_rocm")
|
||||
load(
|
||||
"//tensorflow/core/platform:default/build_config_root.bzl",
|
||||
"tf_cuda_tests_tags",
|
||||
@ -19,18 +21,21 @@ exports_files(["LICENSE"])
|
||||
|
||||
cc_library(
|
||||
name = "nccl_lib",
|
||||
srcs = if_cuda([
|
||||
srcs = if_cuda_or_rocm([
|
||||
"nccl_manager.cc",
|
||||
"nccl_rewrite.cc",
|
||||
]),
|
||||
hdrs = if_cuda([
|
||||
hdrs = if_cuda_or_rocm([
|
||||
"nccl_manager.h",
|
||||
]),
|
||||
copts = tf_copts(),
|
||||
deps = if_cuda([
|
||||
"@com_google_absl//absl/container:flat_hash_map",
|
||||
"@com_google_absl//absl/memory",
|
||||
"@local_config_nccl//:nccl",
|
||||
]) + if_rocm([
|
||||
"@local_config_rocm//rocm:rccl",
|
||||
]) + if_cuda_or_rocm([
|
||||
"@com_google_absl//absl/container:flat_hash_map",
|
||||
"//tensorflow/core:core_cpu",
|
||||
"//tensorflow/core:framework",
|
||||
"//tensorflow/core:gpu_headers_lib",
|
||||
@ -51,9 +56,13 @@ tf_cuda_cc_test(
|
||||
"//tensorflow/core:test",
|
||||
"//tensorflow/core:test_main",
|
||||
"//tensorflow/core:testlib",
|
||||
] + if_cuda([
|
||||
] + if_cuda_or_rocm([
|
||||
":nccl_lib",
|
||||
]) + if_cuda([
|
||||
"@local_config_nccl//:nccl",
|
||||
"//tensorflow/core:cuda",
|
||||
]) + if_rocm([
|
||||
"@local_config_rocm//rocm:rccl",
|
||||
"//tensorflow/core:rocm",
|
||||
]),
|
||||
)
|
||||
|
@ -16,15 +16,32 @@ limitations under the License.
|
||||
|
||||
#include <utility>
|
||||
|
||||
#ifdef GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#include "tensorflow/core/lib/core/refcount.h"
|
||||
#include "tensorflow/core/lib/core/threadpool.h"
|
||||
#include "tensorflow/core/platform/cuda.h"
|
||||
#include "tensorflow/core/platform/env.h"
|
||||
#if GOOGLE_CUDA
|
||||
#include "tensorflow/core/platform/cuda.h"
|
||||
#elif TENSORFLOW_USE_ROCM
|
||||
#include "tensorflow/core/platform/rocm.h"
|
||||
#endif
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
using se::cuda::ScopedActivateExecutorContext;
|
||||
#elif TENSORFLOW_USE_ROCM
|
||||
using se::rocm::ScopedActivateExecutorContext;
|
||||
// Local hipify of cuda symbols
|
||||
#define cudaError_t hipError_t
|
||||
#define cudaStream_t hipStream_t
|
||||
#define cudaGetErrorString hipGetErrorString
|
||||
#define cudaGetDevice hipGetDevice
|
||||
#define cudaSetDevice hipSetDevice
|
||||
#define cudaSuccess hipSuccess
|
||||
#endif
|
||||
|
||||
#define NCCL_RETURN_IF_ERROR(...) \
|
||||
do { \
|
||||
ncclResult_t nccl_status = (__VA_ARGS__); \
|
||||
@ -41,8 +58,6 @@ namespace tensorflow {
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
using se::cuda::ScopedActivateExecutorContext;
|
||||
|
||||
// Contains data for a single stream used for nccl communication; this includes
|
||||
// a background thread that calls NcclManager::LoopKernelLaunches.
|
||||
struct NcclManager::NcclStream : public core::RefCounted {
|
||||
@ -709,4 +724,4 @@ void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) {
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -15,7 +15,7 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_
|
||||
#define TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_
|
||||
|
||||
#ifdef GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#include <vector>
|
||||
|
||||
@ -28,7 +28,11 @@ limitations under the License.
|
||||
|
||||
#include "absl/container/flat_hash_map.h"
|
||||
#include "absl/memory/memory.h"
|
||||
#if GOOGLE_CUDA
|
||||
#include "third_party/nccl/nccl.h"
|
||||
#elif TENSORFLOW_USE_ROCM
|
||||
#include "rocm/include/rccl/rccl.h"
|
||||
#endif
|
||||
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
|
||||
#include "tensorflow/core/framework/tensor.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
@ -245,6 +249,6 @@ class NcclManager {
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#endif // TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifdef GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#include "tensorflow/core/nccl/nccl_manager.h"
|
||||
|
||||
@ -802,4 +802,4 @@ TYPED_TEST(NcclManagerTest, BroadcastInconsistentSource) {
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
12
third_party/gpus/rocm/BUILD.tpl
vendored
12
third_party/gpus/rocm/BUILD.tpl
vendored
@ -84,6 +84,18 @@ cc_library(
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "rccl",
|
||||
srcs = ["rocm/lib/%{rccl_lib}"],
|
||||
data = ["rocm/lib/%{rccl_lib}"],
|
||||
includes = [
|
||||
".",
|
||||
"rocm/include",
|
||||
],
|
||||
linkstatic = 1,
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "rocm",
|
||||
visibility = ["//visibility:public"],
|
||||
|
20
third_party/gpus/rocm_configure.bzl
vendored
20
third_party/gpus/rocm_configure.bzl
vendored
@ -201,6 +201,9 @@ def _rocm_include_path(repository_ctx, rocm_config):
|
||||
# Add MIOpen headers
|
||||
inc_dirs.append("/opt/rocm/miopen/include")
|
||||
|
||||
# Add RCCL headers
|
||||
inc_dirs.append("/opt/rocm/rccl/include")
|
||||
|
||||
# Add hcc headers
|
||||
inc_dirs.append("/opt/rocm/hcc/include")
|
||||
inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/7.0.0/include/")
|
||||
@ -472,6 +475,12 @@ def _find_libs(repository_ctx, rocm_config):
|
||||
cpu_value,
|
||||
rocm_config.rocm_toolkit_path + "/miopen",
|
||||
),
|
||||
"rccl": _find_rocm_lib(
|
||||
"rccl",
|
||||
repository_ctx,
|
||||
cpu_value,
|
||||
rocm_config.rocm_toolkit_path + "/rccl",
|
||||
),
|
||||
}
|
||||
|
||||
def _get_rocm_config(repository_ctx):
|
||||
@ -554,6 +563,7 @@ def _create_dummy_repository(repository_ctx):
|
||||
"%{hip_lib}": _lib_name("hip", cpu_value),
|
||||
"%{rocblas_lib}": _lib_name("rocblas", cpu_value),
|
||||
"%{miopen_lib}": _lib_name("miopen", cpu_value),
|
||||
"%{rccl_lib}": _lib_name("rccl", cpu_value),
|
||||
"%{rocfft_lib}": _lib_name("rocfft", cpu_value),
|
||||
"%{hiprand_lib}": _lib_name("hiprand", cpu_value),
|
||||
"%{copy_rules}": "",
|
||||
@ -695,6 +705,12 @@ def _create_local_rocm_repository(repository_ctx):
|
||||
src_dir = rocm_toolkit_path + "/miopen/include",
|
||||
out_dir = "rocm/include/miopen",
|
||||
),
|
||||
make_copy_dir_rule(
|
||||
repository_ctx,
|
||||
name = "rccl-include",
|
||||
src_dir = rocm_toolkit_path + "/rccl/include",
|
||||
out_dir = "rocm/include/rccl",
|
||||
),
|
||||
]
|
||||
|
||||
rocm_libs = _find_libs(repository_ctx, rocm_config)
|
||||
@ -731,11 +747,13 @@ def _create_local_rocm_repository(repository_ctx):
|
||||
"%{rocfft_lib}": rocm_libs["rocfft"].file_name,
|
||||
"%{hiprand_lib}": rocm_libs["hiprand"].file_name,
|
||||
"%{miopen_lib}": rocm_libs["miopen"].file_name,
|
||||
"%{rccl_lib}": rocm_libs["rccl"].file_name,
|
||||
"%{copy_rules}": "\n".join(copy_rules),
|
||||
"%{rocm_headers}": ('":rocm-include",\n' +
|
||||
'":rocfft-include",\n' +
|
||||
'":rocblas-include",\n' +
|
||||
'":miopen-include",'),
|
||||
'":miopen-include",\n' +
|
||||
'":rccl-include",'),
|
||||
},
|
||||
)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user