Enable Remote Config for ROCM and CUDA RBE pre- and postsubmits

Previously TF_CUDA_CONFIG_REPO would point to a pregenerated and checked-in configuration. This change has it point to a remote repository instead, which generates the configuration during the build for the specific docker image. All supported configurations can be found in third_party/toolchains/remote_config/configs.bzl. Each tensorflow_rbe_config() macro creates a few remote repositories at which to point the TF_*_CONFIG_REPO environment variables. The remote repository names are prefixed with the macro's name. For example, tensorflow_rbe_config(name = "ubuntu") will create @ubuntu_config_python, @ubuntu_config_cuda, @ubuntu_config_nccl, etc.

This change also introduces remote_platform_configure. All this rule does is create a remote repository with a single platform target for each tensorflow_rbe_config(). This will make the platforms defined in //third_party/toolchains/BUILD obsolete once remote config is fully rolled out.

PiperOrigin-RevId: 296065144
Change-Id: Ia54beeb771b28846444e27a2023f70abbd9f6ad5
This commit is contained in:
Jakob Buchgraber 2020-02-19 14:59:49 -08:00 committed by TensorFlower Gardener
parent 439595440b
commit 09fe958fee
15 changed files with 365 additions and 62 deletions

View File

@ -319,6 +319,10 @@ build:xla --define=with_xla_support=true
# BEGIN TF REMOTE BUILD EXECUTION OPTIONS
# Options when using remote execution
# WARNING: THESE OPTIONS WONT WORK IF YOU DO NOT HAVE PROPER AUTHENTICATION AND PERMISSIONS
# Flag to enable remote config
common --experimental_repo_remote_exec
build:rbe --action_env=BAZEL_DO_NOT_DETECT_CPP_TOOLCHAIN=1
build:rbe --google_default_credentials
build:rbe --bes_backend=buildeventservice.googleapis.com

View File

@ -149,7 +149,9 @@ tensorflow/third_party/py/python_configure.bzl
tensorflow/third_party/pybind11.BUILD
tensorflow/third_party/python_runtime/BUILD
tensorflow/third_party/remote_config/BUILD
tensorflow/third_party/remote_config/BUILD.tpl
tensorflow/third_party/remote_config/common.bzl
tensorflow/third_party/remote_config/remote_platform_configure.bzl
tensorflow/third_party/repo.bzl
tensorflow/third_party/six.BUILD
tensorflow/third_party/snappy.BUILD
@ -280,6 +282,10 @@ tensorflow/third_party/toolchains/remote/BUILD
tensorflow/third_party/toolchains/remote/BUILD.tpl
tensorflow/third_party/toolchains/remote/configure.bzl
tensorflow/third_party/toolchains/remote/execution.bzl.tpl
tensorflow/third_party/toolchains/remote_config/BUILD
tensorflow/third_party/toolchains/remote_config/configs.bzl
tensorflow/third_party/toolchains/remote_config/containers.bzl
tensorflow/third_party/toolchains/remote_config/rbe_config.bzl
tensorflow/third_party/wrapt.BUILD
tensorflow/third_party/zlib.BUILD
tensorflow/tools/ci_build/release/common.sh

View File

@ -50,6 +50,13 @@ function run_build () {
# Get the default test targets for bazel.
source tensorflow/tools/ci_build/build_scripts/PRESUBMIT_BUILD_TARGETS.sh
RBE_CONFIG="@ubuntu16.04-py3-gcc7_manylinux2010-cuda10.0-cudnn7-tensorrt5.1"
TF_CUDA_CONFIG_REPO="${RBE_CONFIG}_config_cuda"
TF_TENSORRT_CONFIG_REPO="${RBE_CONFIG}_config_tensorrt"
TF_PYTHON_CONFIG_REPO="${RBE_CONFIG}_config_python"
TF_NCCL_CONFIG_REPO="${RBE_CONFIG}_config_nccl"
TF_RBE_PLATFORM="${RBE_CONFIG}_config_platform//:platform"
# Run bazel test command. Double test timeouts to avoid flakes.
# //tensorflow/core/platform:setround_test is not supported. See b/64264700
# TODO(klimek): Re-enable tensorrt tests (with different runtime image) once
@ -65,12 +72,14 @@ function run_build () {
--action_env=TF2_BEHAVIOR="${TF2_BEHAVIOR}" \
--action_env=REMOTE_GPU_TESTING=1 \
--action_env=TF_CUDA_COMPUTE_CAPABILITIES="${TF_CUDA_COMPUTE_CAPABILITIES}" \
--action_env=TF_CUDA_CONFIG_REPO=@org_tensorflow//third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7 \
--action_env=TF_CUDA_CONFIG_REPO="${TF_CUDA_CONFIG_REPO}" \
--action_env=TF_CUDA_VERSION=10 \
--action_env=TF_CUDNN_VERSION=7 \
--action_env=TF_NEED_TENSORRT=0 \
--action_env=TF_TENSORRT_CONFIG_REPO="${TF_TENSORRT_CONFIG_REPO}" \
--action_env=TF_NEED_CUDA=1 \
--action_env=TF_PYTHON_CONFIG_REPO=@org_tensorflow//third_party/toolchains/preconfig/ubuntu16.04/py3 \
--action_env=TF_PYTHON_CONFIG_REPO="${TF_PYTHON_CONFIG_REPO}" \
--action_env=TF_NCCL_CONFIG_REPO="${TF_NCCL_CONFIG_REPO}" \
--test_env=LD_LIBRARY_PATH \
--test_tag_filters="${tag_filters}" \
--build_tag_filters="${tag_filters}" \
@ -89,17 +98,17 @@ function run_build () {
--linkopt=-lm \
--distinct_host_configuration=false \
--remote_default_exec_properties=build=${CACHE_SILO_VAL} \
--crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0:toolchain \
--crosstool_top="${TF_CUDA_CONFIG_REPO}//crosstool:toolchain" \
--host_javabase=@bazel_toolchains//configs/ubuntu16_04_clang/1.1:jdk8 \
--javabase=@bazel_toolchains//configs/ubuntu16_04_clang/1.0:jdk8 \
--host_java_toolchain=@bazel_tools//tools/jdk:toolchain_hostjdk8 \
--java_toolchain=@bazel_tools//tools/jdk:toolchain_hostjdk8 \
--extra_toolchains=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0:toolchain-linux-x86_64 \
--extra_execution_platforms=@org_tensorflow//third_party/toolchains:rbe_cuda10.0-cudnn7-ubuntu16.04-manylinux2010 \
--host_platform=@org_tensorflow//third_party/toolchains:rbe_cuda10.0-cudnn7-ubuntu16.04-manylinux2010 \
--extra_toolchains="${TF_CUDA_CONFIG_REPO}//crosstool:toolchain-linux-x86_64" \
--extra_execution_platforms="${TF_RBE_PLATFORM}" \
--host_platform="${TF_RBE_PLATFORM}" \
--local_test_jobs=4 \
--remote_timeout=3600 \
--platforms=@org_tensorflow//third_party/toolchains:rbe_cuda10.0-cudnn7-ubuntu16.04-manylinux2010 \
--platforms="${TF_RBE_PLATFORM}" \
-- \
${DEFAULT_BAZEL_TARGETS} -//tensorflow/lite/...
@ -113,3 +122,4 @@ install_bazelisk
which bazel
run_build

View File

@ -41,6 +41,7 @@ load("//third_party/psimd:workspace.bzl", psimd = "repo")
load("//third_party/pthreadpool:workspace.bzl", pthreadpool = "repo")
load("//third_party/sobol_data:workspace.bzl", sobol_data = "repo")
load("//third_party/vulkan_headers:workspace.bzl", vulkan_headers = "repo")
load("//third_party/toolchains/remote_config:configs.bzl", "initialize_rbe_configs")
def initialize_third_party():
""" Load third party repositories. See above load() statements. """
@ -81,6 +82,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
def tf_repositories(path_prefix = "", tf_repo_name = ""):
"""All external dependencies for TF builds."""
# Loads all external repos to configure RBE builds.
initialize_rbe_configs()
# Note that we check the minimum bazel version in WORKSPACE.
clang6_configure(name = "local_config_clang6")
cc_download_clang_toolchain(name = "local_config_download_clang")

View File

@ -1174,6 +1174,24 @@ def _create_remote_cuda_repository(repository_ctx, remote_config_repo):
{},
)
repository_ctx.template(
"crosstool/BUILD",
config_repo_label(remote_config_repo, "crosstool:BUILD"),
{},
)
repository_ctx.template(
"crosstool/cc_toolchain_config.bzl",
config_repo_label(remote_config_repo, "crosstool:cc_toolchain_config.bzl"),
{},
)
repository_ctx.template(
"crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc",
config_repo_label(remote_config_repo, "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc"),
{},
)
def _cuda_autoconf_impl(repository_ctx):
"""Implementation of the cuda_autoconf repository rule."""
if not enable_cuda(repository_ctx):
@ -1191,29 +1209,38 @@ def _cuda_autoconf_impl(repository_ctx):
else:
_create_local_cuda_repository(repository_ctx)
cuda_configure = repository_rule(
implementation = _cuda_autoconf_impl,
environ = [
_GCC_HOST_COMPILER_PATH,
_GCC_HOST_COMPILER_PREFIX,
_CLANG_CUDA_COMPILER_PATH,
"TF_NEED_CUDA",
"TF_CUDA_CLANG",
_TF_DOWNLOAD_CLANG,
_CUDA_TOOLKIT_PATH,
_CUDNN_INSTALL_PATH,
_TF_CUDA_VERSION,
_TF_CUDNN_VERSION,
_TF_CUDA_COMPUTE_CAPABILITIES,
_TF_CUDA_CONFIG_REPO,
"NVVMIR_LIBRARY_DIR",
_PYTHON_BIN_PATH,
"TMP",
"TMPDIR",
"TF_CUDA_PATHS",
],
_ENVIRONS = [
_GCC_HOST_COMPILER_PATH,
_GCC_HOST_COMPILER_PREFIX,
_CLANG_CUDA_COMPILER_PATH,
"TF_NEED_CUDA",
"TF_CUDA_CLANG",
_TF_DOWNLOAD_CLANG,
_CUDA_TOOLKIT_PATH,
_CUDNN_INSTALL_PATH,
_TF_CUDA_VERSION,
_TF_CUDNN_VERSION,
_TF_CUDA_COMPUTE_CAPABILITIES,
"NVVMIR_LIBRARY_DIR",
_PYTHON_BIN_PATH,
"TMP",
"TMPDIR",
"TF_CUDA_PATHS",
]
remote_cuda_configure = repository_rule(
implementation = _create_local_cuda_repository,
environ = _ENVIRONS,
remotable = True,
attrs = {
"environ": attr.string_dict(),
},
)
cuda_configure = repository_rule(
implementation = _cuda_autoconf_impl,
environ = _ENVIRONS + [_TF_CUDA_CONFIG_REPO],
)
"""Detects and configures the local CUDA toolchain.
Add the following to your WORKSPACE FILE:

View File

@ -811,6 +811,21 @@ def _create_remote_rocm_repository(repository_ctx, remote_config_repo):
config_repo_label(remote_config_repo, "rocm:rocm/rocm_config.h"),
{},
)
repository_ctx.template(
"crosstool/BUILD",
config_repo_label(remote_config_repo, "crosstool:BUILD"),
{},
)
repository_ctx.template(
"crosstool/cc_toolchain_config.bzl",
config_repo_label(remote_config_repo, "crosstool:cc_toolchain_config.bzl"),
{},
)
repository_ctx.template(
"crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc",
config_repo_label(remote_config_repo, "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc"),
{},
)
def _rocm_autoconf_impl(repository_ctx):
"""Implementation of the rocm_autoconf repository rule."""
@ -824,20 +839,29 @@ def _rocm_autoconf_impl(repository_ctx):
else:
_create_local_rocm_repository(repository_ctx)
rocm_configure = repository_rule(
implementation = _rocm_autoconf_impl,
environ = [
_GCC_HOST_COMPILER_PATH,
_GCC_HOST_COMPILER_PREFIX,
"TF_NEED_ROCM",
_ROCM_TOOLKIT_PATH,
_TF_ROCM_VERSION,
_TF_MIOPEN_VERSION,
_TF_ROCM_AMDGPU_TARGETS,
_TF_ROCM_CONFIG_REPO,
],
_ENVIRONS = [
_GCC_HOST_COMPILER_PATH,
_GCC_HOST_COMPILER_PREFIX,
"TF_NEED_ROCM",
_ROCM_TOOLKIT_PATH,
_TF_ROCM_VERSION,
_TF_MIOPEN_VERSION,
_TF_ROCM_AMDGPU_TARGETS,
]
remote_rocm_configure = repository_rule(
implementation = _create_local_rocm_repository,
environ = _ENVIRONS,
remotable = True,
attrs = {
"environ": attr.string_dict(),
},
)
rocm_configure = repository_rule(
implementation = _rocm_autoconf_impl,
environ = _ENVIRONS + [_TF_ROCM_CONFIG_REPO],
)
"""Detects and configures the local ROCm toolchain.
Add the following to your WORKSPACE FILE:

View File

@ -139,17 +139,28 @@ def _nccl_autoconf_impl(repository_ctx):
else:
_create_local_nccl_repository(repository_ctx)
_ENVIRONS = [
_CUDA_TOOLKIT_PATH,
_NCCL_HDR_PATH,
_NCCL_INSTALL_PATH,
_TF_NCCL_VERSION,
_TF_CUDA_COMPUTE_CAPABILITIES,
_TF_NEED_CUDA,
"TF_CUDA_PATHS",
]
remote_nccl_configure = repository_rule(
implementation = _create_local_nccl_repository,
environ = _ENVIRONS,
remotable = True,
attrs = {
"environ": attr.string_dict(),
},
)
nccl_configure = repository_rule(
implementation = _nccl_autoconf_impl,
environ = [
_CUDA_TOOLKIT_PATH,
_NCCL_HDR_PATH,
_NCCL_INSTALL_PATH,
_TF_NCCL_VERSION,
_TF_CUDA_COMPUTE_CAPABILITIES,
_TF_NEED_CUDA,
"TF_CUDA_PATHS",
],
environ = _ENVIRONS,
)
"""Detects and configures the NCCL configuration.

View File

@ -262,14 +262,24 @@ def _python_autoconf_impl(repository_ctx):
else:
_create_local_python_repository(repository_ctx)
_ENVIRONS = [
BAZEL_SH,
PYTHON_BIN_PATH,
PYTHON_LIB_PATH,
]
remote_python_configure = repository_rule(
implementation = _create_local_python_repository,
environ = _ENVIRONS,
remotable = True,
attrs = {
"environ": attr.string_dict(),
},
)
python_configure = repository_rule(
implementation = _python_autoconf_impl,
environ = [
BAZEL_SH,
PYTHON_BIN_PATH,
PYTHON_LIB_PATH,
TF_PYTHON_CONFIG_REPO,
],
environ = _ENVIRONS + [TF_PYTHON_CONFIG_REPO],
)
"""Detects and configures the local Python.

11
third_party/remote_config/BUILD.tpl vendored Normal file
View File

@ -0,0 +1,11 @@
# Template for the BUILD file written by the remote_platform_configure
# repository rule. It declares a single x86_64 Linux platform target.
platform(
    name = "platform",
    constraint_values = [
        "@bazel_tools//platforms:x86_64",
        "@bazel_tools//platforms:linux",
    ],
    exec_properties = {
        # The placeholder value below is substituted with the rule's
        # container_image attribute when the template is expanded.
        "container-image": "%{container_image}",
        "Pool": "default",
    },
)

View File

@ -0,0 +1,17 @@
"""Repository rule to create a platform for a docker image to be used with RBE."""
def _remote_platform_configure_impl(repository_ctx):
    """Writes the repository's BUILD file from the platform template.

    Args:
      repository_ctx: the repository context; its container_image attribute
        is substituted for the container image placeholder in BUILD.tpl.
    """
    build_template = Label("@org_tensorflow//third_party/remote_config:BUILD.tpl")
    substitutions = {"%{container_image}": repository_ctx.attr.container_image}
    repository_ctx.template("BUILD", build_template, substitutions)
# Repository rule that generates a repository containing only a BUILD file
# with one platform target (see BUILD.tpl) for the given docker image.
remote_platform_configure = repository_rule(
    implementation = _remote_platform_configure_impl,
    attrs = {
        # Value written into the platform's "container-image" exec property.
        "container_image": attr.string(mandatory = True),
    },
)

View File

@ -178,15 +178,25 @@ def _tensorrt_configure_impl(repository_ctx):
_create_local_tensorrt_repository(repository_ctx)
_ENVIRONS = [
_TENSORRT_INSTALL_PATH,
_TF_TENSORRT_VERSION,
_TF_NEED_TENSORRT,
"TF_CUDA_PATHS",
]
remote_tensorrt_configure = repository_rule(
implementation = _create_local_tensorrt_repository,
environ = _ENVIRONS,
remotable = True,
attrs = {
"environ": attr.string_dict(),
},
)
tensorrt_configure = repository_rule(
implementation = _tensorrt_configure_impl,
environ = [
_TENSORRT_INSTALL_PATH,
_TF_TENSORRT_VERSION,
_TF_TENSORRT_CONFIG_REPO,
_TF_NEED_TENSORRT,
"TF_CUDA_PATHS",
],
environ = _ENVIRONS + [_TF_TENSORRT_CONFIG_REPO],
)
"""Detects and configures the local CUDA toolchain.

View File

View File

@ -0,0 +1,24 @@
"""Configurations of RBE builds used with remote config."""
load("//third_party/toolchains/remote_config:rbe_config.bzl", "tensorflow_rbe_config")
def initialize_rbe_configs():
    """Declares the remote config repositories for every supported RBE setup.

    Each tensorflow_rbe_config() call below creates a group of external
    repositories whose names are prefixed with the given name.
    """

    # CUDA 10.0 / cuDNN 7 / TensorRT 5.1 configuration.
    tensorflow_rbe_config(
        name = "ubuntu16.04-py3-gcc7_manylinux2010-cuda10.0-cudnn7-tensorrt5.1",
        os = "ubuntu16.04-manylinux2010",
        python_version = "3",
        compiler = "/dt7/usr/bin/gcc",
        compiler_prefix = "/usr/bin",
        cuda_version = "10.0",
        cudnn_version = "7",
        tensorrt_version = "5.1",
        tensorrt_install_path = "/usr",
    )

    # ROCm configuration.
    tensorflow_rbe_config(
        name = "ubuntu16.04-py3_opt-gcc5-rocm",
        os = "ubuntu16.04",
        python_version = "3",
        compiler = "gcc",
        rocm_version = "2.5",  # Any version will do.
    )

View File

@ -0,0 +1,20 @@
"""Docker images used with remote config and RBE."""
load("//third_party/toolchains/preconfig/generate:containers.bzl", "container_digests")
# Docker images usable with remote config, keyed by container name.
# Each entry gives the registry, the repository within that registry and the
# image digest; digests come from container_digests in
# //third_party/toolchains/preconfig/generate:containers.bzl.
containers = {
    # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu16.04-manylinux2010.
    "cuda10.0-cudnn7-ubuntu16.04-manylinux2010": {
        "registry": "gcr.io",
        "repository": "tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu16.04-manylinux2010",
        "digest": container_digests["cuda10.0-cudnn7-ubuntu16.04-manylinux2010"],
    },
    # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu16.04
    "rocm-ubuntu16.04": {
        "registry": "gcr.io",
        "repository": "tensorflow-testing/nosla-rocm-ubuntu16.04",
        "digest": container_digests["rocm-ubuntu16.04"],
    },
}

View File

@ -0,0 +1,125 @@
"""Macro that creates external repositories for remote config."""
load("//third_party/py:python_configure.bzl", "remote_python_configure")
load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure")
load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure")
load("//third_party/gpus:rocm_configure.bzl", "remote_rocm_configure")
load("//third_party/tensorrt:tensorrt_configure.bzl", "remote_tensorrt_configure")
load("//third_party/toolchains/remote_config:containers.bzl", "containers")
load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure")
def _container_image_uri(container_name):
    """Returns the docker:// URI of the image registered under container_name.

    Args:
      container_name: key into the `containers` dict.

    Returns:
      A string of the form "docker://<registry>/<repository>@<digest>".
    """
    spec = containers[container_name]
    return "docker://{}/{}@{}".format(
        spec["registry"],
        spec["repository"],
        spec["digest"],
    )
def _tensorflow_rbe_config(name, compiler, python_version, os, rocm_version = None, cuda_version = None, cudnn_version = None, tensorrt_version = None, tensorrt_install_path = None, cudnn_install_path = None, compiler_prefix = None, sysroot = None):
    """Creates the remote config repositories for one RBE docker image.

    Always declares @<name>_config_platform and @<name>_config_python. For a
    CUDA configuration it additionally declares @<name>_config_cuda,
    @<name>_config_nccl and @<name>_config_tensorrt; for a ROCm configuration
    it declares @<name>_config_rocm.

    Args:
      name: prefix of all generated repository names.
      compiler: compiler path or name; exported as CC, BAZEL_COMPILER,
        HOST_C_COMPILER and HOST_CXX_COMPILER. A value ending in "clang"
        selects CUDA clang; anything else is used as the GCC host compiler.
      python_version: suffix appended to "/usr/bin/python" to form
        PYTHON_BIN_PATH.
      os: OS part of the container name looked up in `containers`.
      rocm_version: selects the ROCm configuration when set. Only its presence
        is checked; the value itself is not exported.
      cuda_version: selects the CUDA configuration when set; exported as
        TF_CUDA_VERSION and used in the container name.
      cudnn_version: exported as TF_CUDNN_VERSION and used in the container
        name. Required whenever cuda_version is set.
      tensorrt_version: exported as TF_TENSORRT_VERSION. When None,
        TF_NEED_TENSORRT is set to "0" and the version is exported as "".
      tensorrt_install_path: exported as TENSORRT_INSTALL_PATH; defaults to
        "/usr/lib/x86_64-linux-gnu".
      cudnn_install_path: exported as CUDNN_INSTALL_PATH; defaults to
        "/usr/lib/x86_64-linux-gnu".
      compiler_prefix: exported as GCC_HOST_COMPILER_PREFIX; defaults to
        "/usr/bin".
      sysroot: exported as TF_SYSROOT; defaults to "".
    """
    if cuda_version == None and rocm_version == None:
        fail("Neither cuda_version nor rocm_version specified. You need to specify exactly one.")
    if cuda_version != None and rocm_version != None:
        fail("Specifying both cuda_version and rocm_version is not supported.")

    # Without this check a missing cudnn_version would put None into the
    # string-valued environment and produce a "cudnnNone" container name.
    if cuda_version != None and cudnn_version == None:
        fail("cudnn_version must be specified together with cuda_version.")

    env = {
        "ABI_VERSION": "gcc",
        "ABI_LIBC_VERSION": "glibc_2.19",
        "BAZEL_COMPILER": compiler,
        "BAZEL_HOST_SYSTEM": "i686-unknown-linux-gnu",
        "BAZEL_TARGET_LIBC": "glibc_2.19",
        "BAZEL_TARGET_CPU": "k8",
        "BAZEL_TARGET_SYSTEM": "x86_64-unknown-linux-gnu",
        "CC_TOOLCHAIN_NAME": "linux_gnu_x86",
        "CC": compiler,
        "PYTHON_BIN_PATH": "/usr/bin/python%s" % python_version,
        "CLEAR_CACHE": "1",
        "HOST_CXX_COMPILER": compiler,
        "HOST_C_COMPILER": compiler,
    }

    is_cuda = cuda_version != None
    if is_cuda:
        # The cuda toolchain currently contains its own C++ toolchain definition,
        # so we do not fetch local_config_cc.
        uses_clang = compiler.endswith("clang")
        env.update({
            "TF_NEED_CUDA": "1",
            "TF_CUDA_CLANG": "1" if uses_clang else "0",
            "TF_CUDA_COMPUTE_CAPABILITIES": "3.0,6.0",
            "TF_ENABLE_XLA": "1",
            "TF_CUDNN_VERSION": cudnn_version,
            "TF_CUDA_VERSION": cuda_version,
            "CUDNN_INSTALL_PATH": cudnn_install_path if cudnn_install_path != None else "/usr/lib/x86_64-linux-gnu",
            # tensorrt_version is optional: every value in this dict must be a
            # string, so never store None, and only request TensorRT when a
            # version was actually given.
            "TF_NEED_TENSORRT": "1" if tensorrt_version != None else "0",
            "TF_TENSORRT_VERSION": tensorrt_version if tensorrt_version != None else "",
            "TENSORRT_INSTALL_PATH": tensorrt_install_path if tensorrt_install_path != None else "/usr/lib/x86_64-linux-gnu",
            "GCC_HOST_COMPILER_PATH": compiler if not uses_clang else "",
            "GCC_HOST_COMPILER_PREFIX": compiler_prefix if compiler_prefix != None else "/usr/bin",
            "CLANG_CUDA_COMPILER_PATH": compiler if uses_clang else "",
            "TF_SYSROOT": sysroot if sysroot else "",
        })
        container_name = "cuda%s-cudnn%s-%s" % (cuda_version, cudnn_version, os)
    else:
        # The rocm toolchain currently contains its own C++ toolchain definition,
        # so we do not fetch local_config_cc.
        env.update({
            "TF_NEED_ROCM": "1",
            "TF_ENABLE_XLA": "0",
        })
        container_name = "rocm-%s" % os

    container_image = _container_image_uri(container_name)
    exec_properties = {
        "container-image": container_image,
        "Pool": "default",
    }

    # Repositories common to both the CUDA and the ROCm configuration.
    remote_platform_configure(
        name = "%s_config_platform" % name,
        container_image = container_image,
    )
    remote_python_configure(
        name = "%s_config_python" % name,
        environ = env,
        exec_properties = exec_properties,
    )

    if is_cuda:
        remote_cuda_configure(
            name = "%s_config_cuda" % name,
            environ = env,
            exec_properties = exec_properties,
        )
        remote_nccl_configure(
            name = "%s_config_nccl" % name,
            environ = env,
            exec_properties = exec_properties,
        )
        remote_tensorrt_configure(
            name = "%s_config_tensorrt" % name,
            environ = env,
            exec_properties = exec_properties,
        )
    else:
        remote_rocm_configure(
            name = "%s_config_rocm" % name,
            environ = env,
            exec_properties = exec_properties,
        )

tensorflow_rbe_config = _tensorflow_rbe_config