From 8d393ea2fab0ea88ecd11e36d89f186cbc884dbe Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 30 Mar 2017 07:38:55 -0800 Subject: [PATCH] Add cuda_clang build configuration that allows to use clang as a CUDA compiler. Change: 151705528 --- configure | 65 +++- tensorflow/workspace.bzl | 56 +++- .../gpus/crosstool/CROSSTOOL_clang.tpl | 292 ++++++++++++++++++ third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl | 249 +++++++++++++++ third_party/gpus/cuda/build_defs.bzl.tpl | 4 +- third_party/gpus/cuda_configure.bzl | 108 +++++-- third_party/nccl/BUILD | 0 third_party/nccl/fix_clang_compilation.patch | 85 +++++ third_party/nccl/nccl.BUILD | 66 ++++ third_party/protobuf/BUILD | 0 third_party/protobuf/add_noinlines.patch | 30 ++ tools/bazel.rc.template | 5 + 12 files changed, 920 insertions(+), 40 deletions(-) create mode 100644 third_party/gpus/crosstool/CROSSTOOL_clang.tpl create mode 100644 third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl create mode 100644 third_party/nccl/BUILD create mode 100644 third_party/nccl/fix_clang_compilation.patch create mode 100644 third_party/nccl/nccl.BUILD create mode 100644 third_party/protobuf/BUILD create mode 100644 third_party/protobuf/add_noinlines.patch diff --git a/configure b/configure index 081db20d753..e59ee2a925b 100755 --- a/configure +++ b/configure @@ -38,7 +38,7 @@ function is_windows() { fi } -function bazel_clean_and_fetch() { +function bazel_fetch() { if [ -z "$TF_BAZEL_TARGETS" ]; then bazel fetch "//tensorflow/... -//tensorflow/contrib/nccl/... -//tensorflow/examples/android/..." else @@ -279,18 +279,40 @@ while [ "$TF_NEED_CUDA" == "" ]; do esac done +sed_hyphen_i -e "/--action_env TF_NEED_CUDA/d" .bazelrc +sed_hyphen_i -e "/--action_env CUD/d" .bazelrc +sed_hyphen_i -e "/--action_env GCC_HOST/d" .bazelrc +sed_hyphen_i -e "/--action_env TF_CUD/d" .bazelrc +sed_hyphen_i -e "/--action_env CLANG_CUDA/d" .bazelrc + export TF_NEED_CUDA +echo "build --action_env TF_NEED_CUDA=$TF_NEED_CUDA" >>.bazelrc + export TF_NEED_OPENCL + if [[ "$TF_NEED_CUDA" == "0" ]] && [[ "$TF_NEED_OPENCL" == "0" ]]; then echo "Configuration finished" - bazel_clean_and_fetch + bazel_fetch exit fi if [ "$TF_NEED_CUDA" == "1" ]; then +while [[ "$TF_CUDA_CLANG" == "" ]]; do + read -p "Do you want to use clang as CUDA compiler? [y/N] " INPUT + case $INPUT in + [Yy]* ) echo "Clang will be used as CUDA compiler"; TF_CUDA_CLANG=1;; + [Nn]* ) echo "nvcc will be used as CUDA compiler"; TF_CUDA_CLANG=0;; + "" ) echo "nvcc will be used as CUDA compiler"; TF_CUDA_CLANG=0;; + * ) echo "Invalid selection: " $INPUT;; + esac +done + +export TF_CUDA_CLANG +echo "build --action_env TF_CUDA_CLANG=$TF_CUDA_CLANG" >>.bazelrc + # Set up which gcc nvcc should use as the host compiler # No need to set this on Windows -while ! is_windows && true; do +while [[ "$TF_CUDA_CLANG" != "1" ]] && ! is_windows && true; do fromuser="" if [ -z "$GCC_HOST_COMPILER_PATH" ]; then default_gcc_host_compiler_path=$(which gcc || true) @@ -302,6 +324,7 @@ while ! is_windows && true; do fi if [ -e "$GCC_HOST_COMPILER_PATH" ]; then export GCC_HOST_COMPILER_PATH + echo "build --action_env GCC_HOST_COMPILER_PATH=\"$GCC_HOST_COMPILER_PATH\"" >>.bazelrc break fi echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2 @@ -312,6 +335,30 @@ while ! is_windows && true; do # Retry done +# Set up which clang we should use as the cuda / host compiler. +while [[ "$TF_CUDA_CLANG" == "1" ]] && true; do + fromuser="" + if [ -z "$CLANG_CUDA_COMPILER_PATH" ]; then + default_clang_host_compiler_path=$(which clang || true) + read -p "Please specify which clang should be used as device and host compiler. [Default is $default_clang_host_compiler_path]: " CLANG_CUDA_COMPILER_PATH + fromuser="1" + if [ -z "$CLANG_CUDA_COMPILER_PATH" ]; then + CLANG_CUDA_COMPILER_PATH="$default_clang_host_compiler_path" + fi + fi + if [ -e "$CLANG_CUDA_COMPILER_PATH" ]; then + export CLANG_CUDA_COMPILER_PATH + echo "build --action_env CLANG_CUDA_COMPILER_PATH=\"$CLANG_CUDA_COMPILER_PATH\"" >>.bazelrc + break + fi + echo "Invalid clang path. ${CLANG_CUDA_COMPILER_PATH} cannot be found" 1>&2 + if [ -z "$fromuser" ]; then + exit 1 + fi + CLANG_CUDA_COMPILER_PATH="" + # Retry +done + # Find out where the CUDA toolkit is installed while true; do # Configure the Cuda SDK version to use. @@ -352,7 +399,10 @@ while true; do if [ -e "${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH}" ]; then export CUDA_TOOLKIT_PATH + echo "build --action_env CUDA_TOOLKIT_PATH=\"$CUDA_TOOLKIT_PATH\"" >>.bazelrc + export TF_CUDA_VERSION + echo "build --action_env TF_CUDA_VERSION=$TF_CUDA_VERSION" >>.bazelrc break fi echo "Invalid path to CUDA $TF_CUDA_VERSION toolkit. ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH} cannot be found" @@ -404,7 +454,10 @@ while true; do if [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_ALT_PATH}" -o -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_PATH}" ]; then export TF_CUDNN_VERSION + echo "build --action_env TF_CUDNN_VERSION=$TF_CUDNN_VERSION" >>.bazelrc + export CUDNN_INSTALL_PATH + echo "build --action_env CUDNN_INSTALL_PATH=\"$CUDNN_INSTALL_PATH\"" >>.bazelrc break fi @@ -417,7 +470,10 @@ while true; do CUDNN_PATH_FROM_LDCONFIG="$($LDCONFIG_BIN -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')" if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then export TF_CUDNN_VERSION + echo "build --action_env TF_CUDNN_VERSION=$TF_CUDNN_VERSION" >>.bazelrc + export CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})" + echo "build --action_env CUDNN_INSTALL_PATH=\"$CUDNN_INSTALL_PATH\"" >>.bazelrc break fi fi @@ -469,6 +525,7 @@ EOF fi else export TF_CUDA_COMPUTE_CAPABILITIES + echo "build --action_env TF_CUDA_COMPUTE_CAPABILITIES=$TF_CUDA_COMPUTE_CAPABILITIES" >>.bazelrc break fi TF_CUDA_COMPUTE_CAPABILITIES="" @@ -572,6 +629,6 @@ done # end of if "$TF_NEED_OPENCL" == "1" fi -bazel_clean_and_fetch +bazel_fetch echo "Configuration finished" diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index a13142fe48a..f8dfd21f846 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -50,17 +50,54 @@ def _temp_workaround_http_archive_impl(repo_ctx): }, False) repo_ctx.download_and_extract(repo_ctx.attr.urls, "", repo_ctx.attr.sha256, "", repo_ctx.attr.strip_prefix) + if repo_ctx.attr.patch_file != None: + _apply_patch(repo_ctx, repo_ctx.attr.patch_file) temp_workaround_http_archive = repository_rule( implementation=_temp_workaround_http_archive_impl, attrs = { "build_file": attr.label(), "repository": attr.string(), + "patch_file": attr.label(default = None), "urls": attr.string_list(default = []), "sha256": attr.string(default = ""), "strip_prefix": attr.string(default = ""), }) +# Executes specified command with arguments and calls 'fail' if it exited with non-zero code +def _execute_and_check_ret_code(repo_ctx, cmd_and_args): + result = repo_ctx.execute(cmd_and_args) + if result.return_code != 0: + fail(("Non-zero return code({1}) when executing '{0}':\n" + + "Stdout: {2}\n" + + "Stderr: {3}").format(" ".join(cmd_and_args), + result.return_code, result.stdout, result.stderr)) + +# Apply a patch_file to the repository root directory +# Runs 'patch -p1' +def _apply_patch(repo_ctx, patch_file): + _execute_and_check_ret_code(repo_ctx, ["patch", "-p1", + "-d", repo_ctx.path("."), + "-i", repo_ctx.path(patch_file)]) + +# Download the repository and apply a patch to its root +def _patched_http_archive_impl(repo_ctx): + repo_ctx.download_and_extract(repo_ctx.attr.urls, + sha256 = repo_ctx.attr.sha256, + stripPrefix = repo_ctx.attr.strip_prefix) + _apply_patch(repo_ctx, repo_ctx.attr.patch_file) + +patched_http_archive = repository_rule( + implementation = _patched_http_archive_impl, + attrs = { + "patch_file": attr.label(), + "build_file": attr.label(), + "repository": attr.string(), + "urls": attr.string_list(default = []), + "sha256": attr.string(default = ""), + "strip_prefix": attr.string(default = ""), + }) + # If TensorFlow is linked as a submodule. # path_prefix and tf_repo_name are no longer used. def tf_workspace(path_prefix = "", tf_repo_name = ""): @@ -78,11 +115,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): native.new_http_archive( name = "eigen_archive", urls = [ - "http://bazel-mirror.storage.googleapis.com/bitbucket.org/eigen/eigen/get/9c6361787292.tar.gz", - "https://bitbucket.org/eigen/eigen/get/9c6361787292.tar.gz", + "http://bazel-mirror.storage.googleapis.com/bitbucket.org/eigen/eigen/get/deff8b280204.tar.gz", + "https://bitbucket.org/eigen/eigen/get/deff8b280204.tar.gz", ], - sha256 = "e6ec2502a5d82dd5df0b9b16e7697f5fccb81c322d0be8e3492969eecb66badd", - strip_prefix = "eigen-eigen-9c6361787292", + sha256 = "a39834683eb5bdb9a7434f0ab3621d2cbc3b07e8002db6de101e45ec536723eb", + strip_prefix = "eigen-eigen-deff8b280204", build_file = str(Label("//third_party:eigen.BUILD")), ) @@ -255,7 +292,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): actual = "@six_archive//:six", ) - native.http_archive( + patched_http_archive( name = "protobuf", urls = [ "http://bazel-mirror.storage.googleapis.com/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz", @@ -263,6 +300,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): ], sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0", strip_prefix = "protobuf-2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a", + # TODO: remove patching when tensorflow stops linking same protos into + # multiple shared libraries loaded in runtime by python. + # This patch fixes a runtime crash when tensorflow is compiled + # with clang -O2 on Linux (see https://github.com/tensorflow/tensorflow/issues/8394) + patch_file = str(Label("//third_party/protobuf:add_noinlines.patch")), ) native.new_http_archive( @@ -452,7 +494,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): ], sha256 = "6787f0eed88d52ee8e32956fa4947d92c139da469f1d8e311c307f27d641118e", strip_prefix = "nccl-024d1e267845f2ed06f3e2e42476d50f04a00ee6", - build_file = str(Label("//third_party:nccl.BUILD")), + build_file = str(Label("//third_party/nccl:nccl.BUILD")), + # TODO: Remove patching after the fix is merged into nccl(see https://github.com/NVIDIA/nccl/pull/78) + patch_file = str(Label("//third_party/nccl:fix_clang_compilation.patch")), repository = tf_repo_name, ) diff --git a/third_party/gpus/crosstool/CROSSTOOL_clang.tpl b/third_party/gpus/crosstool/CROSSTOOL_clang.tpl new file mode 100644 index 00000000000..e4363d60457 --- /dev/null +++ b/third_party/gpus/crosstool/CROSSTOOL_clang.tpl @@ -0,0 +1,292 @@ +major_version: "local" +minor_version: "" +default_target_cpu: "same_as_host" + +default_toolchain { + cpu: "k8" + toolchain_identifier: "local_linux" +} +default_toolchain { + cpu: "piii" + toolchain_identifier: "local_linux" +} +default_toolchain { + cpu: "arm" + toolchain_identifier: "local_linux" +} +default_toolchain { + cpu: "darwin" + toolchain_identifier: "local_darwin" +} +default_toolchain { + cpu: "ppc" + toolchain_identifier: "local_linux" +} + +toolchain { + abi_version: "local" + abi_libc_version: "local" + compiler: "compiler" + host_system_name: "local" + needsPic: true + target_libc: "local" + target_cpu: "local" + target_system_name: "local" + toolchain_identifier: "local_linux" + + feature { + name: "c++11" + flag_set { + action: "c++-compile" + flag_group { + flag: "-std=c++11" + } + } + } + + feature { + name: "stdlib" + flag_set { + action: "c++-link-executable" + action: "c++-link-dynamic-library" + flag_group { + flag: "-lstdc++" + } + } + } + + feature { + name: "determinism" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + # Make C++ compilation deterministic. Use linkstamping instead of these + # compiler symbols. + flag: "-Wno-builtin-macro-redefined" + flag: "-D__DATE__=\"redacted\"" + flag: "-D__TIMESTAMP__=\"redacted\"" + flag: "-D__TIME__=\"redacted\"" + } + } + } + + feature { + name: "alwayslink" + flag_set { + action: "c++-link-dynamic-library" + action: "c++-link-executable" + flag_group { + flag: "-Wl,-no-as-needed" + } + } + } + + # This feature will be enabled for builds that support pic by bazel. + feature { + name: "pic" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + expand_if_all_available: "pic" + flag: "-fPIC" + } + flag_group { + expand_if_none_available: "pic" + flag: "-fPIE" + } + } + } + + # Security hardening on by default. + feature { + name: "hardening" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases. + # We need to undef it before redefining it as some distributions now + # have it enabled by default. + flag: "-U_FORTIFY_SOURCE" + flag: "-D_FORTIFY_SOURCE=1" + flag: "-fstack-protector" + } + } + flag_set { + action: "c++-link-dynamic-library" + flag_group { + flag: "-Wl,-z,relro,-z,now" + } + } + flag_set { + action: "c++-link-executable" + flag_group { + flag: "-pie" + flag: "-Wl,-z,relro,-z,now" + } + } + } + + feature { + name: "warnings" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + # All warnings are enabled. Maybe enable -Werror as well? + flag: "-Wall" + # Some parts of the codebase set -Werror and hit this warning, so + # switch it off for now. + flag: "-Wno-invalid-partial-specialization" + } + } + } + + # Keep stack frames for debugging, even in opt mode. + feature { + name: "frame-pointer" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + flag: "-fno-omit-frame-pointer" + } + } + } + + feature { + name: "build-id" + flag_set { + action: "c++-link-executable" + action: "c++-link-dynamic-library" + flag_group { + # Stamp the binary with a unique identifier. + flag: "-Wl,--build-id=md5" + flag: "-Wl,--hash-style=gnu" + } + } + } + + feature { + name: "no-canonical-prefixes" + flag_set { + action: "c-compile" + action: "c++-compile" + action: "c++-link-executable" + action: "c++-link-dynamic-library" + flag_group { + flag:"-no-canonical-prefixes" + } + } + } + + feature { + name: "disable-assertions" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + flag: "-DNDEBUG" + } + } + } + + feature { + name: "linker-bin-path" + + flag_set { + action: "c++-link-executable" + action: "c++-link-dynamic-library" + flag_group { + flag: "-B/usr/bin/" + } + } + } + + feature { + name: "common" + implies: "stdlib" + implies: "c++11" + implies: "determinism" + implies: "alwayslink" + implies: "hardening" + implies: "warnings" + implies: "frame-pointer" + implies: "build-id" + implies: "no-canonical-prefixes" + implies: "linker-bin-path" + } + + feature { + name: "opt" + implies: "common" + implies: "disable-assertions" + + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + # No debug symbols. + # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt + # or even generally? However, that can't happen here, as it requires + # special handling in Bazel. + flag: "-g0" + + # Conservative choice for -O + # -O3 can increase binary size and even slow down the resulting binaries. + # Profile first and / or use FDO if you need better performance than this. + flag: "-O2" + + # Removal of unused code and data at link time (can this increase binary size in some cases?). + flag: "-ffunction-sections" + flag: "-fdata-sections" + } + } + flag_set { + action: "c++-link-dynamic-library" + action: "c++-link-executable" + flag_group { + flag: "-Wl,--gc-sections" + } + } + } + + feature { + name: "fastbuild" + implies: "common" + } + + feature { + name: "dbg" + implies: "common" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + flag: "-g" + } + } + } + + # Set clang as a C/C++ compiler. + tool_path { name: "gcc" path: "%{clang_path}" } + + # Use the default system toolchain for everything else. + tool_path { name: "ar" path: "/usr/bin/ar" } + tool_path { name: "compat-ld" path: "/usr/bin/ld" } + tool_path { name: "cpp" path: "/usr/bin/cpp" } + tool_path { name: "dwp" path: "/usr/bin/dwp" } + tool_path { name: "gcov" path: "/usr/bin/gcov" } + tool_path { name: "ld" path: "/usr/bin/ld" } + tool_path { name: "nm" path: "/usr/bin/nm" } + tool_path { name: "objcopy" path: "/usr/bin/objcopy" } + tool_path { name: "objdump" path: "/usr/bin/objdump" } + tool_path { name: "strip" path: "/usr/bin/strip" } + + # Enabled dynamic linking. + linking_mode_flags { mode: DYNAMIC } + +%{host_compiler_includes} +} diff --git a/third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl b/third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl new file mode 100644 index 00000000000..116f67cbae4 --- /dev/null +++ b/third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl @@ -0,0 +1,249 @@ +major_version: "local" +minor_version: "" +default_target_cpu: "same_as_host" + +default_toolchain { + cpu: "k8" + toolchain_identifier: "local_linux" +} +default_toolchain { + cpu: "piii" + toolchain_identifier: "local_linux" +} +default_toolchain { + cpu: "arm" + toolchain_identifier: "local_linux" +} +default_toolchain { + cpu: "darwin" + toolchain_identifier: "local_darwin" +} +default_toolchain { + cpu: "ppc" + toolchain_identifier: "local_linux" +} + +toolchain { + abi_version: "local" + abi_libc_version: "local" + builtin_sysroot: "" + compiler: "compiler" + host_system_name: "local" + needsPic: true + supports_gold_linker: false + supports_incremental_linker: false + supports_fission: false + supports_interface_shared_objects: false + supports_normalizing_ar: false + supports_start_end_lib: false + supports_thin_archives: false + target_libc: "local" + target_cpu: "local" + target_system_name: "local" + toolchain_identifier: "local_linux" + + tool_path { name: "ar" path: "/usr/bin/ar" } + tool_path { name: "compat-ld" path: "/usr/bin/ld" } + tool_path { name: "cpp" path: "/usr/bin/cpp" } + tool_path { name: "dwp" path: "/usr/bin/dwp" } + # As part of the TensorFlow release, we place some cuda-related compilation + # files in @local_config_cuda//crosstool/clang/bin, and this relative + # path, combined with the rest of our Bazel configuration causes our + # compilation to use those files. + tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" } + # Use "-std=c++11" for nvcc. For consistency, force both the host compiler + # and the device compiler to use "-std=c++11". + cxx_flag: "-std=c++11" + linker_flag: "-Wl,-no-as-needed" + linker_flag: "-lstdc++" + linker_flag: "-B/usr/bin/" + +%{host_compiler_includes} + tool_path { name: "gcov" path: "/usr/bin/gcov" } + + # C(++) compiles invoke the compiler (as that is the one knowing where + # to find libraries), but we provide LD so other rules can invoke the linker. + tool_path { name: "ld" path: "/usr/bin/ld" } + + tool_path { name: "nm" path: "/usr/bin/nm" } + tool_path { name: "objcopy" path: "/usr/bin/objcopy" } + objcopy_embed_flag: "-I" + objcopy_embed_flag: "binary" + tool_path { name: "objdump" path: "/usr/bin/objdump" } + tool_path { name: "strip" path: "/usr/bin/strip" } + + # Anticipated future default. + unfiltered_cxx_flag: "-no-canonical-prefixes" + + # Make C++ compilation deterministic. Use linkstamping instead of these + # compiler symbols. + unfiltered_cxx_flag: "-Wno-builtin-macro-redefined" + unfiltered_cxx_flag: "-D__DATE__=\"redacted\"" + unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\"" + unfiltered_cxx_flag: "-D__TIME__=\"redacted\"" + + # Security hardening on by default. + # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases. + # We need to undef it before redefining it as some distributions now have + # it enabled by default. + compiler_flag: "-U_FORTIFY_SOURCE" + compiler_flag: "-D_FORTIFY_SOURCE=1" + compiler_flag: "-fstack-protector" + compiler_flag: "-fPIE" + linker_flag: "-pie" + linker_flag: "-Wl,-z,relro,-z,now" + + # Enable coloring even if there's no attached terminal. Bazel removes the + # escape sequences if --nocolor is specified. This isn't supported by gcc + # on Ubuntu 14.04. + # compiler_flag: "-fcolor-diagnostics" + + # All warnings are enabled. Maybe enable -Werror as well? + compiler_flag: "-Wall" + # Enable a few more warnings that aren't part of -Wall. + compiler_flag: "-Wunused-but-set-parameter" + # But disable some that are problematic. + compiler_flag: "-Wno-free-nonheap-object" # has false positives + + # Keep stack frames for debugging, even in opt mode. + compiler_flag: "-fno-omit-frame-pointer" + + # Anticipated future default. + linker_flag: "-no-canonical-prefixes" + unfiltered_cxx_flag: "-fno-canonical-system-headers" + # Have gcc return the exit code from ld. + linker_flag: "-pass-exit-codes" + # Stamp the binary with a unique identifier. + linker_flag: "-Wl,--build-id=md5" + linker_flag: "-Wl,--hash-style=gnu" + # Gold linker only? Can we enable this by default? + # linker_flag: "-Wl,--warn-execstack" + # linker_flag: "-Wl,--detect-odr-violations" + + # Include directory for cuda headers. + cxx_builtin_include_directory: "%{cuda_include_path}" + + compilation_mode_flags { + mode: DBG + # Enable debug symbols. + compiler_flag: "-g" + } + compilation_mode_flags { + mode: OPT + + # No debug symbols. + # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt or + # even generally? However, that can't happen here, as it requires special + # handling in Bazel. + compiler_flag: "-g0" + + # Conservative choice for -O + # -O3 can increase binary size and even slow down the resulting binaries. + # Profile first and / or use FDO if you need better performance than this. + compiler_flag: "-O2" + + # Disable assertions + compiler_flag: "-DNDEBUG" + + # Removal of unused code and data at link time (can this increase binary size in some cases?). + compiler_flag: "-ffunction-sections" + compiler_flag: "-fdata-sections" + linker_flag: "-Wl,--gc-sections" + } + linking_mode_flags { mode: DYNAMIC } +} + +toolchain { + abi_version: "local" + abi_libc_version: "local" + builtin_sysroot: "" + compiler: "compiler" + host_system_name: "local" + needsPic: true + target_libc: "macosx" + target_cpu: "darwin" + target_system_name: "local" + toolchain_identifier: "local_darwin" + + tool_path { name: "ar" path: "/usr/bin/libtool" } + tool_path { name: "compat-ld" path: "/usr/bin/ld" } + tool_path { name: "cpp" path: "/usr/bin/cpp" } + tool_path { name: "dwp" path: "/usr/bin/dwp" } + tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" } + cxx_flag: "-std=c++11" + ar_flag: "-static" + ar_flag: "-s" + ar_flag: "-o" + linker_flag: "-lc++" + linker_flag: "-undefined" + linker_flag: "dynamic_lookup" + # TODO(ulfjack): This is wrong on so many levels. Figure out a way to auto-detect the proper + # setting from the local compiler, and also how to make incremental builds correct. + cxx_builtin_include_directory: "/" + tool_path { name: "gcov" path: "/usr/bin/gcov" } + tool_path { name: "ld" path: "/usr/bin/ld" } + tool_path { name: "nm" path: "/usr/bin/nm" } + tool_path { name: "objcopy" path: "/usr/bin/objcopy" } + objcopy_embed_flag: "-I" + objcopy_embed_flag: "binary" + tool_path { name: "objdump" path: "/usr/bin/objdump" } + tool_path { name: "strip" path: "/usr/bin/strip" } + + # Anticipated future default. + unfiltered_cxx_flag: "-no-canonical-prefixes" + # Make C++ compilation deterministic. Use linkstamping instead of these + # compiler symbols. + unfiltered_cxx_flag: "-Wno-builtin-macro-redefined" + unfiltered_cxx_flag: "-D__DATE__=\"redacted\"" + unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\"" + unfiltered_cxx_flag: "-D__TIME__=\"redacted\"" + + # Security hardening on by default. + # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases. + compiler_flag: "-D_FORTIFY_SOURCE=1" + compiler_flag: "-fstack-protector" + + # Enable coloring even if there's no attached terminal. Bazel removes the + # escape sequences if --nocolor is specified. + compiler_flag: "-fcolor-diagnostics" + + # All warnings are enabled. Maybe enable -Werror as well? + compiler_flag: "-Wall" + # Enable a few more warnings that aren't part of -Wall. + compiler_flag: "-Wthread-safety" + compiler_flag: "-Wself-assign" + + # Keep stack frames for debugging, even in opt mode. + compiler_flag: "-fno-omit-frame-pointer" + + # Anticipated future default. + linker_flag: "-no-canonical-prefixes" + + # Include directory for cuda headers. + cxx_builtin_include_directory: "%{cuda_include_path}" + + compilation_mode_flags { + mode: DBG + # Enable debug symbols. + compiler_flag: "-g" + } + compilation_mode_flags { + mode: OPT + # No debug symbols. + # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt or even generally? + # However, that can't happen here, as it requires special handling in Bazel. + compiler_flag: "-g0" + + # Conservative choice for -O + # -O3 can increase binary size and even slow down the resulting binaries. + # Profile first and / or use FDO if you need better performance than this. + compiler_flag: "-O2" + + # Disable assertions + compiler_flag: "-DNDEBUG" + + # Removal of unused code and data at link time (can this increase binary size in some cases?). + compiler_flag: "-ffunction-sections" + compiler_flag: "-fdata-sections" + } +} diff --git a/third_party/gpus/cuda/build_defs.bzl.tpl b/third_party/gpus/cuda/build_defs.bzl.tpl index a497ed98f03..ca8bbc1ee22 100644 --- a/third_party/gpus/cuda/build_defs.bzl.tpl +++ b/third_party/gpus/cuda/build_defs.bzl.tpl @@ -8,12 +8,14 @@ def if_cuda(if_true, if_false = []): """ return select({ "@local_config_cuda//cuda:using_nvcc": if_true, + "@local_config_cuda//cuda:using_clang": if_true, "//conditions:default": if_false }) + def cuda_default_copts(): """Default options for all CUDA compilations.""" - return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"]) + return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + %{cuda_extra_copts}) def cuda_is_configured(): diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index a2b3e7d79e9..bbe0442eaf8 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -5,6 +5,9 @@ * `TF_NEED_CUDA`: Whether to enable building with CUDA. * `GCC_HOST_COMPILER_PATH`: The GCC host compiler path + * `TF_CUDA_CLANG`: Wheter to use clang as a cuda compiler. + * `CLANG_CUDA_COMPILER_PATH`: The clang compiler path that will be used for + both host and device code compilation if TF_CUDA_CLANG is 1. * `CUDA_TOOLKIT_PATH`: The path to the CUDA toolkit. Default is `/usr/local/cuda`. * `TF_CUDA_VERSION`: The version of the CUDA toolkit. If this is blank, then @@ -17,6 +20,7 @@ """ _GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH" +_CLANG_CUDA_COMPILER_PATH = "CLANG_CUDA_COMPILER_PATH" _CUDA_TOOLKIT_PATH = "CUDA_TOOLKIT_PATH" _TF_CUDA_VERSION = "TF_CUDA_VERSION" _TF_CUDNN_VERSION = "TF_CUDNN_VERSION" @@ -35,19 +39,25 @@ _DEFAULT_CUDA_COMPUTE_CAPABILITIES = ["3.5", "5.2"] # BEGIN cc_configure common functions. def find_cc(repository_ctx): """Find the C++ compiler.""" - cc_name = "gcc" - if _GCC_HOST_COMPILER_PATH in repository_ctx.os.environ: - cc_name = repository_ctx.os.environ[_GCC_HOST_COMPILER_PATH].strip() - if not cc_name: - cc_name = "gcc" + if _use_cuda_clang(repository_ctx): + target_cc_name = "clang" + cc_path_envvar = _CLANG_CUDA_COMPILER_PATH + else: + target_cc_name = "gcc" + cc_path_envvar = _GCC_HOST_COMPILER_PATH + cc_name = target_cc_name + + if cc_path_envvar in repository_ctx.os.environ: + cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip() + if cc_name_from_env: + cc_name = cc_name_from_env if cc_name.startswith("/"): # Absolute path, maybe we should make this suported by our which function. return cc_name cc = repository_ctx.which(cc_name) if cc == None: - fail( - "Cannot find gcc, either correct your path or set the CC" + - " environment variable") + fail(("Cannot find {}, either correct your path or set the {}" + + " environment variable").format(target_cc_name, cc_path_envvar)) return cc @@ -64,10 +74,17 @@ def _cxx_inc_convert(path): path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip() return path - -def get_cxx_inc_directories(repository_ctx, cc): - """Compute the list of default C++ include directories.""" - result = repository_ctx.execute([cc, "-E", "-xc++", "-", "-v"]) +def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp): + """Compute the list of default C or C++ include directories.""" + if lang_is_cpp: + lang = "c++" + else: + lang = "c" + # TODO: We pass -no-canonical-prefixes here to match the compiler flags, + # but in cuda_clang CROSSTOOL file that is a `feature` and we should + # handle the case when it's disabled and no flag is passed + result = repository_ctx.execute([cc, "-no-canonical-prefixes", + "-E", "-x" + lang, "-", "-v"]) index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN) if index1 == -1: return [] @@ -86,6 +103,19 @@ def get_cxx_inc_directories(repository_ctx, cc): return [repository_ctx.path(_cxx_inc_convert(p)) for p in inc_dirs.split("\n")] +def get_cxx_inc_directories(repository_ctx, cc): + """Compute the list of default C and C++ include directories.""" + # For some reason `clang -xc` sometimes returns include paths that are + # different from the ones from `clang -xc++`. (Symlink and a dir) + # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists + includes_cpp = _get_cxx_inc_directories_impl(repository_ctx, cc, True) + includes_c = _get_cxx_inc_directories_impl(repository_ctx, cc, False) + + includes_cpp_set = set(includes_cpp) + return includes_cpp + [inc for inc in includes_c + if inc not in includes_cpp_set] + + def auto_configure_fail(msg): """Output failure message when auto configuration fails.""" red = "\033[0;31m" @@ -94,7 +124,7 @@ def auto_configure_fail(msg): # END cc_configure common functions (see TODO above). -def _gcc_host_compiler_includes(repository_ctx, cc): +def _host_compiler_includes(repository_ctx, cc): """Generates the cxx_builtin_include_directory entries for gcc inc dirs. Args: @@ -645,7 +675,8 @@ def _create_dummy_repository(repository_ctx): # Set up BUILD file for cuda/. _tpl(repository_ctx, "cuda:build_defs.bzl", { - "%{cuda_is_configured}": "False" + "%{cuda_is_configured}": "False", + "%{cuda_extra_copts}": "[]" }) _tpl(repository_ctx, "cuda:BUILD", { @@ -730,6 +761,19 @@ def _symlink_dir(repository_ctx, src_dir, dest_dir): for src_file in files: repository_ctx.symlink(src_file, dest_dir + "/" + src_file.basename) +def _use_cuda_clang(repository_ctx): + if "TF_CUDA_CLANG" in repository_ctx.os.environ: + enable_cuda = repository_ctx.os.environ["TF_CUDA_CLANG"].strip() + return enable_cuda == "1" + return False + +def _compute_cuda_extra_copts(repository_ctx, cuda_config): + if _use_cuda_clang(repository_ctx): + capability_flags = ["--cuda-gpu-arch=sm_" + cap.replace(".", "") for cap in cuda_config.compute_capabilities] + else: + # Capabilities are handled in the "crosstool_wrapper_driver_is_not_gcc" for nvcc + capability_flags = [] + return str(capability_flags) def _create_cuda_repository(repository_ctx): """Creates the repository containing files set up to build with CUDA.""" @@ -761,7 +805,9 @@ def _create_cuda_repository(repository_ctx): # Set up BUILD file for cuda/ _tpl(repository_ctx, "cuda:build_defs.bzl", { - "%{cuda_is_configured}": "True" + "%{cuda_is_configured}": "True", + "%{cuda_extra_copts}": _compute_cuda_extra_copts(repository_ctx, cuda_config), + }) _tpl(repository_ctx, "cuda:BUILD", { @@ -787,21 +833,25 @@ def _create_cuda_repository(repository_ctx): # Set up crosstool/ _file(repository_ctx, "crosstool:BUILD") cc = find_cc(repository_ctx) - gcc_host_compiler_includes = _gcc_host_compiler_includes(repository_ctx, cc) - _tpl(repository_ctx, "crosstool:CROSSTOOL", - { + host_compiler_includes = _host_compiler_includes(repository_ctx, cc) + cuda_defines = { "%{cuda_include_path}": cuda_config.cuda_toolkit_path + '/include', - "%{gcc_host_compiler_includes}": gcc_host_compiler_includes, - }) - _tpl(repository_ctx, - "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc", - { - "%{cpu_compiler}": str(cc), - "%{cuda_version}": cuda_config.cuda_version, - "%{gcc_host_compiler_path}": str(cc), - "%{cuda_compute_capabilities}": ", ".join( - ["\"%s\"" % c for c in cuda_config.compute_capabilities]), - }) + "%{host_compiler_includes}": host_compiler_includes, + } + if _use_cuda_clang(repository_ctx): + cuda_defines["%{clang_path}"] = cc + _tpl(repository_ctx, "crosstool:CROSSTOOL_clang", cuda_defines, out="crosstool/CROSSTOOL") + else: + _tpl(repository_ctx, "crosstool:CROSSTOOL_nvcc", cuda_defines, out="crosstool/CROSSTOOL") + _tpl(repository_ctx, + "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc", + { + "%{cpu_compiler}": str(cc), + "%{cuda_version}": cuda_config.cuda_version, + "%{gcc_host_compiler_path}": str(cc), + "%{cuda_compute_capabilities}": ", ".join( + ["\"%s\"" % c for c in cuda_config.compute_capabilities]), + }) # Set up cuda_config.h, which is used by # tensorflow/stream_executor/dso_loader.cc. diff --git a/third_party/nccl/BUILD b/third_party/nccl/BUILD new file mode 100644 index 00000000000..e69de29bb2d diff --git a/third_party/nccl/fix_clang_compilation.patch b/third_party/nccl/fix_clang_compilation.patch new file mode 100644 index 00000000000..e8d2a7dc9f3 --- /dev/null +++ b/third_party/nccl/fix_clang_compilation.patch @@ -0,0 +1,85 @@ +From 8241cd7b6ed1425eeb88fd380090575978e358f4 Mon Sep 17 00:00:00 2001 +From: Ilya Biryukov +Date: Thu, 16 Mar 2017 12:01:11 +0100 +Subject: [PATCH 1/1] Fix compilation error when compiling with 'clang -x + cuda'. + +Functions vFetch and vStore are not found by ADL with clang, +so they need to be declared before usage in ReduceCopy. +--- + src/common_kernel.h | 52 ++++++++++++++++++++++++++-------------------------- + 1 file changed, 26 insertions(+), 26 deletions(-) + +diff --git a/src/common_kernel.h b/src/common_kernel.h +index 28fbc85..cc71f8a 100644 +--- a/src/common_kernel.h ++++ b/src/common_kernel.h +@@ -30,6 +30,32 @@ + #define BAR(type, barid, nthreads) \ + BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE)) + ++template inline __device__ ++T vFetch(const volatile T* ptr) { ++ return *ptr; ++} ++ ++#ifdef CUDA_HAS_HALF ++template<> inline __device__ ++half vFetch(const volatile half* ptr) { ++ half r; ++ r.x = ptr->x; ++ return r; ++} ++#endif ++ ++template inline __device__ ++void vStore(volatile T* ptr, const T val) { ++ *ptr = val; ++} ++ ++#ifdef CUDA_HAS_HALF ++template<> inline __device__ ++void vStore(volatile half* ptr, const half val) { ++ ptr->x = val.x; ++} ++#endif ++ + __device__ unsigned int spinct; + + // Spin wait until func evaluates to true +@@ -225,32 +251,6 @@ __device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) { + return reinterpret_cast(ALIGNUP(ptrval, align)); + } + +-template inline __device__ +-T vFetch(const volatile T* ptr) { +- return *ptr; +-} +- +-#ifdef CUDA_HAS_HALF +-template<> inline __device__ +-half vFetch(const volatile half* ptr) { +- half r; +- r.x = ptr->x; +- return r; +-} +-#endif +- +-template inline __device__ +-void vStore(volatile T* ptr, const T val) { +- *ptr = val; +-} +- +-#ifdef CUDA_HAS_HALF +-template<> inline __device__ +-void vStore(volatile half* ptr, const half val) { +- ptr->x = val.x; +-} +-#endif +- + // Assumptions: + // - there is exactly 1 block + // - THREADS is the number of producer threads +-- +2.12.0.367.g23dc2f6d3c-goog + diff --git a/third_party/nccl/nccl.BUILD b/third_party/nccl/nccl.BUILD new file mode 100644 index 00000000000..06b9b8ff68a --- /dev/null +++ b/third_party/nccl/nccl.BUILD @@ -0,0 +1,66 @@ +# NVIDIA nccl +# A package of optimized primitives for collective multi-GPU communication. + +licenses(["notice"]) # BSD + +exports_files(["LICENSE.txt"]) + +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "if_cuda") + +SRCS = [ + "src/all_gather.cu", + "src/all_reduce.cu", + "src/broadcast.cu", + "src/core.cu", + "src/libwrap.cu", + "src/reduce.cu", + "src/reduce_scatter.cu", +] + +# Copy .cu to .cu.cc so they can be in srcs of cc_library. +[ + genrule( + name = "gen_" + src, + srcs = [src], + outs = [src + ".cc"], + cmd = "cp $(location " + src + ") $(location " + src + ".cc)", + ) + for src in SRCS +] + +SRCS_CU_CC = [src + ".cc" for src in SRCS] + +cc_library( + name = "nccl", + srcs = if_cuda(SRCS_CU_CC + glob(["src/*.h"])), + hdrs = if_cuda(["src/nccl.h"]), + copts = [ + "-DCUDA_MAJOR=0", + "-DCUDA_MINOR=0", + "-DNCCL_MAJOR=0", + "-DNCCL_MINOR=0", + "-DNCCL_PATCH=0", + "-Iexternal/nccl_archive/src", + "-O3", + ] + cuda_default_copts(), + linkopts = select({ + "@%ws%//tensorflow:android": [ + "-pie", + ], + "@%ws%//tensorflow:darwin": [ + "-Wl,-framework", + "-Wl,CoreFoundation", + "-Wl,-framework", + "-Wl,Security", + ], + "@%ws%//tensorflow:ios": [], + "@%ws%//tensorflow:windows": [ + "ws2_32.lib", + ], + "//conditions:default": [ + "-lrt", + ], + }), + visibility = ["//visibility:public"], + deps = ["@local_config_cuda//cuda:cuda_headers"], +) diff --git a/third_party/protobuf/BUILD b/third_party/protobuf/BUILD new file mode 100644 index 00000000000..e69de29bb2d diff --git a/third_party/protobuf/add_noinlines.patch b/third_party/protobuf/add_noinlines.patch new file mode 100644 index 00000000000..af74798f067 --- /dev/null +++ b/third_party/protobuf/add_noinlines.patch @@ -0,0 +1,30 @@ +diff -u -r a/src/google/protobuf/compiler/cpp/cpp_file.cc b/src/google/protobuf/compiler/cpp/cpp_file.cc +--- a/src/google/protobuf/compiler/cpp/cpp_file.cc 2017-02-10 23:55:34.000000000 +0100 ++++ b/src/google/protobuf/compiler/cpp/cpp_file.cc 2017-03-21 13:41:46.931979154 +0100 +@@ -557,7 +557,7 @@ + " $metadata$, $enum_descriptors$, $service_descriptors$);\n" + "}\n" + "\n" +- "void protobuf_AssignDescriptorsOnce() {\n" ++ "GOOGLE_ATTRIBUTE_NOINLINE void protobuf_AssignDescriptorsOnce() {\n" + " static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n" + " ::google::protobuf::GoogleOnceInit(&once, &protobuf_AssignDescriptors);\n" + "}\n" +@@ -656,7 +656,7 @@ + printer->Print( + "}\n" + "\n" +- "void InitDefaults() {\n" ++ "GOOGLE_ATTRIBUTE_NOINLINE void InitDefaults() {\n" + " static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n" + " ::google::protobuf::GoogleOnceInit(&once, &TableStruct::InitDefaultsImpl);\n" + "}\n"); +@@ -737,7 +737,7 @@ + printer->Print( + "}\n" + "\n" +- "void AddDescriptors() {\n" ++ "GOOGLE_ATTRIBUTE_NOINLINE void AddDescriptors() {\n" + " static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n" + " ::google::protobuf::GoogleOnceInit(&once, &AddDescriptorsImpl);\n" + "}\n"); diff --git a/tools/bazel.rc.template b/tools/bazel.rc.template index 3622b9423c2..097ff7b9d07 100644 --- a/tools/bazel.rc.template +++ b/tools/bazel.rc.template @@ -1,6 +1,11 @@ build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true + +build:cuda_clang --crosstool_top=@local_config_cuda//crosstool:toolchain +build:cuda_clang --define=using_cuda=true --define=using_cuda_clang=true + build:win-cuda --define=using_cuda=true --define=using_cuda_nvcc=true + build:mkl --define=using_mkl=true build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain