Add cuda_configure repository rule to autodetect cuda. (#3966)
This change reimplements the CUDA autoconfiguration mechanism in Skylark, providing a `cuda_configure` workspace rule. We keep the same user interface, the ./configure script, but rather than modifying source files within the source tree, `cuda_configure` generates a `@local_config_cuda` workspace containing: * Symlinks to the CUDA headers and libraries * BUILD files generated with the correct CUDA and cuDNN versions * CROSSTOOL config with CUDA include dirs populated * crosstool_wrapper_driver_is_not_gcc wrapper script with compiler paths and CUDA compute capabilities set. * cuda_config.h header file with CUDA versions and compute capabilities set, which can be `#include`d by source files. This change also makes the following fixes to `Dockerfile.gpu`: * Change the `CUDNN_INSTALL_PATH` to point to `/usr/lib/x86_64-linux-gnu` rather than `/usr/local/cuda` since NVIDIA's image installs `libcudnn.so` under `/usr/lib/x86_64-linux-gnu`. * Add env variable to set the minimum compute capability to 3.0. Fixes #2873
This commit is contained in:
parent
44595c44ee
commit
58b37cf745
8
.gitignore
vendored
8
.gitignore
vendored
@ -9,13 +9,6 @@ node_modules
|
|||||||
/bazel-testlogs
|
/bazel-testlogs
|
||||||
/bazel-tf
|
/bazel-tf
|
||||||
/tensorflow/contrib/cmake/build
|
/tensorflow/contrib/cmake/build
|
||||||
/third_party/gpus/cuda/bin
|
|
||||||
/third_party/gpus/cuda/cuda.config
|
|
||||||
/third_party/gpus/cuda/extras
|
|
||||||
/third_party/gpus/cuda/include
|
|
||||||
/third_party/gpus/cuda/lib
|
|
||||||
/third_party/gpus/cuda/lib64
|
|
||||||
/third_party/gpus/cuda/nvvm
|
|
||||||
/third_party/py/numpy/numpy_include
|
/third_party/py/numpy/numpy_include
|
||||||
/tools/bazel.rc
|
/tools/bazel.rc
|
||||||
/tools/python_bin_path.sh
|
/tools/python_bin_path.sh
|
||||||
@ -25,3 +18,4 @@ node_modules
|
|||||||
/_python_build
|
/_python_build
|
||||||
*.pyc
|
*.pyc
|
||||||
__pycache__
|
__pycache__
|
||||||
|
*.swp
|
||||||
|
73
configure
vendored
73
configure
vendored
@ -80,6 +80,7 @@ while [ "$TF_NEED_CUDA" == "" ]; do
|
|||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
|
export TF_NEED_CUDA
|
||||||
if [ "$TF_NEED_CUDA" == "0" ]; then
|
if [ "$TF_NEED_CUDA" == "0" ]; then
|
||||||
echo "Configuration finished"
|
echo "Configuration finished"
|
||||||
exit
|
exit
|
||||||
@ -97,6 +98,7 @@ while true; do
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
if [ -e "$GCC_HOST_COMPILER_PATH" ]; then
|
if [ -e "$GCC_HOST_COMPILER_PATH" ]; then
|
||||||
|
export CC=$GCC_HOST_COMPILER_PATH
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2
|
echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2
|
||||||
@ -107,7 +109,6 @@ while true; do
|
|||||||
# Retry
|
# Retry
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|
||||||
# Find out where the CUDA toolkit is installed
|
# Find out where the CUDA toolkit is installed
|
||||||
OSNAME=`uname -s`
|
OSNAME=`uname -s`
|
||||||
|
|
||||||
@ -140,6 +141,8 @@ while true; do
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -e "${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH}" ]; then
|
if [ -e "${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH}" ]; then
|
||||||
|
export CUDA_TOOLKIT_PATH
|
||||||
|
export CUDA_VERSION=$TF_CUDA_VERSION
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
echo "Invalid path to CUDA $TF_CUDA_VERSION toolkit. ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH} cannot be found"
|
echo "Invalid path to CUDA $TF_CUDA_VERSION toolkit. ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH} cannot be found"
|
||||||
@ -200,13 +203,16 @@ while true; do
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_ALT_PATH}" -o -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_PATH}" ]; then
|
if [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_ALT_PATH}" -o -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_PATH}" ]; then
|
||||||
|
export CUDNN_VERSION=$TF_CUDNN_VERSION
|
||||||
|
export CUDNN_INSTALL_PATH
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$OSNAME" == "Linux" ]; then
|
if [ "$OSNAME" == "Linux" ]; then
|
||||||
CUDNN_PATH_FROM_LDCONFIG="$(ldconfig -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')"
|
CUDNN_PATH_FROM_LDCONFIG="$(ldconfig -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')"
|
||||||
if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then
|
if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then
|
||||||
CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})"
|
export CUDNN_VERSION=$TF_CUDNN_VERSION
|
||||||
|
export CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})"
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
@ -225,42 +231,11 @@ while true; do
|
|||||||
CUDNN_INSTALL_PATH=""
|
CUDNN_INSTALL_PATH=""
|
||||||
done
|
done
|
||||||
|
|
||||||
cat > third_party/gpus/cuda/cuda.config <<EOF
|
|
||||||
# CUDA_TOOLKIT_PATH refers to the CUDA toolkit.
|
|
||||||
CUDA_TOOLKIT_PATH="$CUDA_TOOLKIT_PATH"
|
|
||||||
# CUDNN_INSTALL_PATH refers to the cuDNN toolkit. The cuDNN header and library
|
|
||||||
# files can be either in this directory, or under include/ and lib64/
|
|
||||||
# directories separately.
|
|
||||||
CUDNN_INSTALL_PATH="$CUDNN_INSTALL_PATH"
|
|
||||||
|
|
||||||
# The Cuda SDK version that should be used in this build (empty to use libcudart.so symlink)
|
|
||||||
TF_CUDA_VERSION=$TF_CUDA_VERSION
|
|
||||||
|
|
||||||
# The Cudnn version that should be used in this build
|
|
||||||
TF_CUDNN_VERSION=$TF_CUDNN_VERSION
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# Configure the gcc host compiler to use
|
|
||||||
export WARNING=$DO_NOT_SUBMIT_WARNING
|
|
||||||
perl -pi -e "s,CPU_COMPILER = \('.*'\),# \$ENV{WARNING}\nCPU_COMPILER = ('$GCC_HOST_COMPILER_PATH'),s" third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc
|
|
||||||
perl -pi -e "s,GCC_HOST_COMPILER_PATH = \('.*'\),# \$ENV{WARNING}\nGCC_HOST_COMPILER_PATH = ('$GCC_HOST_COMPILER_PATH'),s" third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc
|
|
||||||
|
|
||||||
# Configure the platform name.
|
|
||||||
perl -pi -e "s,PLATFORM = \".*\",PLATFORM = \"$OSNAME\",s" third_party/gpus/cuda/platform.bzl
|
|
||||||
|
|
||||||
# Configure the Cuda toolkit version to work with.
|
|
||||||
perl -pi -e "s,(GetCudaVersion.*return )\"[0-9\.]*\",\1\"$TF_CUDA_VERSION\",s" tensorflow/stream_executor/dso_loader.cc
|
|
||||||
perl -pi -e "s,CUDA_VERSION = \"[0-9\.]*\",CUDA_VERSION = \"$TF_CUDA_VERSION\",s" third_party/gpus/cuda/platform.bzl
|
|
||||||
|
|
||||||
# Configure the Cudnn version to work with.
|
|
||||||
perl -pi -e "s,(GetCudnnVersion.*return )\"[0-9\.]*\",\1\"$TF_CUDNN_VERSION\",s" tensorflow/stream_executor/dso_loader.cc
|
|
||||||
perl -pi -e "s,CUDNN_VERSION = \"[0-9\.]*\",CUDNN_VERSION = \"$TF_CUDNN_VERSION\",s" third_party/gpus/cuda/platform.bzl
|
|
||||||
|
|
||||||
|
|
||||||
# Configure the compute capabilities that TensorFlow builds for.
|
# Configure the compute capabilities that TensorFlow builds for.
|
||||||
# Since Cuda toolkit is not backward-compatible, this is not guaranteed to work.
|
# Since Cuda toolkit is not backward-compatible, this is not guaranteed to work.
|
||||||
while true; do
|
while true; do
|
||||||
fromuser=""
|
fromuser=""
|
||||||
|
default_cuda_compute_capabilities="3.5,5.2"
|
||||||
if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then
|
if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then
|
||||||
cat << EOF
|
cat << EOF
|
||||||
Please specify a list of comma-separated Cuda compute capabilities you want to build with.
|
Please specify a list of comma-separated Cuda compute capabilities you want to build with.
|
||||||
@ -270,6 +245,9 @@ EOF
|
|||||||
read -p "[Default is: \"3.5,5.2\"]: " TF_CUDA_COMPUTE_CAPABILITIES
|
read -p "[Default is: \"3.5,5.2\"]: " TF_CUDA_COMPUTE_CAPABILITIES
|
||||||
fromuser=1
|
fromuser=1
|
||||||
fi
|
fi
|
||||||
|
if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then
|
||||||
|
TF_CUDA_COMPUTE_CAPABILITIES=$default_cuda_compute_capabilities
|
||||||
|
fi
|
||||||
# Check whether all capabilities from the input is valid
|
# Check whether all capabilities from the input is valid
|
||||||
COMPUTE_CAPABILITIES=${TF_CUDA_COMPUTE_CAPABILITIES//,/ }
|
COMPUTE_CAPABILITIES=${TF_CUDA_COMPUTE_CAPABILITIES//,/ }
|
||||||
ALL_VALID=1
|
ALL_VALID=1
|
||||||
@ -285,34 +263,13 @@ EOF
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
|
export CUDA_COMPUTE_CAPABILITIES=$TF_CUDA_COMPUTE_CAPABILITIES
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
TF_CUDA_COMPUTE_CAPABILITIES=""
|
TF_CUDA_COMPUTE_CAPABILITIES=""
|
||||||
done
|
done
|
||||||
|
|
||||||
if [ ! -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then
|
bazel clean --expunge
|
||||||
export WARNING=$DO_NOT_SUBMIT_WARNING
|
bazel fetch //...
|
||||||
function CudaGenCodeOpts() {
|
|
||||||
OUTPUT=""
|
|
||||||
for CAPABILITY in $@; do
|
|
||||||
OUTPUT=${OUTPUT}" \"${CAPABILITY}\", "
|
|
||||||
done
|
|
||||||
echo $OUTPUT
|
|
||||||
}
|
|
||||||
export CUDA_GEN_CODES_OPTS=$(CudaGenCodeOpts ${TF_CUDA_COMPUTE_CAPABILITIES//,/ })
|
|
||||||
perl -pi -0 -e 's,\n( *)([^\n]*supported_cuda_compute_capabilities\s*=\s*\[).*?(\]),\n\1# $ENV{WARNING}\n\1\2$ENV{CUDA_GEN_CODES_OPTS}\3,s' third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc
|
|
||||||
function CudaVersionOpts() {
|
|
||||||
OUTPUT=""
|
|
||||||
for CAPABILITY in $@; do
|
|
||||||
OUTPUT=$OUTPUT"CudaVersion(\"${CAPABILITY}\"), "
|
|
||||||
done
|
|
||||||
echo $OUTPUT
|
|
||||||
}
|
|
||||||
export CUDA_VERSION_OPTS=$(CudaVersionOpts ${TF_CUDA_COMPUTE_CAPABILITIES//,/ })
|
|
||||||
perl -pi -0 -e 's,\n( *)([^\n]*supported_cuda_compute_capabilities\s*=\s*\{).*?(\}),\n\1// $ENV{WARNING}\n\1\2$ENV{CUDA_VERSION_OPTS}\3,s' tensorflow/core/common_runtime/gpu/gpu_device.cc
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Invoke the cuda_config.sh and set up the TensorFlow's canonical view of the Cuda libraries
|
|
||||||
(cd third_party/gpus/cuda; ./cuda_config.sh;) || exit -1
|
|
||||||
|
|
||||||
echo "Configuration finished"
|
echo "Configuration finished"
|
||||||
|
@ -759,10 +759,9 @@ struct CudaVersion {
|
|||||||
int minor_part = -1;
|
int minor_part = -1;
|
||||||
};
|
};
|
||||||
|
|
||||||
// "configure" uses the specific name to substitute the following string.
|
|
||||||
// If you change it, make sure you modify "configure" as well.
|
|
||||||
std::vector<CudaVersion> supported_cuda_compute_capabilities = {
|
std::vector<CudaVersion> supported_cuda_compute_capabilities = {
|
||||||
CudaVersion("3.5"), CudaVersion("5.2")};
|
TF_CUDA_CAPABILITIES,
|
||||||
|
};
|
||||||
|
|
||||||
std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
|
std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
|
||||||
auto cuda_caps = supported_cuda_compute_capabilities;
|
auto cuda_caps = supported_cuda_compute_capabilities;
|
||||||
|
@ -31,7 +31,7 @@ limitations under the License.
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if GOOGLE_CUDA
|
#if GOOGLE_CUDA
|
||||||
#include "third_party/gpus/cuda/include/cuda.h"
|
#include "cuda/include/cuda.h"
|
||||||
#include "tensorflow/core/platform/stream_executor.h"
|
#include "tensorflow/core/platform/stream_executor.h"
|
||||||
#include "tensorflow/core/util/stream_executor_util.h"
|
#include "tensorflow/core/util/stream_executor_util.h"
|
||||||
#endif // GOOGLE_CUDA
|
#endif // GOOGLE_CUDA
|
||||||
|
@ -25,7 +25,7 @@ limitations under the License.
|
|||||||
#include "tensorflow/core/kernels/fill_functor.h"
|
#include "tensorflow/core/kernels/fill_functor.h"
|
||||||
|
|
||||||
#if GOOGLE_CUDA
|
#if GOOGLE_CUDA
|
||||||
#include "third_party/gpus/cuda/include/cuda.h"
|
#include "cuda/include/cuda.h"
|
||||||
#include "tensorflow/core/platform/stream_executor.h"
|
#include "tensorflow/core/platform/stream_executor.h"
|
||||||
#endif // GOOGLE_CUDA
|
#endif // GOOGLE_CUDA
|
||||||
|
|
||||||
|
@ -9,7 +9,7 @@ exports_files(["LICENSE"])
|
|||||||
|
|
||||||
load("//tensorflow:tensorflow.bzl", "tf_copts")
|
load("//tensorflow:tensorflow.bzl", "tf_copts")
|
||||||
load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
|
load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
|
||||||
load("//third_party/gpus/cuda:platform.bzl", "cuda_library_path")
|
load("@local_config_cuda//cuda:platform.bzl", "cuda_library_path")
|
||||||
|
|
||||||
cc_library(
|
cc_library(
|
||||||
name = "gtest",
|
name = "gtest",
|
||||||
@ -32,7 +32,7 @@ tf_cuda_library(
|
|||||||
deps = [
|
deps = [
|
||||||
"//tensorflow/stream_executor",
|
"//tensorflow/stream_executor",
|
||||||
] + select({
|
] + select({
|
||||||
"//third_party/gpus/cuda:darwin": ["IOKit"],
|
"@local_config_cuda//cuda:darwin": ["IOKit"],
|
||||||
"//conditions:default": [],
|
"//conditions:default": [],
|
||||||
}),
|
}),
|
||||||
)
|
)
|
||||||
@ -91,20 +91,20 @@ filegroup(
|
|||||||
cc_library(
|
cc_library(
|
||||||
name = "cuda",
|
name = "cuda",
|
||||||
data = [
|
data = [
|
||||||
"//third_party/gpus/cuda:{}".format(cuda_library_path("cudart")),
|
"@local_config_cuda//cuda:{}".format(cuda_library_path("cudart")),
|
||||||
],
|
],
|
||||||
linkopts = select({
|
linkopts = select({
|
||||||
"//third_party/gpus/cuda:darwin": [
|
"@local_config_cuda//cuda:darwin": [
|
||||||
"-Wl,-rpath,third_party/gpus/cuda/lib",
|
"-Wl,-rpath,../local_config_cuda/cuda/lib",
|
||||||
"-Wl,-rpath,third_party/gpus/cuda/extras/CUPTI/lib",
|
"-Wl,-rpath,../local_config_cuda/cuda/extras/CUPTI/lib",
|
||||||
],
|
],
|
||||||
"//conditions:default": [
|
"//conditions:default": [
|
||||||
"-Wl,-rpath,third_party/gpus/cuda/lib64",
|
"-Wl,-rpath,../local_config_cuda/cuda/lib64",
|
||||||
"-Wl,-rpath,third_party/gpus/cuda/extras/CUPTI/lib64",
|
"-Wl,-rpath,../local_config_cuda/cuda/extras/CUPTI/lib64",
|
||||||
],
|
],
|
||||||
}),
|
}),
|
||||||
deps = [
|
deps = [
|
||||||
"//third_party/gpus/cuda:cudart",
|
"@local_config_cuda//cuda:cudart",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -15,9 +15,9 @@ tf_cuda_library(
|
|||||||
copts = tf_copts(),
|
copts = tf_copts(),
|
||||||
cuda_deps = [
|
cuda_deps = [
|
||||||
"//tensorflow/core:stream_executor",
|
"//tensorflow/core:stream_executor",
|
||||||
"//third_party/gpus/cuda:cuda_headers",
|
"@local_config_cuda//cuda:cuda_headers",
|
||||||
"//third_party/gpus/cuda:cupti_headers",
|
"@local_config_cuda//cuda:cupti_headers",
|
||||||
],
|
],
|
||||||
data = ["//third_party/gpus/cuda:cupti_dsos"],
|
data = ["@local_config_cuda//cuda:cupti_dsos"],
|
||||||
visibility = ["//visibility:public"],
|
visibility = ["//visibility:public"],
|
||||||
)
|
)
|
||||||
|
@ -21,7 +21,7 @@ limitations under the License.
|
|||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#include "third_party/gpus/cuda/extras/CUPTI/include/cupti.h"
|
#include "cuda/extras/CUPTI/include/cupti.h"
|
||||||
|
|
||||||
namespace perftools {
|
namespace perftools {
|
||||||
namespace gputools {
|
namespace gputools {
|
||||||
|
@ -16,7 +16,7 @@ limitations under the License.
|
|||||||
#include "tensorflow/core/util/port.h"
|
#include "tensorflow/core/util/port.h"
|
||||||
|
|
||||||
#if GOOGLE_CUDA
|
#if GOOGLE_CUDA
|
||||||
#include "third_party/gpus/cuda/include/cuda.h"
|
#include "cuda/include/cuda.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
namespace tensorflow {
|
namespace tensorflow {
|
||||||
|
@ -27,9 +27,10 @@ cc_library(
|
|||||||
]),
|
]),
|
||||||
data = [
|
data = [
|
||||||
"//tensorflow/core:cuda",
|
"//tensorflow/core:cuda",
|
||||||
"//third_party/gpus/cuda:cublas",
|
"@local_config_cuda//cuda:cublas",
|
||||||
"//third_party/gpus/cuda:cudnn",
|
"@local_config_cuda//cuda:cudnn",
|
||||||
"//third_party/gpus/cuda:cufft",
|
"@local_config_cuda//cuda:cufft",
|
||||||
|
"@local_config_cuda//cuda:curand",
|
||||||
],
|
],
|
||||||
linkopts = [
|
linkopts = [
|
||||||
"-ldl",
|
"-ldl",
|
||||||
@ -37,7 +38,7 @@ cc_library(
|
|||||||
visibility = ["//visibility:public"],
|
visibility = ["//visibility:public"],
|
||||||
deps = [
|
deps = [
|
||||||
"//tensorflow/core:lib",
|
"//tensorflow/core:lib",
|
||||||
"//third_party/gpus/cuda:cuda_headers",
|
"@local_config_cuda//cuda:cuda_headers",
|
||||||
],
|
],
|
||||||
alwayslink = 1,
|
alwayslink = 1,
|
||||||
)
|
)
|
||||||
|
@ -18,8 +18,8 @@ limitations under the License.
|
|||||||
// cuda.h). This ensures that Eigen's Half.h does not attempt to make its own
|
// cuda.h). This ensures that Eigen's Half.h does not attempt to make its own
|
||||||
// __half typedef if CUDA has already defined one (and conversely, that we do
|
// __half typedef if CUDA has already defined one (and conversely, that we do
|
||||||
// not include <cuda_fp16.h> after Half.h has made its typedef).
|
// not include <cuda_fp16.h> after Half.h has made its typedef).
|
||||||
#include "third_party/gpus/cuda/include/cuda.h"
|
#include "cuda/include/cuda.h"
|
||||||
#include "third_party/gpus/cuda/include/cublas_v2.h"
|
#include "cuda/include/cublas_v2.h"
|
||||||
|
|
||||||
#if CUDA_VERSION >= 7050
|
#if CUDA_VERSION >= 7050
|
||||||
#define EIGEN_HAS_CUDA_FP16
|
#define EIGEN_HAS_CUDA_FP16
|
||||||
|
@ -39,7 +39,7 @@ limitations under the License.
|
|||||||
#include "tensorflow/stream_executor/stream.h"
|
#include "tensorflow/stream_executor/stream.h"
|
||||||
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
|
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
|
||||||
// clang-format off
|
// clang-format off
|
||||||
#include "third_party/gpus/cuda/include/cudnn.h"
|
#include "cuda/include/cudnn.h"
|
||||||
// clang-format on
|
// clang-format on
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
@ -25,7 +25,7 @@ limitations under the License.
|
|||||||
#include "tensorflow/stream_executor/lib/status.h"
|
#include "tensorflow/stream_executor/lib/status.h"
|
||||||
#include "tensorflow/stream_executor/lib/statusor.h"
|
#include "tensorflow/stream_executor/lib/statusor.h"
|
||||||
#include "tensorflow/stream_executor/platform/port.h"
|
#include "tensorflow/stream_executor/platform/port.h"
|
||||||
#include "third_party/gpus/cuda/include/cuda.h"
|
#include "cuda/include/cuda.h"
|
||||||
|
|
||||||
namespace perftools {
|
namespace perftools {
|
||||||
namespace gputools {
|
namespace gputools {
|
||||||
|
@ -23,7 +23,7 @@ limitations under the License.
|
|||||||
#include "tensorflow/stream_executor/fft.h"
|
#include "tensorflow/stream_executor/fft.h"
|
||||||
#include "tensorflow/stream_executor/platform/port.h"
|
#include "tensorflow/stream_executor/platform/port.h"
|
||||||
#include "tensorflow/stream_executor/plugin_registry.h"
|
#include "tensorflow/stream_executor/plugin_registry.h"
|
||||||
#include "third_party/gpus/cuda/include/cufft.h"
|
#include "cuda/include/cufft.h"
|
||||||
|
|
||||||
namespace perftools {
|
namespace perftools {
|
||||||
namespace gputools {
|
namespace gputools {
|
||||||
|
@ -24,8 +24,8 @@ limitations under the License.
|
|||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <complex>
|
#include <complex>
|
||||||
|
|
||||||
#include "third_party/gpus/cuda/include/cuComplex.h"
|
#include "cuda/include/cuComplex.h"
|
||||||
#include "third_party/gpus/cuda/include/cuda.h"
|
#include "cuda/include/cuda.h"
|
||||||
|
|
||||||
namespace perftools {
|
namespace perftools {
|
||||||
namespace gputools {
|
namespace gputools {
|
||||||
|
@ -28,7 +28,7 @@ limitations under the License.
|
|||||||
#include "tensorflow/stream_executor/lib/casts.h"
|
#include "tensorflow/stream_executor/lib/casts.h"
|
||||||
#include "tensorflow/stream_executor/platform/port.h"
|
#include "tensorflow/stream_executor/platform/port.h"
|
||||||
#include "tensorflow/stream_executor/platform/logging.h"
|
#include "tensorflow/stream_executor/platform/logging.h"
|
||||||
#include "third_party/gpus/cuda/include/cuda.h"
|
#include "cuda/include/cuda.h"
|
||||||
|
|
||||||
#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
|
#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
|
||||||
#error \
|
#error \
|
||||||
|
@ -28,7 +28,7 @@ limitations under the License.
|
|||||||
#include "tensorflow/stream_executor/lib/status.h"
|
#include "tensorflow/stream_executor/lib/status.h"
|
||||||
#include "tensorflow/stream_executor/platform/logging.h"
|
#include "tensorflow/stream_executor/platform/logging.h"
|
||||||
#include "tensorflow/stream_executor/rng.h"
|
#include "tensorflow/stream_executor/rng.h"
|
||||||
#include "third_party/gpus/cuda/include/curand.h"
|
#include "cuda/include/curand.h"
|
||||||
|
|
||||||
// Formats curandStatus_t to output prettified values into a log stream.
|
// Formats curandStatus_t to output prettified values into a log stream.
|
||||||
std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) {
|
std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) {
|
||||||
|
@ -28,23 +28,22 @@ limitations under the License.
|
|||||||
#include "tensorflow/core/platform/load_library.h"
|
#include "tensorflow/core/platform/load_library.h"
|
||||||
#include "tensorflow/stream_executor/lib/error.h"
|
#include "tensorflow/stream_executor/lib/error.h"
|
||||||
#include "tensorflow/stream_executor/lib/str_util.h"
|
#include "tensorflow/stream_executor/lib/str_util.h"
|
||||||
|
#include "tensorflow/stream_executor/lib/str_util.h"
|
||||||
#include "tensorflow/stream_executor/lib/strcat.h"
|
#include "tensorflow/stream_executor/lib/strcat.h"
|
||||||
#include "tensorflow/stream_executor/lib/stringprintf.h"
|
#include "tensorflow/stream_executor/lib/stringprintf.h"
|
||||||
#include "tensorflow/stream_executor/platform/logging.h"
|
#include "tensorflow/stream_executor/platform/logging.h"
|
||||||
#include "tensorflow/stream_executor/platform/port.h"
|
#include "tensorflow/stream_executor/platform/port.h"
|
||||||
#include "tensorflow/stream_executor/lib/str_util.h"
|
|
||||||
|
|
||||||
namespace perftools {
|
namespace perftools {
|
||||||
namespace gputools {
|
namespace gputools {
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
// TensorFlow OSS configure uses the following lines to configure versions. For
|
string GetCudaVersion() { return TF_CUDA_VERSION; }
|
||||||
// any modifications of the format, please make sure the script still works.
|
string GetCudnnVersion() { return TF_CUDNN_VERSION; }
|
||||||
string GetCudaVersion() { return ""; }
|
|
||||||
string GetCudnnVersion() { return ""; }
|
|
||||||
|
|
||||||
/* static */ port::Status DsoLoader::GetCublasDsoHandle(void** dso_handle) {
|
/* static */ port::Status DsoLoader::GetCublasDsoHandle(void** dso_handle) {
|
||||||
return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName("cublas", GetCudaVersion()),
|
return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName(
|
||||||
|
"cublas", GetCudaVersion()),
|
||||||
GetCudaLibraryDirPath()),
|
GetCudaLibraryDirPath()),
|
||||||
dso_handle);
|
dso_handle);
|
||||||
}
|
}
|
||||||
@ -53,35 +52,38 @@ string GetCudnnVersion() { return ""; }
|
|||||||
// libcudnn is versioned differently than the other libraries and may have a
|
// libcudnn is versioned differently than the other libraries and may have a
|
||||||
// different version number than other CUDA libraries. See b/22397368 for
|
// different version number than other CUDA libraries. See b/22397368 for
|
||||||
// some details about the complications surrounding this.
|
// some details about the complications surrounding this.
|
||||||
return GetDsoHandle(
|
return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName(
|
||||||
FindDsoPath(tensorflow::internal::FormatLibraryFileName("cudnn", GetCudnnVersion()),
|
"cudnn", GetCudnnVersion()),
|
||||||
GetCudaLibraryDirPath()),
|
GetCudaLibraryDirPath()),
|
||||||
dso_handle);
|
dso_handle);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ port::Status DsoLoader::GetCufftDsoHandle(void** dso_handle) {
|
/* static */ port::Status DsoLoader::GetCufftDsoHandle(void** dso_handle) {
|
||||||
return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName("cufft", GetCudaVersion()),
|
return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName(
|
||||||
|
"cufft", GetCudaVersion()),
|
||||||
GetCudaLibraryDirPath()),
|
GetCudaLibraryDirPath()),
|
||||||
dso_handle);
|
dso_handle);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ port::Status DsoLoader::GetCurandDsoHandle(void** dso_handle) {
|
/* static */ port::Status DsoLoader::GetCurandDsoHandle(void** dso_handle) {
|
||||||
return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName("curand", GetCudaVersion()),
|
return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName(
|
||||||
|
"curand", GetCudaVersion()),
|
||||||
GetCudaLibraryDirPath()),
|
GetCudaLibraryDirPath()),
|
||||||
dso_handle);
|
dso_handle);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ port::Status DsoLoader::GetLibcudaDsoHandle(void** dso_handle) {
|
/* static */ port::Status DsoLoader::GetLibcudaDsoHandle(void** dso_handle) {
|
||||||
return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName("cuda", "1"),
|
return GetDsoHandle(
|
||||||
GetCudaDriverLibraryPath()),
|
FindDsoPath(tensorflow::internal::FormatLibraryFileName("cuda", "1"),
|
||||||
dso_handle);
|
GetCudaDriverLibraryPath()),
|
||||||
|
dso_handle);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ port::Status DsoLoader::GetLibcuptiDsoHandle(void** dso_handle) {
|
/* static */ port::Status DsoLoader::GetLibcuptiDsoHandle(void** dso_handle) {
|
||||||
return GetDsoHandle(
|
return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName(
|
||||||
FindDsoPath(tensorflow::internal::FormatLibraryFileName("cupti", GetCudaVersion()),
|
"cupti", GetCudaVersion()),
|
||||||
GetCudaCuptiLibraryPath()),
|
GetCudaCuptiLibraryPath()),
|
||||||
dso_handle);
|
dso_handle);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ void DsoLoader::RegisterRpath(port::StringPiece path) {
|
/* static */ void DsoLoader::RegisterRpath(port::StringPiece path) {
|
||||||
@ -89,11 +91,9 @@ string GetCudnnVersion() { return ""; }
|
|||||||
GetRpaths()->push_back(path.ToString());
|
GetRpaths()->push_back(path.ToString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* static */ port::Status DsoLoader::GetDsoHandle(port::StringPiece path,
|
/* static */ port::Status DsoLoader::GetDsoHandle(port::StringPiece path,
|
||||||
void** dso_handle,
|
void** dso_handle,
|
||||||
LoadKind load_kind) {
|
LoadKind load_kind) {
|
||||||
|
|
||||||
int dynload_flags =
|
int dynload_flags =
|
||||||
RTLD_LAZY | (load_kind == LoadKind::kLocal ? RTLD_LOCAL : RTLD_GLOBAL);
|
RTLD_LAZY | (load_kind == LoadKind::kLocal ? RTLD_LOCAL : RTLD_GLOBAL);
|
||||||
string path_string = path.ToString();
|
string path_string = path.ToString();
|
||||||
@ -138,9 +138,9 @@ string GetCudnnVersion() { return ""; }
|
|||||||
static std::vector<string>* CreatePrimordialRpaths() {
|
static std::vector<string>* CreatePrimordialRpaths() {
|
||||||
auto rpaths = new std::vector<string>;
|
auto rpaths = new std::vector<string>;
|
||||||
#if defined(__APPLE__)
|
#if defined(__APPLE__)
|
||||||
rpaths->push_back("driver/driver_sh.runfiles/org_tensorflow/third_party/gpus/cuda/lib");
|
rpaths->push_back("driver/driver_sh.runfiles/local_config_cuda/cuda/lib");
|
||||||
#else
|
#else
|
||||||
rpaths->push_back("driver/driver_sh.runfiles/org_tensorflow/third_party/gpus/cuda/lib64");
|
rpaths->push_back("driver/driver_sh.runfiles/local_config_cuda/cuda/lib64");
|
||||||
#endif
|
#endif
|
||||||
return rpaths;
|
return rpaths;
|
||||||
}
|
}
|
||||||
@ -165,7 +165,6 @@ static std::vector<string>* CreatePrimordialRpaths() {
|
|||||||
|
|
||||||
/* static */ string DsoLoader::FindDsoPath(port::StringPiece library_name,
|
/* static */ string DsoLoader::FindDsoPath(port::StringPiece library_name,
|
||||||
port::StringPiece runfiles_relpath) {
|
port::StringPiece runfiles_relpath) {
|
||||||
|
|
||||||
// Keep a record of the paths we attempted so we can dump out meaningful
|
// Keep a record of the paths we attempted so we can dump out meaningful
|
||||||
// diagnostics if no path is found.
|
// diagnostics if no path is found.
|
||||||
std::vector<string> attempted;
|
std::vector<string> attempted;
|
||||||
@ -191,29 +190,28 @@ static std::vector<string>* CreatePrimordialRpaths() {
|
|||||||
|
|
||||||
/* static */ string DsoLoader::GetCudaLibraryDirPath() {
|
/* static */ string DsoLoader::GetCudaLibraryDirPath() {
|
||||||
#if defined(__APPLE__)
|
#if defined(__APPLE__)
|
||||||
return "third_party/gpus/cuda/lib";
|
return "external/local_config_cuda/cuda/lib";
|
||||||
#else
|
#else
|
||||||
return "third_party/gpus/cuda/lib64";
|
return "external/local_config_cuda/cuda/lib64";
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ string DsoLoader::GetCudaDriverLibraryPath() {
|
/* static */ string DsoLoader::GetCudaDriverLibraryPath() {
|
||||||
#if defined(__APPLE__)
|
#if defined(__APPLE__)
|
||||||
return "third_party/gpus/cuda/driver/lib";
|
return "external/local_config_cuda/cuda/driver/lib";
|
||||||
#else
|
#else
|
||||||
return "third_party/gpus/cuda/driver/lib64";
|
return "external/local_config_cuda/cuda/driver/lib64";
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ string DsoLoader::GetCudaCuptiLibraryPath() {
|
/* static */ string DsoLoader::GetCudaCuptiLibraryPath() {
|
||||||
#if defined(__APPLE__)
|
#if defined(__APPLE__)
|
||||||
return "third_party/gpus/cuda/extras/CUPTI/lib";
|
return "external/local_config_cuda/cuda/extras/CUPTI/lib";
|
||||||
#else
|
#else
|
||||||
return "third_party/gpus/cuda/extras/CUPTI/lib64";
|
return "external/local_config_cuda/cuda/extras/CUPTI/lib64";
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// -- CachedDsoLoader
|
// -- CachedDsoLoader
|
||||||
|
|
||||||
/* static */ port::StatusOr<void*> CachedDsoLoader::GetCublasDsoHandle() {
|
/* static */ port::StatusOr<void*> CachedDsoLoader::GetCublasDsoHandle() {
|
||||||
|
@ -22,6 +22,7 @@ limitations under the License.
|
|||||||
#include "tensorflow/stream_executor/platform/port.h"
|
#include "tensorflow/stream_executor/platform/port.h"
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "cuda/cuda_config.h"
|
||||||
#include "tensorflow/stream_executor/lib/status.h"
|
#include "tensorflow/stream_executor/lib/status.h"
|
||||||
#include "tensorflow/stream_executor/lib/statusor.h"
|
#include "tensorflow/stream_executor/lib/statusor.h"
|
||||||
#include "tensorflow/stream_executor/lib/stringpiece.h"
|
#include "tensorflow/stream_executor/lib/stringpiece.h"
|
||||||
|
@ -32,7 +32,7 @@ load(
|
|||||||
"tf_cuda_tests_tags",
|
"tf_cuda_tests_tags",
|
||||||
)
|
)
|
||||||
load(
|
load(
|
||||||
"//third_party/gpus/cuda:build_defs.bzl",
|
"@local_config_cuda//cuda:build_defs.bzl",
|
||||||
"if_cuda",
|
"if_cuda",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -295,11 +295,11 @@ def tf_cc_tests(tests, deps, linkstatic=0, tags=[], size="medium", args=None,
|
|||||||
tf_cc_test(t, deps, linkstatic, tags=tags, size=size, args=args,
|
tf_cc_test(t, deps, linkstatic, tags=tags, size=size, args=args,
|
||||||
linkopts=linkopts)
|
linkopts=linkopts)
|
||||||
|
|
||||||
def tf_cc_tests_gpu(tests, deps, linkstatic=0, tags=[], size="medium", args=None):
|
def tf_cc_tests_gpu(tests, deps, linkstatic=0, tags=[], size="medium",
|
||||||
|
args=None):
|
||||||
tf_cc_tests(tests, deps, linkstatic, tags=tags, size=size, args=args)
|
tf_cc_tests(tests, deps, linkstatic, tags=tags, size=size, args=args)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def tf_cuda_cc_tests(tests, deps, tags=[], size="medium", linkstatic=0,
|
def tf_cuda_cc_tests(tests, deps, tags=[], size="medium", linkstatic=0,
|
||||||
args=None, linkopts=[]):
|
args=None, linkopts=[]):
|
||||||
for t in tests:
|
for t in tests:
|
||||||
@ -316,29 +316,29 @@ def _cuda_copts():
|
|||||||
common_cuda_opts = ["-x", "cuda", "-DGOOGLE_CUDA=1"]
|
common_cuda_opts = ["-x", "cuda", "-DGOOGLE_CUDA=1"]
|
||||||
return select({
|
return select({
|
||||||
"//conditions:default": [],
|
"//conditions:default": [],
|
||||||
"//third_party/gpus/cuda:using_nvcc": (
|
"@local_config_cuda//cuda:using_nvcc": (
|
||||||
common_cuda_opts +
|
common_cuda_opts +
|
||||||
[
|
[
|
||||||
"-nvcc_options=relaxed-constexpr",
|
"-nvcc_options=relaxed-constexpr",
|
||||||
"-nvcc_options=ftz=true",
|
"-nvcc_options=ftz=true",
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
"//third_party/gpus/cuda:using_gcudacc": (
|
"@local_config_cuda//cuda:using_gcudacc": (
|
||||||
common_cuda_opts +
|
common_cuda_opts +
|
||||||
["--gcudacc_flag=-ftz=true"]
|
["--gcudacc_flag=-ftz=true"]
|
||||||
),
|
),
|
||||||
"//third_party/gpus/cuda:using_clang": (
|
"@local_config_cuda//cuda:using_clang": (
|
||||||
common_cuda_opts +
|
common_cuda_opts +
|
||||||
[
|
[
|
||||||
"-fcuda-flush-denormals-to-zero",
|
"-fcuda-flush-denormals-to-zero",
|
||||||
"--cuda-path=third_party/gpus/cuda",
|
"--cuda-path=external/local_config_cuda/cuda",
|
||||||
"--cuda-gpu-arch=sm_35",
|
"--cuda-gpu-arch=sm_35",
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
}) + select({
|
}) + select({
|
||||||
# Pass -O3 when building CUDA code with clang; some important
|
# Pass -O3 when building CUDA code with clang; some important
|
||||||
# optimizations are not enabled at O2.
|
# optimizations are not enabled at O2.
|
||||||
"//third_party/gpus/cuda:using_clang_opt": ["-O3"],
|
"@local_config_cuda//cuda:using_clang_opt": ["-O3"],
|
||||||
"//conditions:default": [],
|
"//conditions:default": [],
|
||||||
})
|
})
|
||||||
|
|
||||||
@ -409,7 +409,8 @@ def tf_kernel_library(name, prefix=None, srcs=None, gpu_srcs=None, hdrs=None,
|
|||||||
* srcs = ["cwise_op_abs.cc", ..., "cwise_op_tanh.cc"],
|
* srcs = ["cwise_op_abs.cc", ..., "cwise_op_tanh.cc"],
|
||||||
* hdrs = ["cwise_ops.h", "cwise_ops_common.h"],
|
* hdrs = ["cwise_ops.h", "cwise_ops_common.h"],
|
||||||
* gpu_srcs = ["cwise_op_gpu_abs.cu.cc", ..., "cwise_op_gpu_tanh.cu.cc",
|
* gpu_srcs = ["cwise_op_gpu_abs.cu.cc", ..., "cwise_op_gpu_tanh.cu.cc",
|
||||||
"cwise_ops.h", "cwise_ops_common.h", "cwise_ops_gpu_common.cu.h"]
|
"cwise_ops.h", "cwise_ops_common.h",
|
||||||
|
"cwise_ops_gpu_common.cu.h"]
|
||||||
* "cwise_ops_test.cc" is excluded
|
* "cwise_ops_test.cc" is excluded
|
||||||
"""
|
"""
|
||||||
if not srcs:
|
if not srcs:
|
||||||
@ -613,7 +614,7 @@ check_deps = rule(
|
|||||||
def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]):
|
def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]):
|
||||||
cuda_deps = [
|
cuda_deps = [
|
||||||
"//tensorflow/core:stream_executor_headers_lib",
|
"//tensorflow/core:stream_executor_headers_lib",
|
||||||
"//third_party/gpus/cuda:cudart_static",
|
"@local_config_cuda//cuda:cudart_static",
|
||||||
]
|
]
|
||||||
deps = deps + tf_custom_op_library_additional_deps()
|
deps = deps + tf_custom_op_library_additional_deps()
|
||||||
if gpu_srcs:
|
if gpu_srcs:
|
||||||
@ -663,7 +664,7 @@ def tf_py_wrap_cc(name, srcs, swig_includes=[], deps=[], copts=[], **kwargs):
|
|||||||
module_name=module_name,
|
module_name=module_name,
|
||||||
py_module_name=name)
|
py_module_name=name)
|
||||||
extra_linkopts = select({
|
extra_linkopts = select({
|
||||||
"//third_party/gpus/cuda:darwin": [
|
"@local_config_cuda//cuda:darwin": [
|
||||||
"-Wl,-exported_symbols_list",
|
"-Wl,-exported_symbols_list",
|
||||||
"//tensorflow:tf_exported_symbols.lds"
|
"//tensorflow:tf_exported_symbols.lds"
|
||||||
],
|
],
|
||||||
@ -672,7 +673,7 @@ def tf_py_wrap_cc(name, srcs, swig_includes=[], deps=[], copts=[], **kwargs):
|
|||||||
"//tensorflow:tf_version_script.lds"
|
"//tensorflow:tf_version_script.lds"
|
||||||
]})
|
]})
|
||||||
extra_deps += select({
|
extra_deps += select({
|
||||||
"//third_party/gpus/cuda:darwin": [
|
"@local_config_cuda//cuda:darwin": [
|
||||||
"//tensorflow:tf_exported_symbols.lds"
|
"//tensorflow:tf_exported_symbols.lds"
|
||||||
],
|
],
|
||||||
"//conditions:default": [
|
"//conditions:default": [
|
||||||
@ -746,13 +747,14 @@ def py_tests(name,
|
|||||||
data=data,
|
data=data,
|
||||||
additional_deps=additional_deps)
|
additional_deps=additional_deps)
|
||||||
|
|
||||||
def cuda_py_tests(name, srcs, size="medium", additional_deps=[], data=[], shard_count=1, tags=[], prefix=""):
|
def cuda_py_tests(name, srcs, size="medium", additional_deps=[], data=[],
|
||||||
|
shard_count=1, tags=[], prefix=""):
|
||||||
test_tags = tags + tf_cuda_tests_tags()
|
test_tags = tags + tf_cuda_tests_tags()
|
||||||
py_tests(name=name, size=size, srcs=srcs, additional_deps=additional_deps,
|
py_tests(name=name, size=size, srcs=srcs, additional_deps=additional_deps,
|
||||||
data=data, tags=test_tags, shard_count=shard_count,prefix=prefix)
|
data=data, tags=test_tags, shard_count=shard_count,prefix=prefix)
|
||||||
|
|
||||||
# Creates a genrule named <name> for running tools/proto_text's generator to make
|
# Creates a genrule named <name> for running tools/proto_text's generator to
|
||||||
# the proto_text functions, for the protos passed in <srcs>.
|
# make the proto_text functions, for the protos passed in <srcs>.
|
||||||
#
|
#
|
||||||
# Return a struct with fields (hdrs, srcs) containing the names of the
|
# Return a struct with fields (hdrs, srcs) containing the names of the
|
||||||
# generated files.
|
# generated files.
|
||||||
|
@ -22,5 +22,6 @@ ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
|
|||||||
|
|
||||||
# Configure the build for our CUDA configuration.
|
# Configure the build for our CUDA configuration.
|
||||||
ENV CUDA_TOOLKIT_PATH /usr/local/cuda
|
ENV CUDA_TOOLKIT_PATH /usr/local/cuda
|
||||||
ENV CUDNN_INSTALL_PATH /usr/local/cuda
|
ENV CUDNN_INSTALL_PATH /usr/lib/x86_64-linux-gnu
|
||||||
ENV TF_NEED_CUDA 1
|
ENV TF_NEED_CUDA 1
|
||||||
|
ENV CUDA_COMPUTE_CAPABILITIES 3.0,5.2
|
||||||
|
@ -1,9 +1,12 @@
|
|||||||
# TensorFlow external dependencies that can be loaded in WORKSPACE files.
|
# TensorFlow external dependencies that can be loaded in WORKSPACE files.
|
||||||
|
|
||||||
|
load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
|
||||||
|
|
||||||
# If TensorFlow is linked as a submodule, path_prefix is TensorFlow's directory
|
# If TensorFlow is linked as a submodule, path_prefix is TensorFlow's directory
|
||||||
# within the workspace (e.g. "tensorflow/"), and tf_repo_name is the name of the
|
# within the workspace (e.g. "tensorflow/"), and tf_repo_name is the name of the
|
||||||
# local_repository rule (e.g. "@tf").
|
# local_repository rule (e.g. "@tf").
|
||||||
def tf_workspace(path_prefix = "", tf_repo_name = ""):
|
def tf_workspace(path_prefix = "", tf_repo_name = ""):
|
||||||
|
cuda_configure(name = "local_config_cuda")
|
||||||
|
|
||||||
# These lines need to be changed when updating Eigen. They are parsed from
|
# These lines need to be changed when updating Eigen. They are parsed from
|
||||||
# this file by the cmake and make builds to determine the eigen version and hash.
|
# this file by the cmake and make builds to determine the eigen version and hash.
|
||||||
|
0
third_party/gpus/BUILD
vendored
Normal file
0
third_party/gpus/BUILD
vendored
Normal file
42
third_party/gpus/crosstool/BUILD
vendored
42
third_party/gpus/crosstool/BUILD
vendored
@ -1,42 +0,0 @@
|
|||||||
licenses(["restricted"])
|
|
||||||
|
|
||||||
package(default_visibility = ["//visibility:public"])
|
|
||||||
|
|
||||||
filegroup(
|
|
||||||
name = "crosstool",
|
|
||||||
srcs = ["CROSSTOOL"],
|
|
||||||
output_licenses = ["unencumbered"],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_toolchain(
|
|
||||||
name = "cc-compiler-local",
|
|
||||||
all_files = ":empty",
|
|
||||||
compiler_files = ":empty",
|
|
||||||
cpu = "local",
|
|
||||||
dwp_files = ":empty",
|
|
||||||
dynamic_runtime_libs = [":empty"],
|
|
||||||
linker_files = ":empty",
|
|
||||||
objcopy_files = ":empty",
|
|
||||||
static_runtime_libs = [":empty"],
|
|
||||||
strip_files = ":empty",
|
|
||||||
supports_param_files = 0,
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_toolchain(
|
|
||||||
name = "cc-compiler-darwin",
|
|
||||||
all_files = ":empty",
|
|
||||||
compiler_files = ":empty",
|
|
||||||
cpu = "darwin",
|
|
||||||
dwp_files = ":empty",
|
|
||||||
dynamic_runtime_libs = [":empty"],
|
|
||||||
linker_files = ":empty",
|
|
||||||
objcopy_files = ":empty",
|
|
||||||
static_runtime_libs = [":empty"],
|
|
||||||
strip_files = ":empty",
|
|
||||||
supports_param_files = 0,
|
|
||||||
)
|
|
||||||
|
|
||||||
filegroup(
|
|
||||||
name = "empty",
|
|
||||||
srcs = [],
|
|
||||||
)
|
|
42
third_party/gpus/crosstool/BUILD.tpl
vendored
Normal file
42
third_party/gpus/crosstool/BUILD.tpl
vendored
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
licenses(["restricted"])
|
||||||
|
|
||||||
|
package(default_visibility = ["//visibility:public"])
|
||||||
|
|
||||||
|
filegroup(
|
||||||
|
name = "crosstool",
|
||||||
|
srcs = ["CROSSTOOL"],
|
||||||
|
output_licenses = ["unencumbered"],
|
||||||
|
)
|
||||||
|
|
||||||
|
cc_toolchain(
|
||||||
|
name = "cc-compiler-local",
|
||||||
|
all_files = ":empty",
|
||||||
|
compiler_files = ":empty",
|
||||||
|
cpu = "local",
|
||||||
|
dwp_files = ":empty",
|
||||||
|
dynamic_runtime_libs = [":empty"],
|
||||||
|
linker_files = ":empty",
|
||||||
|
objcopy_files = ":empty",
|
||||||
|
static_runtime_libs = [":empty"],
|
||||||
|
strip_files = ":empty",
|
||||||
|
supports_param_files = 0,
|
||||||
|
)
|
||||||
|
|
||||||
|
cc_toolchain(
|
||||||
|
name = "cc-compiler-darwin",
|
||||||
|
all_files = ":empty",
|
||||||
|
compiler_files = ":empty",
|
||||||
|
cpu = "darwin",
|
||||||
|
dwp_files = ":empty",
|
||||||
|
dynamic_runtime_libs = [":empty"],
|
||||||
|
linker_files = ":empty",
|
||||||
|
objcopy_files = ":empty",
|
||||||
|
static_runtime_libs = [":empty"],
|
||||||
|
strip_files = ":empty",
|
||||||
|
supports_param_files = 0,
|
||||||
|
)
|
||||||
|
|
||||||
|
filegroup(
|
||||||
|
name = "empty",
|
||||||
|
srcs = [],
|
||||||
|
)
|
@ -47,7 +47,7 @@ toolchain {
|
|||||||
tool_path { name: "cpp" path: "/usr/bin/cpp" }
|
tool_path { name: "cpp" path: "/usr/bin/cpp" }
|
||||||
tool_path { name: "dwp" path: "/usr/bin/dwp" }
|
tool_path { name: "dwp" path: "/usr/bin/dwp" }
|
||||||
# As part of the TensorFlow release, we place some cuda-related compilation
|
# As part of the TensorFlow release, we place some cuda-related compilation
|
||||||
# files in third_party/gpus/crosstool/clang/bin, and this relative
|
# files in @local_config_cuda//crosstool/clang/bin, and this relative
|
||||||
# path, combined with the rest of our Bazel configuration causes our
|
# path, combined with the rest of our Bazel configuration causes our
|
||||||
# compilation to use those files.
|
# compilation to use those files.
|
||||||
tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
|
tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
|
||||||
@ -125,6 +125,9 @@ toolchain {
|
|||||||
# linker_flag: "-Wl,--warn-execstack"
|
# linker_flag: "-Wl,--warn-execstack"
|
||||||
# linker_flag: "-Wl,--detect-odr-violations"
|
# linker_flag: "-Wl,--detect-odr-violations"
|
||||||
|
|
||||||
|
# Include directory for cuda headers.
|
||||||
|
cxx_builtin_include_directory: "/usr/local/cuda%{cuda_version}/include"
|
||||||
|
|
||||||
compilation_mode_flags {
|
compilation_mode_flags {
|
||||||
mode: DBG
|
mode: DBG
|
||||||
# Enable debug symbols.
|
# Enable debug symbols.
|
||||||
@ -221,6 +224,9 @@ toolchain {
|
|||||||
# Anticipated future default.
|
# Anticipated future default.
|
||||||
linker_flag: "-no-canonical-prefixes"
|
linker_flag: "-no-canonical-prefixes"
|
||||||
|
|
||||||
|
# Include directory for cuda headers.
|
||||||
|
cxx_builtin_include_directory: "/usr/local/cuda%{cuda_version}/include"
|
||||||
|
|
||||||
compilation_mode_flags {
|
compilation_mode_flags {
|
||||||
mode: DBG
|
mode: DBG
|
||||||
# Enable debug symbols.
|
# Enable debug symbols.
|
@ -45,10 +45,9 @@ import re
|
|||||||
import sys
|
import sys
|
||||||
import pipes
|
import pipes
|
||||||
|
|
||||||
# "configure" uses the specific format to substitute the following string.
|
# Template values set by cuda_autoconf.
|
||||||
# If you change it, make sure you modify "configure" as well.
|
CPU_COMPILER = ('%{cpu_compiler}')
|
||||||
CPU_COMPILER = ('/usr/bin/gcc')
|
GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}')
|
||||||
GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
|
|
||||||
|
|
||||||
CURRENT_DIR = os.path.dirname(sys.argv[0])
|
CURRENT_DIR = os.path.dirname(sys.argv[0])
|
||||||
NVCC_PATH = CURRENT_DIR + '/../../../cuda/bin/nvcc'
|
NVCC_PATH = CURRENT_DIR + '/../../../cuda/bin/nvcc'
|
||||||
@ -229,9 +228,7 @@ def InvokeNvcc(argv, log=False):
|
|||||||
srcs = ' '.join(src_files)
|
srcs = ' '.join(src_files)
|
||||||
out = ' -o ' + out_file[0]
|
out = ' -o ' + out_file[0]
|
||||||
|
|
||||||
# "configure" uses the specific format to substitute the following string.
|
supported_cuda_compute_capabilities = [ %{cuda_compute_capabilities} ]
|
||||||
# If you change it, make sure you modify "configure" as well.
|
|
||||||
supported_cuda_compute_capabilities = [ "3.5", "5.2" ]
|
|
||||||
nvccopts = ''
|
nvccopts = ''
|
||||||
for capability in supported_cuda_compute_capabilities:
|
for capability in supported_cuda_compute_capabilities:
|
||||||
capability = capability.replace('.', '')
|
capability = capability.replace('.', '')
|
224
third_party/gpus/cuda/BUILD
vendored
224
third_party/gpus/cuda/BUILD
vendored
@ -1,224 +0,0 @@
|
|||||||
licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like
|
|
||||||
|
|
||||||
load("//third_party/gpus/cuda:build_defs.bzl", "if_cuda")
|
|
||||||
load("platform", "cuda_library_path")
|
|
||||||
load("platform", "cuda_static_library_path")
|
|
||||||
load("platform", "cudnn_library_path")
|
|
||||||
load("platform", "cupti_library_path")
|
|
||||||
load("platform", "readlink_command")
|
|
||||||
|
|
||||||
package(default_visibility = ["//visibility:public"])
|
|
||||||
|
|
||||||
config_setting(
|
|
||||||
name = "using_gcudacc",
|
|
||||||
values = {
|
|
||||||
"define": "using_cuda_gcudacc=true",
|
|
||||||
},
|
|
||||||
visibility = ["//visibility:public"],
|
|
||||||
)
|
|
||||||
|
|
||||||
config_setting(
|
|
||||||
name = "using_nvcc",
|
|
||||||
values = {
|
|
||||||
"define": "using_cuda_nvcc=true",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
config_setting(
|
|
||||||
name = "using_clang",
|
|
||||||
values = {
|
|
||||||
"define": "using_cuda_clang=true",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
# Equivalent to using_clang && -c opt.
|
|
||||||
config_setting(
|
|
||||||
name = "using_clang_opt",
|
|
||||||
values = {
|
|
||||||
"define": "using_cuda_clang=true",
|
|
||||||
"compilation_mode": "opt",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
config_setting(
|
|
||||||
name = "darwin",
|
|
||||||
values = {"cpu": "darwin"},
|
|
||||||
visibility = ["//visibility:public"],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_library(
|
|
||||||
name = "cuda_headers",
|
|
||||||
hdrs = glob([
|
|
||||||
"**/*.h",
|
|
||||||
]),
|
|
||||||
includes = [
|
|
||||||
".",
|
|
||||||
"include",
|
|
||||||
],
|
|
||||||
visibility = ["//visibility:public"],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_library(
|
|
||||||
name = "cudart_static",
|
|
||||||
srcs = [
|
|
||||||
cuda_static_library_path("cudart"),
|
|
||||||
],
|
|
||||||
includes = ["include/"],
|
|
||||||
linkopts = [
|
|
||||||
"-ldl",
|
|
||||||
"-lpthread",
|
|
||||||
] + select({
|
|
||||||
"//tensorflow:darwin": [],
|
|
||||||
"//conditions:default": ["-lrt"],
|
|
||||||
}),
|
|
||||||
visibility = ["//visibility:public"],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_library(
|
|
||||||
name = "cudart",
|
|
||||||
srcs = [
|
|
||||||
cuda_library_path("cudart"),
|
|
||||||
],
|
|
||||||
data = [
|
|
||||||
cuda_library_path("cudart"),
|
|
||||||
],
|
|
||||||
includes = ["include/"],
|
|
||||||
linkstatic = 1,
|
|
||||||
visibility = ["//visibility:public"],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_library(
|
|
||||||
name = "cublas",
|
|
||||||
srcs = [
|
|
||||||
cuda_library_path("cublas"),
|
|
||||||
],
|
|
||||||
data = [
|
|
||||||
cuda_library_path("cublas"),
|
|
||||||
],
|
|
||||||
includes = ["include/"],
|
|
||||||
linkstatic = 1,
|
|
||||||
visibility = ["//visibility:public"],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_library(
|
|
||||||
name = "cudnn",
|
|
||||||
srcs = [
|
|
||||||
cudnn_library_path(),
|
|
||||||
],
|
|
||||||
data = [
|
|
||||||
cudnn_library_path(),
|
|
||||||
],
|
|
||||||
includes = ["include/"],
|
|
||||||
linkstatic = 1,
|
|
||||||
visibility = ["//visibility:public"],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_library(
|
|
||||||
name = "cufft",
|
|
||||||
srcs = [
|
|
||||||
cuda_library_path("cufft"),
|
|
||||||
],
|
|
||||||
data = [
|
|
||||||
cuda_library_path("cufft"),
|
|
||||||
],
|
|
||||||
includes = ["include/"],
|
|
||||||
linkstatic = 1,
|
|
||||||
visibility = ["//visibility:public"],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_library(
|
|
||||||
name = "cuda",
|
|
||||||
visibility = ["//visibility:public"],
|
|
||||||
deps = [
|
|
||||||
":cublas",
|
|
||||||
":cuda_headers",
|
|
||||||
":cudart",
|
|
||||||
":cudnn",
|
|
||||||
":cufft",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_library(
|
|
||||||
name = "cupti_headers",
|
|
||||||
hdrs = glob([
|
|
||||||
"**/*.h",
|
|
||||||
]),
|
|
||||||
includes = [
|
|
||||||
".",
|
|
||||||
"extras/CUPTI/include/",
|
|
||||||
],
|
|
||||||
visibility = ["//visibility:public"],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_library(
|
|
||||||
name = "cupti_dsos",
|
|
||||||
data = [
|
|
||||||
cupti_library_path(),
|
|
||||||
],
|
|
||||||
visibility = ["//visibility:public"],
|
|
||||||
)
|
|
||||||
|
|
||||||
# TODO(opensource): for now, we have to invoke the cuda_config.sh manually in the source tree.
|
|
||||||
# This rule checks if Cuda libraries in the source tree has been properly configured.
|
|
||||||
# The output list makes bazel runs this rule first if the Cuda files are missing.
|
|
||||||
# This gives us an opportunity to check and print a meaningful error message.
|
|
||||||
# But we will need to create the output file list to make bazel happy in a successful run.
|
|
||||||
genrule(
|
|
||||||
name = "cuda_check",
|
|
||||||
srcs = [
|
|
||||||
"cuda.config",
|
|
||||||
"cuda_config.sh",
|
|
||||||
],
|
|
||||||
outs = [
|
|
||||||
"include/cuda.h",
|
|
||||||
"include/cublas.h",
|
|
||||||
"include/cudnn.h",
|
|
||||||
"extras/CUPTI/include/cupti.h",
|
|
||||||
cuda_static_library_path("cudart"),
|
|
||||||
cuda_library_path("cublas"),
|
|
||||||
cudnn_library_path(),
|
|
||||||
cuda_library_path("cudart"),
|
|
||||||
cuda_library_path("cufft"),
|
|
||||||
cupti_library_path(),
|
|
||||||
],
|
|
||||||
cmd = if_cuda(
|
|
||||||
# Under cuda config, create all the symbolic links to the actual cuda files
|
|
||||||
"OUTPUTDIR=`{} -f $(@D)/../../..`; cd `dirname $(location :cuda_config.sh)`; OUTPUTDIR=$$OUTPUTDIR ./cuda_config.sh --check;".format(readlink_command()),
|
|
||||||
|
|
||||||
# Under non-cuda config, create all dummy files to make the build go through
|
|
||||||
";".join([
|
|
||||||
"mkdir -p $(@D)/include",
|
|
||||||
"mkdir -p $(@D)/lib64",
|
|
||||||
"mkdir -p $(@D)/extras/CUPTI/include",
|
|
||||||
"mkdir -p $(@D)/extras/CUPTI/lib64",
|
|
||||||
"touch $(@D)/include/cuda.h",
|
|
||||||
"touch $(@D)/include/cublas.h",
|
|
||||||
"touch $(@D)/include/cudnn.h",
|
|
||||||
"touch $(@D)/extras/CUPTI/include/cupti.h",
|
|
||||||
"touch $(@D)/{}".format(cuda_static_library_path("cudart")),
|
|
||||||
"touch $(@D)/{}".format(cuda_library_path("cublas")),
|
|
||||||
"touch $(@D)/{}".format(cudnn_library_path()),
|
|
||||||
"touch $(@D)/{}".format(cuda_library_path("cudart")),
|
|
||||||
"touch $(@D)/{}".format(cuda_library_path("cufft")),
|
|
||||||
"touch $(@D)/{}".format(cupti_library_path()),
|
|
||||||
]),
|
|
||||||
),
|
|
||||||
local = 1,
|
|
||||||
)
|
|
||||||
|
|
||||||
genrule(
|
|
||||||
name = "cuda_config_check",
|
|
||||||
outs = [
|
|
||||||
"cuda.config",
|
|
||||||
],
|
|
||||||
cmd = if_cuda(
|
|
||||||
# Under cuda config, create the symbolic link to the actual cuda.config
|
|
||||||
"configfile=$(location :cuda.config); ln -sf `{} -f $${{configfile#*/*/*/}}` $(@D)/;".format(readlink_command()),
|
|
||||||
|
|
||||||
# Under non-cuda config, create the dummy file
|
|
||||||
";".join([
|
|
||||||
"touch $(@D)/cuda.config",
|
|
||||||
]),
|
|
||||||
),
|
|
||||||
local = 1,
|
|
||||||
)
|
|
172
third_party/gpus/cuda/BUILD.tpl
vendored
Normal file
172
third_party/gpus/cuda/BUILD.tpl
vendored
Normal file
@ -0,0 +1,172 @@
|
|||||||
|
licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like
|
||||||
|
|
||||||
|
load("@local_config_cuda//cuda:platform.bzl", "cuda_library_path")
|
||||||
|
load("@local_config_cuda//cuda:platform.bzl", "cuda_static_library_path")
|
||||||
|
load("@local_config_cuda//cuda:platform.bzl", "cudnn_library_path")
|
||||||
|
load("@local_config_cuda//cuda:platform.bzl", "cupti_library_path")
|
||||||
|
load("@local_config_cuda//cuda:platform.bzl", "readlink_command")
|
||||||
|
|
||||||
|
package(default_visibility = ["//visibility:public"])
|
||||||
|
|
||||||
|
config_setting(
|
||||||
|
name = "using_gcudacc",
|
||||||
|
values = {
|
||||||
|
"define": "using_cuda_gcudacc=true",
|
||||||
|
},
|
||||||
|
visibility = ["//visibility:public"],
|
||||||
|
)
|
||||||
|
|
||||||
|
config_setting(
|
||||||
|
name = "using_nvcc",
|
||||||
|
values = {
|
||||||
|
"define": "using_cuda_nvcc=true",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
config_setting(
|
||||||
|
name = "using_clang",
|
||||||
|
values = {
|
||||||
|
"define": "using_cuda_clang=true",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Equivalent to using_clang && -c opt.
|
||||||
|
config_setting(
|
||||||
|
name = "using_clang_opt",
|
||||||
|
values = {
|
||||||
|
"define": "using_cuda_clang=true",
|
||||||
|
"compilation_mode": "opt",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
config_setting(
|
||||||
|
name = "darwin",
|
||||||
|
values = {"cpu": "darwin"},
|
||||||
|
visibility = ["//visibility:public"],
|
||||||
|
)
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "cuda_headers",
|
||||||
|
hdrs = glob([
|
||||||
|
"**/*.h",
|
||||||
|
]),
|
||||||
|
includes = [
|
||||||
|
".",
|
||||||
|
"include",
|
||||||
|
],
|
||||||
|
visibility = ["//visibility:public"],
|
||||||
|
)
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "cudart_static",
|
||||||
|
srcs = [
|
||||||
|
cuda_static_library_path("cudart"),
|
||||||
|
],
|
||||||
|
includes = ["include/"],
|
||||||
|
linkopts = [
|
||||||
|
"-ldl",
|
||||||
|
"-lpthread",
|
||||||
|
] + select({
|
||||||
|
"@//tensorflow:darwin": [],
|
||||||
|
"//conditions:default": ["-lrt"],
|
||||||
|
}),
|
||||||
|
visibility = ["//visibility:public"],
|
||||||
|
)
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "cudart",
|
||||||
|
srcs = [
|
||||||
|
cuda_library_path("cudart"),
|
||||||
|
],
|
||||||
|
data = [
|
||||||
|
cuda_library_path("cudart"),
|
||||||
|
],
|
||||||
|
includes = ["include/"],
|
||||||
|
linkstatic = 1,
|
||||||
|
visibility = ["//visibility:public"],
|
||||||
|
)
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "cublas",
|
||||||
|
srcs = [
|
||||||
|
cuda_library_path("cublas"),
|
||||||
|
],
|
||||||
|
data = [
|
||||||
|
cuda_library_path("cublas"),
|
||||||
|
],
|
||||||
|
includes = ["include/"],
|
||||||
|
linkstatic = 1,
|
||||||
|
visibility = ["//visibility:public"],
|
||||||
|
)
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "cudnn",
|
||||||
|
srcs = [
|
||||||
|
cudnn_library_path(),
|
||||||
|
],
|
||||||
|
data = [
|
||||||
|
cudnn_library_path(),
|
||||||
|
],
|
||||||
|
includes = ["include/"],
|
||||||
|
linkstatic = 1,
|
||||||
|
visibility = ["//visibility:public"],
|
||||||
|
)
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "cufft",
|
||||||
|
srcs = [
|
||||||
|
cuda_library_path("cufft"),
|
||||||
|
],
|
||||||
|
data = [
|
||||||
|
cuda_library_path("cufft"),
|
||||||
|
],
|
||||||
|
includes = ["include/"],
|
||||||
|
linkstatic = 1,
|
||||||
|
visibility = ["//visibility:public"],
|
||||||
|
)
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "curand",
|
||||||
|
srcs = [
|
||||||
|
cuda_library_path("curand"),
|
||||||
|
],
|
||||||
|
data = [
|
||||||
|
cuda_library_path("curand"),
|
||||||
|
],
|
||||||
|
includes = ["include/"],
|
||||||
|
linkstatic = 1,
|
||||||
|
visibility = ["//visibility:public"],
|
||||||
|
)
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "cuda",
|
||||||
|
deps = [
|
||||||
|
":cuda_headers",
|
||||||
|
":cudart",
|
||||||
|
":cublas",
|
||||||
|
":cudnn",
|
||||||
|
":cufft",
|
||||||
|
":curand",
|
||||||
|
],
|
||||||
|
visibility = ["//visibility:public"],
|
||||||
|
)
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "cupti_headers",
|
||||||
|
hdrs = glob([
|
||||||
|
"**/*.h",
|
||||||
|
]),
|
||||||
|
includes = [
|
||||||
|
".",
|
||||||
|
"extras/CUPTI/include/",
|
||||||
|
],
|
||||||
|
visibility = ["//visibility:public"],
|
||||||
|
)
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "cupti_dsos",
|
||||||
|
data = [
|
||||||
|
cupti_library_path(),
|
||||||
|
],
|
||||||
|
visibility = ["//visibility:public"],
|
||||||
|
)
|
@ -8,7 +8,7 @@ def if_cuda(if_true, if_false = []):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
return select({
|
return select({
|
||||||
"//third_party/gpus/cuda:using_nvcc": if_true,
|
"@local_config_cuda//cuda:using_nvcc": if_true,
|
||||||
"//third_party/gpus/cuda:using_gcudacc": if_true,
|
"@local_config_cuda//cuda:using_gcudacc": if_true,
|
||||||
"//conditions:default": if_false
|
"//conditions:default": if_false
|
||||||
})
|
})
|
24
third_party/gpus/cuda/cuda_config.h.tpl
vendored
Normal file
24
third_party/gpus/cuda/cuda_config.h.tpl
vendored
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#ifndef CUDA_CUDA_CONFIG_H_
|
||||||
|
#define CUDA_CUDA_CONFIG_H_
|
||||||
|
|
||||||
|
#define TF_CUDA_CAPABILITIES %{cuda_compute_capabilities}
|
||||||
|
|
||||||
|
#define TF_CUDA_VERSION "%{cuda_version}"
|
||||||
|
#define TF_CUDNN_VERSION "%{cudnn_version}"
|
||||||
|
|
||||||
|
#endif // CUDA_CUDA_CONFIG_H_
|
234
third_party/gpus/cuda/cuda_config.sh
vendored
234
third_party/gpus/cuda/cuda_config.sh
vendored
@ -1,234 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
# ==============================================================================
|
|
||||||
|
|
||||||
|
|
||||||
# A simple script to configure the Cuda tree needed for the TensorFlow GPU
|
|
||||||
# build. We need both Cuda toolkit $TF_CUDA_VERSION and Cudnn $TF_CUDNN_VERSION.
|
|
||||||
# Useage:
|
|
||||||
# * User edit cuda.config to point both Cuda toolkit and Cudnn libraries to their local path
|
|
||||||
# * run cuda_config.sh to generate symbolic links in the source tree to reflect
|
|
||||||
# * the file organizations needed by TensorFlow.
|
|
||||||
|
|
||||||
print_usage() {
|
|
||||||
cat << EOF
|
|
||||||
Usage: $0 [--check]
|
|
||||||
Configure TensorFlow's canonical view of Cuda libraries using cuda.config.
|
|
||||||
Arguments:
|
|
||||||
--check: Only check that the proper Cuda dependencies has already been
|
|
||||||
properly configured in the source tree. It also creates symbolic links to
|
|
||||||
the files in the gen-tree to make bazel happy.
|
|
||||||
EOF
|
|
||||||
}
|
|
||||||
|
|
||||||
CHECK_ONLY=0
|
|
||||||
# Parse the arguments. Add more arguments as the "case" line when needed.
|
|
||||||
while [[ $# -gt 0 ]]; do
|
|
||||||
argument="$1"
|
|
||||||
shift
|
|
||||||
case $argument in
|
|
||||||
--check)
|
|
||||||
CHECK_ONLY=1
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "Error: unknown arguments"
|
|
||||||
print_usage
|
|
||||||
exit -1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
|
|
||||||
source cuda.config || exit -1
|
|
||||||
|
|
||||||
OUTPUTDIR=${OUTPUTDIR:-../../..}
|
|
||||||
CUDA_TOOLKIT_PATH=${CUDA_TOOLKIT_PATH:-/usr/local/cuda}
|
|
||||||
CUDNN_INSTALL_BASEDIR=${CUDNN_INSTALL_PATH:-/usr/local/cuda}
|
|
||||||
|
|
||||||
if [[ -z "$TF_CUDA_VERSION" ]]; then
|
|
||||||
TF_CUDA_EXT=""
|
|
||||||
else
|
|
||||||
TF_CUDA_EXT=".$TF_CUDA_VERSION"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -z "$TF_CUDNN_VERSION" ]]; then
|
|
||||||
TF_CUDNN_EXT=""
|
|
||||||
else
|
|
||||||
TF_CUDNN_EXT=".$TF_CUDNN_VERSION"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# An error message when the Cuda toolkit is not found
|
|
||||||
function CudaError {
|
|
||||||
echo ERROR: $1
|
|
||||||
cat << EOF
|
|
||||||
##############################################################################
|
|
||||||
##############################################################################
|
|
||||||
Cuda $TF_CUDA_VERSION toolkit is missing.
|
|
||||||
1. Download and install the CUDA $TF_CUDA_VERSION toolkit and CUDNN $TF_CUDNN_VERSION library;
|
|
||||||
2. Run configure from the root of the source tree, before rerunning bazel;
|
|
||||||
Please refer to README.md for more details.
|
|
||||||
##############################################################################
|
|
||||||
##############################################################################
|
|
||||||
EOF
|
|
||||||
exit -1
|
|
||||||
}
|
|
||||||
|
|
||||||
# An error message when CUDNN is not found
|
|
||||||
function CudnnError {
|
|
||||||
echo ERROR: $1
|
|
||||||
cat << EOF
|
|
||||||
##############################################################################
|
|
||||||
##############################################################################
|
|
||||||
Cudnn $TF_CUDNN_VERSION is missing.
|
|
||||||
1. Download and install the CUDA $TF_CUDA_VERSION toolkit and CUDNN $TF_CUDNN_VERSION library;
|
|
||||||
2. Run configure from the root of the source tree, before rerunning bazel;
|
|
||||||
Please refer to README.md for more details.
|
|
||||||
##############################################################################
|
|
||||||
##############################################################################
|
|
||||||
EOF
|
|
||||||
exit -1
|
|
||||||
}
|
|
||||||
|
|
||||||
# Check that Cuda libraries has already been properly configured in the source tree.
# We still need to create links to the gen-tree to make bazel happy.
# $1: error function to call (CudaError or CudnnError) when the file is absent.
# $2: file path, relative to the current directory.
# Relies on globals: READLINK_CMD, OUTPUTDIR.
function CheckAndLinkToSrcTree {
  ERROR_FUNC=$1
  FILE=$2
  # Quote expansions so paths containing whitespace do not word-split.
  if test ! -e "$FILE"; then
    "$ERROR_FUNC" "$PWD/$FILE cannot be found"
  fi

  # Link the output file to the source tree, avoiding self links if they are
  # the same. This could happen if invoked from the source tree by accident.
  if [ ! "$($READLINK_CMD -f "$PWD")" == "$($READLINK_CMD -f "$OUTPUTDIR/third_party/gpus/cuda")" ]; then
    mkdir -p "$(dirname "$OUTPUTDIR/third_party/gpus/cuda/$FILE")"
    ln -sf "$PWD/$FILE" "$OUTPUTDIR/third_party/gpus/cuda/$FILE"
  fi
}
|
|
||||||
|
|
||||||
# Select the platform-specific CUDA library layout and the readlink tool.
# Linux keeps 64-bit libraries under lib64/ with "libfoo.so.N" names; macOS
# uses lib/ with "libfoo.N.dylib" names and needs GNU readlink (greadlink).
OSNAME=$(uname -s)
case "$OSNAME" in
  Linux)
    CUDA_LIB_PATH="lib64"
    CUDA_CUPTI_LIB_DIR="extras/CUPTI/lib64"
    CUDA_RT_LIB_PATH="lib64/libcudart.so${TF_CUDA_EXT}"
    CUDA_RT_LIB_STATIC_PATH="lib64/libcudart_static.a"
    CUDA_BLAS_LIB_PATH="lib64/libcublas.so${TF_CUDA_EXT}"
    CUDA_DNN_LIB_PATH="lib64/libcudnn.so${TF_CUDNN_EXT}"
    CUDA_DNN_LIB_ALT_PATH="libcudnn.so${TF_CUDNN_EXT}"
    CUDA_FFT_LIB_PATH="lib64/libcufft.so${TF_CUDA_EXT}"
    CUDA_CUPTI_LIB_PATH="extras/CUPTI/lib64/libcupti.so${TF_CUDA_EXT}"
    READLINK_CMD="readlink"
    ;;
  Darwin)
    CUDA_LIB_PATH="lib"
    CUDA_CUPTI_LIB_DIR="extras/CUPTI/lib"
    CUDA_RT_LIB_PATH="lib/libcudart${TF_CUDA_EXT}.dylib"
    CUDA_RT_LIB_STATIC_PATH="lib/libcudart_static.a"
    CUDA_BLAS_LIB_PATH="lib/libcublas${TF_CUDA_EXT}.dylib"
    CUDA_DNN_LIB_PATH="lib/libcudnn${TF_CUDNN_EXT}.dylib"
    CUDA_DNN_LIB_ALT_PATH="libcudnn${TF_CUDNN_EXT}.dylib"
    CUDA_FFT_LIB_PATH="lib/libcufft${TF_CUDA_EXT}.dylib"
    CUDA_CUPTI_LIB_PATH="extras/CUPTI/lib/libcupti${TF_CUDA_EXT}.dylib"
    READLINK_CMD="greadlink"
    ;;
esac
|
|
||||||
|
|
||||||
# In check-only mode, verify that every required CUDA/cuDNN header and library
# is already present, symlink each into the gen-tree, then exit without
# performing a full configuration.
if [ "$CHECK_ONLY" == "1" ]; then
  CheckAndLinkToSrcTree CudaError include/cuda.h
  CheckAndLinkToSrcTree CudaError include/cublas.h
  CheckAndLinkToSrcTree CudnnError include/cudnn.h
  CheckAndLinkToSrcTree CudaError extras/CUPTI/include/cupti.h
  CheckAndLinkToSrcTree CudaError $CUDA_RT_LIB_STATIC_PATH
  CheckAndLinkToSrcTree CudaError $CUDA_BLAS_LIB_PATH
  CheckAndLinkToSrcTree CudnnError $CUDA_DNN_LIB_PATH
  CheckAndLinkToSrcTree CudaError $CUDA_RT_LIB_PATH
  CheckAndLinkToSrcTree CudaError $CUDA_FFT_LIB_PATH
  CheckAndLinkToSrcTree CudaError $CUDA_CUPTI_LIB_PATH
  exit 0
fi
|
|
||||||
|
|
||||||
# Actually configure the source tree for TensorFlow's canonical view of Cuda
# libraries.

# The CUDA runtime library is the minimum requirement for a usable toolkit.
if test ! -e ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH}; then
  CudaError "cannot find ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH}"
fi

# CUPTI is required for GPU profiling/tracing support.
if test ! -e ${CUDA_TOOLKIT_PATH}/${CUDA_CUPTI_LIB_PATH}; then
  CudaError "cannot find ${CUDA_TOOLKIT_PATH}/${CUDA_CUPTI_LIB_PATH}"
fi

if test ! -d ${CUDNN_INSTALL_BASEDIR}; then
  CudnnError "cannot find dir: ${CUDNN_INSTALL_BASEDIR}"
fi

# Locate cudnn.h: try the install base directory itself, then its include/
# subdirectory, then the system-wide include directory.
if test -e ${CUDNN_INSTALL_BASEDIR}/cudnn.h; then
  CUDNN_HEADER_DIR=${CUDNN_INSTALL_BASEDIR}
elif test -e ${CUDNN_INSTALL_BASEDIR}/include/cudnn.h; then
  CUDNN_HEADER_DIR=${CUDNN_INSTALL_BASEDIR}/include
elif test -e /usr/include/cudnn.h; then
  CUDNN_HEADER_DIR=/usr/include
else
  CudnnError "cannot find cudnn.h under: ${CUDNN_INSTALL_BASEDIR}"
fi

# Locate libcudnn: prefer the platform library subdirectory, then the base
# directory itself (the "alt" layout).
if test -e ${CUDNN_INSTALL_BASEDIR}/${CUDA_DNN_LIB_PATH}; then
  CUDNN_LIB_INSTALL_PATH=${CUDNN_INSTALL_BASEDIR}/${CUDA_DNN_LIB_PATH}
elif test -e ${CUDNN_INSTALL_BASEDIR}/${CUDA_DNN_LIB_ALT_PATH}; then
  CUDNN_LIB_INSTALL_PATH=${CUDNN_INSTALL_BASEDIR}/${CUDA_DNN_LIB_ALT_PATH}
else
  CudnnError "cannot find ${CUDA_DNN_LIB_PATH} or ${CUDA_DNN_LIB_ALT_PATH} under: ${CUDNN_INSTALL_BASEDIR}"
fi
|
|
||||||
|
|
||||||
# Helper function to build symbolic links for all files under a directory.
# $1: source prefix, $2: destination prefix, $3: source directory.
# The destination directory is derived by rewriting the source prefix into the
# destination prefix. Exported below so the child bash spawned by xargs in
# LinkAllFiles can invoke it.
function LinkOneDir {
  SRC_PREFIX=$1
  DST_PREFIX=$2
  SRC_DIR=$3
  DST_DIR=$(echo $SRC_DIR | sed "s,^$SRC_PREFIX,$DST_PREFIX,")
  mkdir -p $DST_DIR
  # Only regular files at this level; subdirectories are handled by separate
  # LinkOneDir invocations from LinkAllFiles.
  FILE_LIST=$(find -L $SRC_DIR -maxdepth 1 -type f)
  if test "$FILE_LIST" != ""; then
    # NOTE(review): relies on word splitting of $FILE_LIST, so file names
    # containing whitespace are not supported — presumably fine for CUDA
    # installs; confirm before generalizing.
    ln -sf $FILE_LIST $DST_DIR/ || exit -1
  fi
}
export -f LinkOneDir
|
|
||||||
|
|
||||||
# Build links for all files under the directory, including subdirectories.
# $1: source directory, $2: destination directory.
function LinkAllFiles {
  SRC_DIR=$1
  DST_DIR=$2
  # Run LinkOneDir for every directory below SRC_DIR in a child bash;
  # xargs cannot call shell functions directly, hence the 'bash -c' wrapper
  # (LinkOneDir is exported above).
  find -L $SRC_DIR -type d | xargs -I {} bash -c "LinkOneDir $SRC_DIR $DST_DIR {}" || exit -1
}
|
|
||||||
|
|
||||||
# Set up the symbolic links for cuda toolkit. We link at individual file level,
# not at the directory level.
# This is because the external library may have different file layout from our desired structure.
mkdir -p $OUTPUTDIR/third_party/gpus/cuda
echo "Setting up Cuda include"
LinkAllFiles ${CUDA_TOOLKIT_PATH}/include $OUTPUTDIR/third_party/gpus/cuda/include || exit -1
echo "Setting up Cuda ${CUDA_LIB_PATH}"
LinkAllFiles ${CUDA_TOOLKIT_PATH}/${CUDA_LIB_PATH} $OUTPUTDIR/third_party/gpus/cuda/${CUDA_LIB_PATH} || exit -1
echo "Setting up Cuda bin"
LinkAllFiles ${CUDA_TOOLKIT_PATH}/bin $OUTPUTDIR/third_party/gpus/cuda/bin || exit -1
echo "Setting up Cuda nvvm"
LinkAllFiles ${CUDA_TOOLKIT_PATH}/nvvm $OUTPUTDIR/third_party/gpus/cuda/nvvm || exit -1
echo "Setting up CUPTI include"
LinkAllFiles ${CUDA_TOOLKIT_PATH}/extras/CUPTI/include $OUTPUTDIR/third_party/gpus/cuda/extras/CUPTI/include || exit -1
echo "Setting up CUPTI lib64"
LinkAllFiles ${CUDA_TOOLKIT_PATH}/${CUDA_CUPTI_LIB_DIR} $OUTPUTDIR/third_party/gpus/cuda/${CUDA_CUPTI_LIB_DIR} || exit -1

# Set up symbolic link for cudnn; its header and library may live outside the
# CUDA toolkit directory (CUDNN_HEADER_DIR/CUDNN_LIB_INSTALL_PATH were located
# above).
ln -sf $CUDNN_HEADER_DIR/cudnn.h $OUTPUTDIR/third_party/gpus/cuda/include/cudnn.h || exit -1
ln -sf $CUDNN_LIB_INSTALL_PATH $OUTPUTDIR/third_party/gpus/cuda/$CUDA_DNN_LIB_PATH || exit -1
|
|
@ -1,6 +1,6 @@
|
|||||||
CUDA_VERSION = ""
|
CUDA_VERSION = "%{cuda_version}"
|
||||||
CUDNN_VERSION = ""
|
CUDNN_VERSION = "%{cudnn_version}"
|
||||||
PLATFORM = ""
|
PLATFORM = "%{platform}"
|
||||||
|
|
||||||
def cuda_sdk_version():
|
def cuda_sdk_version():
|
||||||
return CUDA_VERSION
|
return CUDA_VERSION
|
423
third_party/gpus/cuda_configure.bzl
vendored
Normal file
423
third_party/gpus/cuda_configure.bzl
vendored
Normal file
@ -0,0 +1,423 @@
|
|||||||
|
# -*- Python -*-
|
||||||
|
"""Repository rule for CUDA autoconfiguration.
|
||||||
|
|
||||||
|
`cuda_configure` depends on the following environment variables:
|
||||||
|
|
||||||
|
* `TF_NEED_CUDA`: Whether to enable building with CUDA.
|
||||||
|
* `CC`: The GCC host compiler path
|
||||||
|
* `CUDA_TOOLKIT_PATH`: The path to the CUDA toolkit. Default is
|
||||||
|
`/usr/local/cuda`.
|
||||||
|
* `CUDA_VERSION`: The version of the CUDA toolkit. If this is blank, then
|
||||||
|
use the system default.
|
||||||
|
* `CUDNN_VERSION`: The version of the cuDNN library.
|
||||||
|
* `CUDNN_INSTALL_PATH`: The path to the cuDNN library. Default is
|
||||||
|
`/usr/local/cuda`.
|
||||||
|
* `CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is
|
||||||
|
`3.5,5.2`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# Defaults used when the corresponding environment variables are unset.
# Empty version strings mean "use whatever the system provides" (unversioned
# library names).
_DEFAULT_CUDA_VERSION = ""
_DEFAULT_CUDNN_VERSION = ""
_DEFAULT_CUDA_TOOLKIT_PATH = "/usr/local/cuda"
_DEFAULT_CUDNN_INSTALL_PATH = "/usr/local/cuda"
_DEFAULT_CUDA_COMPUTE_CAPABILITIES = ["3.5", "5.2"]
|
||||||
|
|
||||||
|
|
||||||
|
# TODO(dzc): Once these functions have been factored out of Bazel's
|
||||||
|
# cc_configure.bzl, load them from @bazel_tools instead.
|
||||||
|
# BEGIN cc_configure common functions.
|
||||||
|
def find_cc(repository_ctx):
|
||||||
|
"""Find the C++ compiler."""
|
||||||
|
cc_name = "gcc"
|
||||||
|
if "CC" in repository_ctx.os.environ:
|
||||||
|
cc_name = repository_ctx.os.environ["CC"].strip()
|
||||||
|
if not cc_name:
|
||||||
|
cc_name = "gcc"
|
||||||
|
if cc_name.startswith("/"):
|
||||||
|
# Absolute path, maybe we should make this suported by our which function.
|
||||||
|
return cc_name
|
||||||
|
cc = repository_ctx.which(cc_name)
|
||||||
|
if cc == None:
|
||||||
|
fail(
|
||||||
|
"Cannot find gcc, either correct your path or set the CC" +
|
||||||
|
" environment variable")
|
||||||
|
return cc
|
||||||
|
|
||||||
|
|
||||||
|
_INC_DIR_MARKER_BEGIN = "#include <...>"
|
||||||
|
|
||||||
|
|
||||||
|
# OSX add " (framework directory)" at the end of line, strip it.
|
||||||
|
_OSX_FRAMEWORK_SUFFIX = " (framework directory)"
|
||||||
|
_OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX)
|
||||||
|
def _cxx_inc_convert(path):
|
||||||
|
"""Convert path returned by cc -E xc++ in a complete path."""
|
||||||
|
path = path.strip()
|
||||||
|
if path.endswith(_OSX_FRAMEWORK_SUFFIX):
|
||||||
|
path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip()
|
||||||
|
return path
|
||||||
|
|
||||||
|
|
||||||
|
def get_cxx_inc_directories(repository_ctx, cc):
  """Compute the list of default C++ include directories."""
  # Ask the compiler for its verbose preprocessor report; the search-path
  # list appears on stderr after the '#include <...>' marker line, one
  # directory per indented line.
  result = repository_ctx.execute([cc, "-E", "-xc++", "-", "-v"])
  index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN)
  if index1 == -1:
    return []
  # Advance past the marker line itself.
  index1 = result.stderr.find("\n", index1)
  if index1 == -1:
    return []
  # The last line starting with a space marks the final directory entry.
  index2 = result.stderr.rfind("\n ")
  if index2 == -1 or index2 < index1:
    return []
  index2 = result.stderr.find("\n", index2 + 1)
  if index2 == -1:
    inc_dirs = result.stderr[index1 + 1:]
  else:
    inc_dirs = result.stderr[index1 + 1:index2].strip()

  # Normalize each reported line (strips the OSX framework suffix).
  return [repository_ctx.path(_cxx_inc_convert(p))
          for p in inc_dirs.split("\n")]
|
||||||
|
|
||||||
|
# END cc_configure common functions (see TODO above).
|
||||||
|
|
||||||
|
|
||||||
|
def _enable_cuda(repository_ctx):
|
||||||
|
if "TF_NEED_CUDA" in repository_ctx.os.environ:
|
||||||
|
enable_cuda = repository_ctx.os.environ["TF_NEED_CUDA"].strip()
|
||||||
|
return enable_cuda == "1"
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _cuda_toolkit_path(repository_ctx):
  """Finds the cuda toolkit directory.

  Uses CUDA_TOOLKIT_PATH from the environment when set, otherwise the
  default install location; fails if the resulting path does not exist.
  """
  env = repository_ctx.os.environ
  if "CUDA_TOOLKIT_PATH" in env:
    toolkit_path = env["CUDA_TOOLKIT_PATH"].strip()
  else:
    toolkit_path = _DEFAULT_CUDA_TOOLKIT_PATH
  if not repository_ctx.path(toolkit_path).exists:
    fail("Cannot find cuda toolkit path.")
  return toolkit_path
|
||||||
|
|
||||||
|
|
||||||
|
def _cudnn_install_basedir(repository_ctx):
  """Finds the cudnn install directory.

  Uses CUDNN_INSTALL_PATH from the environment when set, otherwise the
  default install location; fails if the resulting path does not exist.
  """
  env = repository_ctx.os.environ
  if "CUDNN_INSTALL_PATH" in env:
    install_path = env["CUDNN_INSTALL_PATH"].strip()
  else:
    install_path = _DEFAULT_CUDNN_INSTALL_PATH
  if not repository_ctx.path(install_path).exists:
    fail("Cannot find cudnn install path.")
  return install_path
|
||||||
|
|
||||||
|
|
||||||
|
def _cuda_version(repository_ctx):
|
||||||
|
"""Detects the cuda version."""
|
||||||
|
if "CUDA_VERSION" in repository_ctx.os.environ:
|
||||||
|
return repository_ctx.os.environ["CUDA_VERSION"].strip()
|
||||||
|
else:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _cudnn_version(repository_ctx):
|
||||||
|
"""Detects the cudnn version."""
|
||||||
|
if "CUDNN_VERSION" in repository_ctx.os.environ:
|
||||||
|
return repository_ctx.os.environ["CUDNN_VERSION"].strip()
|
||||||
|
else:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_capabilities(repository_ctx):
|
||||||
|
"""Returns a list of strings representing cuda compute capabilities."""
|
||||||
|
if "CUDA_COMPUTE_CAPABILITIES" not in repository_ctx.os.environ:
|
||||||
|
return _DEFAULT_CUDA_COMPUTE_CAPABILITIES
|
||||||
|
capabilities_str = repository_ctx.os.environ["CUDA_COMPUTE_CAPABILITIES"]
|
||||||
|
capabilities = capabilities_str.split(",")
|
||||||
|
for capability in capabilities:
|
||||||
|
# Workaround for Skylark's lack of support for regex. This check should
|
||||||
|
# be equivalent to checking:
|
||||||
|
# if re.match("[0-9]+.[0-9]+", capability) == None:
|
||||||
|
parts = capability.split(".")
|
||||||
|
if len(parts) != 2 or not parts[0].isdigit() or not parts[1].isdigit():
|
||||||
|
fail("Invalid compute capability: %s" % capability)
|
||||||
|
return capabilities
|
||||||
|
|
||||||
|
|
||||||
|
def _cpu_value(repository_ctx):
|
||||||
|
result = repository_ctx.execute(["uname", "-s"])
|
||||||
|
return result.stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _cuda_symlink_files(cpu_value, cuda_version, cudnn_version):
  """Returns a struct containing platform-specific paths.

  Args:
    cpu_value: The string representing the host OS.
    cuda_version: The cuda version as returned by _cuda_version
    cudnn_version: The cudnn version as returned by _cudnn_version
  """
  # Versioned library suffixes; empty versions yield unversioned names.
  cuda_ext = ".%s" % cuda_version if cuda_version else ""
  cudnn_ext = ".%s" % cudnn_version if cudnn_version else ""
  if cpu_value == "Linux":
    # Linux layout: 64-bit libraries under lib64/, "libfoo.so.N" naming.
    return struct(
        cuda_lib_path = "lib64",
        cuda_rt_lib = "lib64/libcudart.so%s" % cuda_ext,
        cuda_rt_lib_static = "lib64/libcudart_static.a",
        cuda_blas_lib = "lib64/libcublas.so%s" % cuda_ext,
        cuda_dnn_lib = "lib64/libcudnn.so%s" % cudnn_ext,
        # "alt" path: libcudnn directly in the install base directory.
        cuda_dnn_lib_alt = "libcudnn.so%s" % cudnn_ext,
        cuda_rand_lib = "lib64/libcurand.so%s" % cuda_ext,
        cuda_fft_lib = "lib64/libcufft.so%s" % cuda_ext,
        cuda_cupti_lib = "extras/CUPTI/lib64/libcupti.so%s" % cuda_ext)
  elif cpu_value == "Darwin":
    # macOS layout: libraries under lib/, "libfoo.N.dylib" naming.
    return struct(
        cuda_lib_path = "lib",
        cuda_rt_lib = "lib/libcudart%s.dylib" % cuda_ext,
        cuda_rt_lib_static = "lib/libcudart_static.a",
        cuda_blas_lib = "lib/libcublas%s.dylib" % cuda_ext,
        cuda_dnn_lib = "lib/libcudnn%s.dylib" % cudnn_ext,
        cuda_dnn_lib_alt = "libcudnn%s.dylib" % cudnn_ext,
        cuda_rand_lib = "lib/libcurand%s.dylib" % cuda_ext,
        cuda_fft_lib = "lib/libcufft%s.dylib" % cuda_ext,
        cuda_cupti_lib = "extras/CUPTI/lib/libcupti%s.dylib" % cuda_ext)
  else:
    fail("Not supported CPU value %s" % cpu_value)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_lib(repository_ctx, cuda_toolkit_path, cuda_lib):
|
||||||
|
"""Checks if cuda_lib exists under cuda_toolkit_path or fail if it doesn't.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
repository_ctx: The repository context.
|
||||||
|
cuda_toolkit_path: The cuda toolkit directory containing the cuda libraries.
|
||||||
|
cuda_lib: The library to look for under cuda_toolkit_path.
|
||||||
|
"""
|
||||||
|
lib_path = cuda_toolkit_path + "/" + cuda_lib
|
||||||
|
if not repository_ctx.path(lib_path).exists:
|
||||||
|
fail("Cannot find %s" % lib_path)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_dir(repository_ctx, directory):
|
||||||
|
"""Checks whether the directory exists and fail if it does not.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
repository_ctx: The repository context.
|
||||||
|
directory: The directory to check the existence of.
|
||||||
|
"""
|
||||||
|
if not repository_ctx.path(directory).exists:
|
||||||
|
fail("Cannot find dir: %s" % directory)
|
||||||
|
|
||||||
|
|
||||||
|
def _find_cudnn_header_dir(repository_ctx, cudnn_install_basedir):
|
||||||
|
"""Returns the path to the directory containing cudnn.h
|
||||||
|
|
||||||
|
Args:
|
||||||
|
repository_ctx: The repository context.
|
||||||
|
cudnn_install_basedir: The cudnn install directory as returned by
|
||||||
|
_cudnn_install_basedir.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The path of the directory containing the cudnn header.
|
||||||
|
"""
|
||||||
|
if repository_ctx.path(cudnn_install_basedir + "/cudnn.h").exists:
|
||||||
|
return cudnn_install_basedir
|
||||||
|
if repository_ctx.path(cudnn_install_basedir + "/include/cudnn.h").exists:
|
||||||
|
return cudnn_install_basedir + "/include"
|
||||||
|
if repository_ctx.path("/usr/include/cudnn.h").exists:
|
||||||
|
return "/usr/include"
|
||||||
|
fail("Cannot find cudnn.h under %s" % cudnn_install_basedir)
|
||||||
|
|
||||||
|
|
||||||
|
def _find_cudnn_lib_path(repository_ctx, cudnn_install_basedir, symlink_files):
|
||||||
|
"""Returns the path to the directory containing libcudnn
|
||||||
|
|
||||||
|
Args:
|
||||||
|
repository_ctx: The repository context.
|
||||||
|
cudnn_install_basedir: The cudnn install dir as returned by
|
||||||
|
_cudnn_install_basedir.
|
||||||
|
symlink_files: The symlink files as returned by _cuda_symlink_files.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The path of the directory containing the cudnn libraries.
|
||||||
|
"""
|
||||||
|
lib_dir = cudnn_install_basedir + "/" + symlink_files.cuda_dnn_lib
|
||||||
|
if repository_ctx.path(lib_dir).exists:
|
||||||
|
return lib_dir
|
||||||
|
alt_lib_dir = cudnn_install_basedir + "/" + symlink_files.cuda_dnn_lib_alt
|
||||||
|
if repository_ctx.path(alt_lib_dir).exists:
|
||||||
|
return alt_lib_dir
|
||||||
|
|
||||||
|
fail("Cannot find %s or %s under %s" %
|
||||||
|
(symlink_files.cuda_dnn_lib, symlink_files.cuda_dnn_lib_alt,
|
||||||
|
cudnn_install_basedir))
|
||||||
|
|
||||||
|
|
||||||
|
def _tpl(repository_ctx, tpl, substitutions={}, out=None):
  """Instantiates a //third_party/gpus template into the repository.

  When out is not given, the destination path is derived from the template
  label by replacing ':' with '/'.
  """
  dest = out if out else tpl.replace(":", "/")
  repository_ctx.template(
      dest,
      Label("//third_party/gpus/%s.tpl" % tpl),
      substitutions)
|
||||||
|
|
||||||
|
|
||||||
|
def _file(repository_ctx, label):
  """Copies a //third_party/gpus template verbatim (no substitutions)."""
  dest = label.replace(":", "/")
  repository_ctx.template(
      dest,
      Label("//third_party/gpus/%s.tpl" % label),
      {})
|
||||||
|
|
||||||
|
|
||||||
|
def _create_dummy_repository(repository_ctx):
  # Used when TF_NEED_CUDA is off: generates a repository whose BUILD files,
  # stub headers and empty library files let non-CUDA builds proceed.
  cpu_value = _cpu_value(repository_ctx)
  symlink_files = _cuda_symlink_files(cpu_value, _DEFAULT_CUDA_VERSION,
                                      _DEFAULT_CUDNN_VERSION)

  # Set up BUILD file for cuda/.
  _file(repository_ctx, "cuda:BUILD")
  _file(repository_ctx, "cuda:build_defs.bzl")
  _tpl(repository_ctx, "cuda:platform.bzl",
       {
           "%{cuda_version}": _DEFAULT_CUDA_VERSION,
           "%{cudnn_version}": _DEFAULT_CUDNN_VERSION,
           "%{platform}": cpu_value,
       })

  # Create dummy files for the CUDA toolkit since they are still required by
  # tensorflow/core/platform/default/build_config:cuda.
  repository_ctx.file("cuda/include/cuda.h", "")
  repository_ctx.file("cuda/include/cublas.h", "")
  repository_ctx.file("cuda/include/cudnn.h", "")
  repository_ctx.file("cuda/extras/CUPTI/include/cupti.h", "")
  repository_ctx.file("cuda/%s" % symlink_files.cuda_rt_lib, "")
  repository_ctx.file("cuda/%s" % symlink_files.cuda_rt_lib_static, "")
  repository_ctx.file("cuda/%s" % symlink_files.cuda_blas_lib, "")
  repository_ctx.file("cuda/%s" % symlink_files.cuda_dnn_lib, "")
  repository_ctx.file("cuda/%s" % symlink_files.cuda_rand_lib, "")
  repository_ctx.file("cuda/%s" % symlink_files.cuda_fft_lib, "")
  repository_ctx.file("cuda/%s" % symlink_files.cuda_cupti_lib, "")

  # Set up cuda_config.h, which is used by
  # tensorflow/stream_executor/dso_loader.cc.
  _tpl(repository_ctx, "cuda:cuda_config.h",
       {
           "%{cuda_version}": _DEFAULT_CUDA_VERSION,
           "%{cudnn_version}": _DEFAULT_CUDNN_VERSION,
           "%{cuda_compute_capabilities}": ",".join([
               "CudaVersion(\"%s\")" % c
               for c in _DEFAULT_CUDA_COMPUTE_CAPABILITIES]),
       })
|
||||||
|
|
||||||
|
|
||||||
|
def _symlink_dir(repository_ctx, src_dir, dest_dir):
|
||||||
|
"""Symlinks all the files in a directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
repository_ctx: The repository context.
|
||||||
|
src_dir: The source directory.
|
||||||
|
dest_dir: The destination directory to create the symlinks in.
|
||||||
|
"""
|
||||||
|
files = repository_ctx.path(src_dir).readdir()
|
||||||
|
for src_file in files:
|
||||||
|
repository_ctx.symlink(src_file, dest_dir + "/" + src_file.basename)
|
||||||
|
|
||||||
|
|
||||||
|
def _create_cuda_repository(repository_ctx):
  """Creates the repository containing files set up to build with CUDA."""
  cuda_toolkit_path = _cuda_toolkit_path(repository_ctx)
  cuda_version = _cuda_version(repository_ctx)
  cudnn_install_basedir = _cudnn_install_basedir(repository_ctx)
  cudnn_version = _cudnn_version(repository_ctx)
  compute_capabilities = _compute_capabilities(repository_ctx)

  cpu_value = _cpu_value(repository_ctx)
  symlink_files = _cuda_symlink_files(cpu_value, cuda_version, cudnn_version)
  # The runtime and CUPTI libraries are the minimum required toolkit pieces.
  _check_lib(repository_ctx, cuda_toolkit_path, symlink_files.cuda_rt_lib)
  _check_lib(repository_ctx, cuda_toolkit_path, symlink_files.cuda_cupti_lib)
  _check_dir(repository_ctx, cudnn_install_basedir)

  cudnn_header_dir = _find_cudnn_header_dir(repository_ctx,
                                            cudnn_install_basedir)
  cudnn_lib_path = _find_cudnn_lib_path(repository_ctx, cudnn_install_basedir,
                                        symlink_files)

  # Set up symbolic links for the cuda toolkit. We link at the individual file
  # level not at the directory level. This is because the external library may
  # have a different file layout from our desired structure.
  _symlink_dir(repository_ctx, cuda_toolkit_path + "/include", "cuda/include")
  _symlink_dir(repository_ctx,
               cuda_toolkit_path + "/" + symlink_files.cuda_lib_path,
               "cuda/" + symlink_files.cuda_lib_path)
  _symlink_dir(repository_ctx, cuda_toolkit_path + "/bin", "cuda/bin")
  _symlink_dir(repository_ctx, cuda_toolkit_path + "/nvvm", "cuda/nvvm")
  _symlink_dir(repository_ctx, cuda_toolkit_path + "/extras/CUPTI/include",
               "cuda/extras/CUPTI/include")
  repository_ctx.symlink(cuda_toolkit_path + "/" + symlink_files.cuda_cupti_lib,
                         "cuda/" + symlink_files.cuda_cupti_lib)

  # Set up the symbolic links for cudnn if cudnn was not installed to
  # CUDA_TOOLKIT_PATH.
  if not repository_ctx.path("cuda/include/cudnn.h").exists:
    repository_ctx.symlink(cudnn_header_dir + "/cudnn.h",
                           "cuda/include/cudnn.h")
  if not repository_ctx.path("cuda/" + symlink_files.cuda_dnn_lib).exists:
    repository_ctx.symlink(cudnn_lib_path, "cuda/" + symlink_files.cuda_dnn_lib)

  # Set up BUILD file for cuda/
  _file(repository_ctx, "cuda:BUILD")
  _file(repository_ctx, "cuda:build_defs.bzl")
  _tpl(repository_ctx, "cuda:platform.bzl",
       {
           "%{cuda_version}": cuda_version,
           "%{cudnn_version}": cudnn_version,
           "%{platform}": cpu_value,
       })

  # Set up crosstool/
  _file(repository_ctx, "crosstool:BUILD")
  _tpl(repository_ctx, "crosstool:CROSSTOOL",
       {
           "%{cuda_version}": ("-%s" % cuda_version) if cuda_version else "",
       })
  # find_cc may shell out (PATH lookup); resolve the compiler once and reuse
  # it for both substitutions instead of calling find_cc twice.
  cc = str(find_cc(repository_ctx))
  _tpl(repository_ctx,
       "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",
       {
           "%{cpu_compiler}": cc,
           "%{gcc_host_compiler_path}": cc,
           "%{cuda_compute_capabilities}": ", ".join(
               ["\"%s\"" % c for c in compute_capabilities]),
       })

  # Set up cuda_config.h, which is used by
  # tensorflow/stream_executor/dso_loader.cc.
  _tpl(repository_ctx, "cuda:cuda_config.h",
       {
           "%{cuda_version}": cuda_version,
           "%{cudnn_version}": cudnn_version,
           "%{cuda_compute_capabilities}": ",".join(
               ["CudaVersion(\"%s\")" % c for c in compute_capabilities]),
       })
|
||||||
|
|
||||||
|
|
||||||
|
def _cuda_autoconf_impl(repository_ctx):
  """Implementation of the cuda_autoconf repository rule."""
  if _enable_cuda(repository_ctx):
    _create_cuda_repository(repository_ctx)
  else:
    # CUDA disabled: generate the stub repository so non-GPU builds work.
    _create_dummy_repository(repository_ctx)
|
||||||
|
|
||||||
|
|
||||||
|
cuda_configure = repository_rule(
    implementation = _cuda_autoconf_impl,
    # local = True: re-run on every fetch, since the result depends on the
    # machine's environment variables and filesystem.
    local = True,
)
"""Detects and configures the local CUDA toolchain.

Add the following to your WORKSPACE FILE:

```python
cuda_configure(name = "local_config_cuda")
```

Args:
  name: A unique name for this workspace rule.
"""
|
@ -1,4 +1,4 @@
|
|||||||
build:cuda --crosstool_top=//third_party/gpus/crosstool
|
build:cuda --crosstool_top=@local_config_cuda//crosstool
|
||||||
build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
|
build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
|
||||||
|
|
||||||
build --force_python=py$PYTHON_MAJOR_VERSION
|
build --force_python=py$PYTHON_MAJOR_VERSION
|
||||||
|
Loading…
Reference in New Issue
Block a user