Improve CUDA runtime dependencies search.

tensorflow::CudaRoot() is replaced by tensorflow::CandidateCudaRoots(), which may return multiple possible locations of the CUDA root.

PiperOrigin-RevId: 225091635
commit bafb874798
parent 33bc0b9788
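At its core, the change replaces a single CUDA root with an ordered list of candidate roots that each consumer probes until it finds a hit. A minimal standalone sketch of that probe pattern, with hypothetical helper names (the real code in the diff below uses tensorflow::Env and tensorflow::io::JoinPath instead):

#include <sys/stat.h>

#include <string>
#include <vector>

// Hypothetical stand-in for tensorflow::Env::Default()->IsDirectory(path).ok().
static bool DirectoryExists(const std::string& path) {
  struct stat st;
  return stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode);
}

// Probes "<root>/<subdir>" for each candidate root in priority order and
// returns the first directory that exists; "." is the last-resort fallback,
// mirroring GetLibdeviceDir() in the diff below.
static std::string FindFirstExistingDir(const std::vector<std::string>& roots,
                                        const std::string& subdir) {
  for (const std::string& root : roots) {
    const std::string candidate = root + "/" + subdir;
    if (DirectoryExists(candidate)) return candidate;
  }
  return ".";
}

Probing in priority order keeps the common case (one well-known root) cheap while letting callers layer user-specified overrides on top.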
tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc

@@ -108,27 +108,33 @@ namespace {
 namespace tracing = tensorflow::tracing;
 
-// Returns the directory containing nvvm libdevice files. config_cuda_data_dir
-// should be equal to config().debug_options().xla_gpu_cuda_data_dir() of the
-// HloModule being compiled.
-string GetLibdeviceDir(const string& config_cuda_data_dir) {
-  std::vector<string> potential_libdevice_dirs;
-  if (!config_cuda_data_dir.empty()) {
-    potential_libdevice_dirs.push_back(config_cuda_data_dir);
-  }
-  potential_libdevice_dirs.push_back(tensorflow::LibdeviceRoot());
+// Returns a vector of potential locations of the CUDA root directory.
+std::vector<string> GetCudaRootCandidates(
+    const HloModuleConfig& hlo_module_config) {
+  std::vector<string> potential_cuda_roots = tensorflow::CandidateCudaRoots();
 
-  // Tries all potential libdevice directories in the order they are inserted.
-  // Returns the first directory that exists in the file system.
-  for (const string& potential_libdevice_dir : potential_libdevice_dirs) {
-    if (tensorflow::Env::Default()->IsDirectory(potential_libdevice_dir).ok()) {
-      VLOG(2) << "Found libdevice dir " << potential_libdevice_dir;
-      return potential_libdevice_dir;
-    }
-    VLOG(2) << "Unable to find potential libdevice dir "
-            << potential_libdevice_dir;
+  // CUDA location explicitly specified by user via --xla_gpu_cuda_data_dir has
+  // highest priority.
+  string xla_gpu_cuda_data_dir =
+      hlo_module_config.debug_options().xla_gpu_cuda_data_dir();
+  if (!xla_gpu_cuda_data_dir.empty()) {
+    potential_cuda_roots.insert(potential_cuda_roots.begin(),
+                                xla_gpu_cuda_data_dir);
+  }
+  return potential_cuda_roots;
+}
+
+// Returns the directory containing nvvm libdevice files.
+string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) {
+  for (const string& cuda_root : GetCudaRootCandidates(hlo_module_config)) {
+    string libdevice_dir =
+        tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice");
+    VLOG(2) << "Looking for libdevice at " << libdevice_dir;
+    if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) {
+      VLOG(2) << "Found libdevice dir " << libdevice_dir;
+      return libdevice_dir;
+    }
   }
 
   LOG(WARNING) << "Unable to find libdevice dir. Using '.'";
   // Last resort: maybe in the current folder.
   return ".";
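The ordering contract above is worth spelling out: a user-supplied --xla_gpu_cuda_data_dir is prepended, so it always wins over the build-time candidates. A hypothetical illustration with made-up paths:

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Suppose CandidateCudaRoots() yielded the build-time default...
  std::vector<std::string> roots = {"/usr/local/cuda"};
  // ...and the user passed --xla_gpu_cuda_data_dir=/opt/cuda-10.0.
  // GetCudaRootCandidates() prepends the flag value:
  roots.insert(roots.begin(), "/opt/cuda-10.0");
  for (const std::string& root : roots) std::cout << root << "\n";
  // Prints /opt/cuda-10.0 first: the user's directory is probed before
  // any build-time location.
}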
@@ -478,14 +484,19 @@ void WarnIfBadDriverJITVersion() {
 
 // Compiles the given PTX string using ptxas and returns the resulting machine
 // code (i.e. a cubin) as a byte array.
-StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
-                                        int cc_minor,
-                                        bool disable_ptx_optimizations) {
+StatusOr<std::vector<uint8>> CompilePtx(
+    const string& ptx, int cc_major, int cc_minor,
+    const HloModuleConfig& hlo_module_config) {
   tracing::ScopedActivity activity("Compile PTX", /*is_expensive=*/true);
-  const string ptxas_path =
-      tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin", "ptxas");
-  VLOG(2) << "Checking ptxas at " << ptxas_path;
   auto env = tensorflow::Env::Default();
+  string ptxas_path;
+  for (const string& cuda_root : GetCudaRootCandidates(hlo_module_config)) {
+    ptxas_path = tensorflow::io::JoinPath(cuda_root, "bin", "ptxas");
+    VLOG(2) << "Looking for ptxas at " << ptxas_path;
+    if (env->FileExists(ptxas_path).ok()) {
+      break;
+    }
+  }
   TF_RETURN_IF_ERROR(env->FileExists(ptxas_path));
   VLOG(2) << "Using ptxas at " << ptxas_path;
 
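One subtlety in the new ptxas lookup: if no candidate contains bin/ptxas, the loop leaves ptxas_path pointing at the last candidate probed, and the following TF_RETURN_IF_ERROR(env->FileExists(ptxas_path)) turns that into the returned error. The same search-then-verify shape in isolation, as a hypothetical standalone helper:

#include <fstream>
#include <string>
#include <vector>

// Hypothetical stand-in for env->FileExists(path).ok().
static bool FileExists(const std::string& path) {
  return std::ifstream(path).good();
}

// Returns the first "<root>/bin/ptxas" that exists. If none exists, the last
// probed path is returned so the caller's existence check can report a
// concrete file name in its error message.
static std::string FindPtxas(const std::vector<std::string>& cuda_roots) {
  std::string ptxas_path;
  for (const std::string& root : cuda_roots) {
    ptxas_path = root + "/bin/ptxas";
    if (FileExists(ptxas_path)) break;  // First hit wins.
  }
  return ptxas_path;  // Caller must re-check existence.
}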
@@ -520,7 +531,7 @@ StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
   if (VLOG_IS_ON(2)) {
     ptxas_args.push_back("-v");
   }
-  if (disable_ptx_optimizations) {
+  if (hlo_module_config.debug_options().xla_gpu_disable_ptxas_optimizations()) {
     ptxas_args.push_back("-O0");
   }
   ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
@@ -685,12 +696,8 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
    // Find the directory containing libdevice. To avoid searching for it every
    // time, we have a one-element cache, keyed on the module's config's
    // cuda_data_dir.
-    const auto& config_cuda_data_dir =
-        module->config().debug_options().xla_gpu_cuda_data_dir();
-    if (cached_libdevice_dir_.empty() ||
-        cached_cuda_data_dir_ != config_cuda_data_dir) {
-      cached_cuda_data_dir_ = config_cuda_data_dir;
-      cached_libdevice_dir_ = GetLibdeviceDir(config_cuda_data_dir);
+    if (cached_libdevice_dir_.empty()) {
+      cached_libdevice_dir_ = GetLibdeviceDir(module->config());
     }
     libdevice_dir = cached_libdevice_dir_;
   }
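Because GetLibdeviceDir() now folds the user's directory into the candidate list itself and never returns an empty string (it falls back to "."), the cache no longer needs the cached_cuda_data_dir_ key: cached_libdevice_dir_ is computed at most once per compiler instance. The memoization shape in isolation, as a hedged sketch that ignores any locking the real compiler performs:

#include <string>

// Hypothetical one-shot cache: Compute() runs on first use only; every later
// call reuses the stored value, which is never empty thanks to the "."
// fallback.
class LibdeviceDirCache {
 public:
  const std::string& Get() {
    if (cached_dir_.empty()) cached_dir_ = Compute();
    return cached_dir_;
  }

 private:
  static std::string Compute() {
    return "/usr/local/cuda/nvvm/libdevice";  // Placeholder result.
  }
  std::string cached_dir_;
};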
@@ -743,9 +750,8 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
     }
   }
 
-  const std::vector<uint8> cubin = CompilePtxOrGetCachedResult(
-      ptx, cc_major, cc_minor,
-      module->config().debug_options().xla_gpu_disable_ptxas_optimizations());
+  const std::vector<uint8> cubin =
+      CompilePtxOrGetCachedResult(ptx, cc_major, cc_minor, module->config());
 
   auto thunk_schedule = absl::make_unique<ThunkSchedule>(
       ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
@@ -779,7 +785,7 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
 
 std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(
     const string& ptx, int cc_major, int cc_minor,
-    bool disable_ptx_optimizations) {
+    const HloModuleConfig& hlo_module_config) {
   XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::CompilePtxOrGetCachedResult");
   tracing::ScopedActivity activity("PTX->CUBIN", /*is_expensive=*/true);
   bool inserted;
@@ -807,8 +813,8 @@ std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(
   if (inserted) {
     CHECK(!cache_value->compilation_done);
     if (!ptx.empty()) {
-      StatusOr<std::vector<uint8>> maybe_cubin = CompilePtx(
-          *cache_ptx, cc_major, cc_minor, disable_ptx_optimizations);
+      StatusOr<std::vector<uint8>> maybe_cubin =
+          CompilePtx(*cache_ptx, cc_major, cc_minor, hlo_module_config);
       if (maybe_cubin.ok()) {
         cache_value->cubin_data = std::move(maybe_cubin).ValueOrDie();
         VLOG(2) << "Compiled PTX size:" << ptx.size()
tensorflow/compiler/xla/service/gpu/nvptx_compiler.h

@@ -99,7 +99,7 @@ class NVPTXCompiler : public LLVMCompiler {
   // compiled cubin. If compilation was unsuccessful, returns an empty vector.
   std::vector<uint8> CompilePtxOrGetCachedResult(
       const string& ptx, int cc_major, int cc_minor,
-      bool disable_ptx_optimizations);
+      const HloModuleConfig& hlo_module_config);
 
   // The compilation_cache_ map is a cache from {ptx string, cc_major, cc_minor}
   // -> cubin so we don't recompile the same ptx twice. This is important for
tensorflow/core/BUILD

@@ -4062,20 +4062,6 @@ tf_cuda_cc_test(
     ],
 )
 
-tf_cc_test_gpu(
-    name = "cuda_libdevice_path_test",
-    size = "small",
-    srcs = ["platform/cuda_libdevice_path_test.cc"],
-    linkstatic = tf_kernel_tests_linkstatic(),
-    tags = tf_cuda_tests_tags(),
-    deps = [
-        ":cuda_libdevice_path",
-        ":lib",
-        ":test",
-        ":test_main",
-    ],
-)
-
 tf_cuda_only_cc_test(
     name = "util_cuda_kernel_helper_test",
     srcs = [
@@ -4931,7 +4917,7 @@ filegroup(
 
 cc_library(
     name = "cuda_libdevice_path",
-    srcs = ["platform/cuda_libdevice_path.cc"] + tf_additional_libdevice_srcs(),
+    srcs = tf_additional_libdevice_srcs(),
     hdrs = ["platform/cuda_libdevice_path.h"],
     copts = tf_copts(),
     data = tf_additional_libdevice_data(),
tensorflow/core/platform/cuda_libdevice_path.cc (deleted)

@@ -1,26 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/platform/cuda_libdevice_path.h"
-
-#include "tensorflow/core/lib/io/path.h"
-
-namespace tensorflow {
-
-string LibdeviceRoot() {
-  return tensorflow::io::JoinPath(tensorflow::CudaRoot(), "nvvm/libdevice");
-}
-
-}  // namespace tensorflow
tensorflow/core/platform/cuda_libdevice_path.h

@@ -16,16 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
 #define TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
 
+#include <vector>
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
-// Returns the root directory of the CUDA SDK, which contains sub-folders such
-// as bin, lib64, and nvvm.
-string CudaRoot();
-
-// Returns the directory that contains nvvm libdevice files in the CUDA SDK.
-string LibdeviceRoot();
+// Returns, in order of preference, potential locations of the root directory of
+// the CUDA SDK, which contains sub-folders such as bin, lib64, and nvvm.
+std::vector<string> CandidateCudaRoots();
 
 }  // namespace tensorflow
 
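The header no longer exposes LibdeviceRoot(); callers that previously consumed the single-path CudaRoot() now iterate the candidate list. A hedged usage sketch against the new header (FirstExistingCudaRoot is a hypothetical caller, not part of the API):

#include "tensorflow/core/platform/cuda_libdevice_path.h"
#include "tensorflow/core/platform/env.h"

// Hypothetical caller: pick the first candidate root that exists on disk.
tensorflow::string FirstExistingCudaRoot() {
  for (const tensorflow::string& root : tensorflow::CandidateCudaRoots()) {
    if (tensorflow::Env::Default()->IsDirectory(root).ok()) return root;
  }
  return ".";  // Mirrors the compiler's last-resort fallback.
}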
tensorflow/core/platform/cuda_libdevice_path_test.cc (deleted)

@@ -1,35 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/platform/cuda_libdevice_path.h"
-
-#include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-
-#if GOOGLE_CUDA
-TEST(CudaLibdevicePathTest, LibdevicePath) {
-  VLOG(2) << "Libdevice root = " << LibdeviceRoot();
-  std::vector<string> libdevice_files;
-  TF_EXPECT_OK(Env::Default()->GetMatchingPaths(
-      io::JoinPath(LibdeviceRoot(), "libdevice.*.bc"), &libdevice_files));
-  EXPECT_LT(0, libdevice_files.size());
-}
-#endif
-
-}  // namespace tensorflow
tensorflow/core/platform/default/cuda_libdevice_path.cc

@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cuda_libdevice_path.h"
 
 #include <stdlib.h>
+#include <vector>
 
 #if !defined(PLATFORM_GOOGLE)
 #include "cuda/cuda_config.h"
@@ -24,9 +25,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-string CudaRoot() {
+std::vector<string> CandidateCudaRoots() {
   VLOG(3) << "CUDA root = " << TF_CUDA_TOOLKIT_PATH;
-  return TF_CUDA_TOOLKIT_PATH;
+  return {TF_CUDA_TOOLKIT_PATH};
 }
 
 }  // namespace tensorflow
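The default implementation still yields exactly one root, the build-time TF_CUDA_TOOLKIT_PATH; the commit's payoff is the vector-valued signature, which lets a platform (or a follow-up change) return several candidates. A hypothetical multi-root variant might look like:

#include <string>
#include <vector>

// Hypothetical extension, NOT what this commit ships: return several
// plausible CUDA roots in priority order instead of the single
// build-time path.
std::vector<std::string> CandidateCudaRoots() {
  return {
      "/usr/local/cuda",  // Common symlink to the active toolkit.
      "/opt/cuda",        // Alternative install location on some distros.
  };
}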