/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/gpu/asm_compiler.h"

#include <string>
#include <tuple>
#include <unordered_set>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/cuda_libdevice_path.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/regexp.h"
#include "tensorflow/core/platform/subprocess.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/lib/statusor.h"

namespace stream_executor {

#if TENSORFLOW_USE_ROCM || defined(PLATFORM_WINDOWS)

port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options) {
  // TODO(b/134675935): Subprocess invocation not supported on Windows.
  return port::InternalError(
      "GPU asm compilation is only supported on CUDA platforms other than "
      "Windows");
}

port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
    int device_ordinal, const char* ptx, GpuAsmOpts compilation_options) {
  return CompileGpuAsm(device_ordinal, ptx, compilation_options);
}

#else

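// In this branch, PTX is compiled by shelling out to a ptxas binary found in
// the local CUDA installation.
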
// Prints a warning if the ptxas at ptxas_path has known bugs.
//
// Only prints a warning the first time it's called for a particular value of
// ptxas_path.
//
// Locks on entry.
static void WarnIfBadPtxasVersion(const std::string& ptxas_path) {
  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
  static std::unordered_set<std::string>* seen_ptxas_paths TF_GUARDED_BY(mu) =
      new std::unordered_set<std::string>();

  tensorflow::mutex_lock lock(mu);
  if (!seen_ptxas_paths->insert(ptxas_path).second) {
    // Already checked this ptxas binary; nothing to do.
    return;
  }

  tensorflow::SubProcess ptxas;
  ptxas.SetProgram(ptxas_path, {ptxas_path, "--version"});
  ptxas.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
  if (!ptxas.Start()) {
    LOG(WARNING) << "Couldn't invoke " << ptxas_path << " --version";
    return;
  }

  std::string out;
  int exit_code = ptxas.Communicate(/*stdin_input=*/nullptr, &out,
                                    /*stderr_output=*/nullptr);
  if (exit_code != 0) {
    LOG(WARNING) << "Running " << ptxas_path << " --version returned "
                 << exit_code;
    return;
  }

  int64 vmaj, vmin, vdot;
  std::string vmaj_str, vmin_str, vdot_str;
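  // `ptxas --version` prints a banner whose last token carries the full
  // version, roughly like this (sample output; exact wording varies by
  // release):
  //   Cuda compilation tools, release 9.2, V9.2.88
  // The regex below extracts the major/minor/dot fields from that "V9.2.88"
  // token.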
  if (!RE2::PartialMatch(out, R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str,
                         &vmin_str, &vdot_str) ||
      !absl::SimpleAtoi(vmaj_str, &vmaj) ||
      !absl::SimpleAtoi(vmin_str, &vmin) ||
      !absl::SimpleAtoi(vdot_str, &vdot)) {
    LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path
                 << " --version:\n"
                 << out;
    return;
  }

  // We need ptxas >= 9.0 as a hard requirement, because we compile targeting
  // PTX 6.0. An older ptxas will just fail to compile any of our code.
  //
  // ptxas 9.0 before 9.0.276 and ptxas 9.1 before 9.1.121 miscompile some
  // address calculations with large offsets (e.g. "load ptr + large_constant"),
  // b/70245379.
  //
  // ptxas 9.1.121 miscompiles some large multioutput fusions, again in a way
  // that appears related to address calculations, b/111107644. ptxas 9.2.88
  // appears to work, as far as we can tell.
  if (vmaj < 9) {
    LOG(ERROR)
        << "You are using ptxas " << vmaj << "." << vmin << "." << vdot
        << ", but TF requires ptxas 9.x (and strongly prefers >= 9.2.88). "
           "Compilation of XLA kernels below will likely fail.\n\nYou do not "
           "need to update CUDA; cherry-picking the ptxas binary is "
           "sufficient.";
  } else if (std::make_tuple(vmaj, vmin, vdot) < std::make_tuple(9, 2, 88)) {
    LOG(WARNING)
        << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "."
        << vdot
        << ", which is older than 9.2.88. ptxas 9.x before 9.2.88 is known to "
           "miscompile XLA code, leading to incorrect results or "
           "invalid-address errors.\n\nYou do not need to update to CUDA "
           "9.2.88; cherry-picking the ptxas binary is sufficient.";
  }
}

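// A minimal call-site sketch for this cached entry point (hypothetical; the
// device ordinal and `ptx_text` below are placeholders, not part of this
// file):
//
//   GpuAsmOpts opts;
//   port::StatusOr<absl::Span<const uint8>> cubin =
//       CompileGpuAsmOrGetCached(/*device_ordinal=*/0, ptx_text, opts);
//   if (cubin.ok()) {
//     // Hand the returned bytes to the GPU driver to load as a module.
//   }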
port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
    int device_ordinal, const char* ptx, GpuAsmOpts compilation_options) {
  using PtxCacheKey = std::tuple<int, std::string, GpuAsmOpts::PtxOptionsTuple>;
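  // The cache below is guarded by ptx_cache_mutex and intentionally leaked
  // (allocated with `new` and never deleted) so that no destructor runs at
  // process shutdown.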
  static tensorflow::mutex ptx_cache_mutex(tensorflow::LINKER_INITIALIZED);
  static auto& ptx_cache TF_GUARDED_BY(ptx_cache_mutex) =
      *new absl::flat_hash_map<PtxCacheKey, std::vector<uint8>>();

  tensorflow::mutex_lock lock(ptx_cache_mutex);
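  // The compiled cubin depends on the device (via its compute capability),
  // the PTX text itself, and the compilation flags, so all three go into the
  // cache key.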
  PtxCacheKey cache_key{device_ordinal, std::string(ptx),
                        compilation_options.ToTuple()};
  auto it = ptx_cache.find(cache_key);
  if (it == ptx_cache.end()) {
    TF_ASSIGN_OR_RETURN(
        std::vector<uint8> compiled,
        CompileGpuAsm(device_ordinal, ptx, compilation_options));
    it = ptx_cache.emplace(cache_key, std::move(compiled)).first;
  }

  CHECK(it != ptx_cache.end());
  const std::vector<uint8>& compiled = it->second;
  return absl::MakeSpan(compiled);
}

port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options) {
  gpu::GpuDeviceHandle handle;
  TF_RETURN_IF_ERROR(gpu::GpuDriver::GetDevice(device_ordinal, &handle));
  int cc_major;
  int cc_minor;
  TF_RETURN_IF_ERROR(
      gpu::GpuDriver::GetComputeCapability(&cc_major, &cc_minor, handle));
  return CompileGpuAsm(cc_major, cc_minor, ptx_contents, options);
}

port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options) {
  std::string ptxas_path;
  auto env = tensorflow::Env::Default();
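  // Search the candidate CUDA roots (starting with options.preferred_cuda_dir)
  // for a ptxas binary; if none is found, fall back to whatever "ptxas"
  // resolves to on $PATH.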
  for (const std::string& cuda_root :
       tensorflow::CandidateCudaRoots(options.preferred_cuda_dir)) {
    ptxas_path = tensorflow::io::JoinPath(cuda_root, "bin", "ptxas");
    VLOG(2) << "Looking for ptxas at " << ptxas_path;
    if (env->FileExists(ptxas_path).ok()) {
      break;
    }
  }
  if (!env->FileExists(ptxas_path).ok()) {
    // Rely on subprocess invocation to find the correct binary.
    ptxas_path = "ptxas";
  }
  VLOG(2) << "Using ptxas at " << ptxas_path;

  WarnIfBadPtxasVersion(ptxas_path);

  // Write the PTX into a temporary file.
  std::string ptx_path;
  if (!env->LocalTempFilename(&ptx_path)) {
    return port::InternalError("couldn't get temp PTX file name");
  }
  TF_RETURN_IF_ERROR(
      tensorflow::WriteStringToFile(env, ptx_path, ptx_contents));
  VLOG(2) << "ptx written to: " << ptx_path;

  auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] {
    TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path));
  });

  // Invoke ptxas and collect its output.
  std::string cubin_path;
  if (!env->LocalTempFilename(&cubin_path)) {
    return port::InternalError("couldn't get temp CUBIN file name");
  }
  auto cubin_cleaner = tensorflow::gtl::MakeCleanup([&cubin_path] {
    // The CUBIN file may never be created, so a failure to delete it should
    // not produce a TF error.
    tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError();
  });
  tensorflow::SubProcess ptxas_info_dumper;
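  // Assemble the ptxas command line. The "-arch=sm_XY" flag is the two
  // compute-capability digits concatenated, e.g. cc_major=7 and cc_minor=5
  // produce "-arch=sm_75".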
  std::vector<std::string> ptxas_args = {
      ptxas_path, ptx_path, "-o", cubin_path,
      absl::StrCat("-arch=sm_", cc_major, cc_minor)};
  if (VLOG_IS_ON(2)) {
    ptxas_args.push_back("-v");
  }
  if (options.disable_gpuasm_optimizations) {
    ptxas_args.push_back("-O0");
  }
  ptxas_args.insert(ptxas_args.end(), options.extra_flags.begin(),
                    options.extra_flags.end());
  if (VLOG_IS_ON(3)) {
    VLOG(3) << absl::StrJoin(ptxas_args, " ");
  }

  ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
  ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
                                     tensorflow::ACTION_PIPE);
  if (!ptxas_info_dumper.Start()) {
    return port::InternalError("Failed to launch ptxas");
  }
  std::string stderr_output;
  int exit_status = ptxas_info_dumper.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    return port::InternalError(
        absl::StrFormat("ptxas exited with non-zero error code %d, output: %s",
                        exit_status, stderr_output));
  }
  // Print the verbose output of ptxas.
  if (!stderr_output.empty()) {
    VLOG(2) << stderr_output;
  }

  // Read in the result of compilation and return it as a byte vector.
  std::string cubin;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  cubin_path, &cubin));
  std::vector<uint8> cubin_vector(cubin.begin(), cubin.end());
  return cubin_vector;
}

#endif

}  // namespace stream_executor