[TF:XLA] Instead of dumping ptxas output, write the compiled cubin to a temp file and use it to build the GPU executable. Switch to ptxas so that compilation no longer relies on the JIT in the NVIDIA driver.

Change: 155305963
A. Unique TensorFlower 2017-05-06 21:11:19 -08:00 committed by TensorFlower Gardener
parent 1968b8b3f9
commit 9d7e9b0e6b
5 changed files with 68 additions and 38 deletions
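
The core of the change is to stop merely dumping ptxas's -v statistics and instead use ptxas to assemble the PTX into SASS that the executable loads directly, so the NVIDIA driver's JIT is no longer needed at run time. A rough standalone sketch of that flow, not the TensorFlow implementation (the helper name CompilePtxWithPtxas, the file names, and the use of std::system are placeholders chosen for brevity):

#include <cstdlib>
#include <fstream>
#include <sstream>
#include <string>

// Standalone sketch only: write PTX to a scratch dir, run ptxas for the
// device's compute capability, and read the resulting cubin back.
// Error handling is reduced to returning an empty string.
std::string CompilePtxWithPtxas(const std::string& ptx, int cc_major, int cc_minor) {
  char tempdir_template[] = "/tmp/ptxXXXXXX";
  char* tempdir = mkdtemp(tempdir_template);  // POSIX; creates a private scratch dir
  if (tempdir == nullptr) return "";

  const std::string ptx_path = std::string(tempdir) + "/kernel.ptx";
  const std::string cubin_path = std::string(tempdir) + "/kernel.cubin";
  std::ofstream(ptx_path) << ptx;  // flushed and closed when the temporary stream dies

  // e.g. cc_major=3, cc_minor=5 -> "sm_35", matching ptxas's -arch syntax.
  const std::string arch = "sm_" + std::to_string(cc_major) + std::to_string(cc_minor);
  const std::string cmd = "ptxas -arch=" + arch + " -o " + cubin_path + " " + ptx_path;
  if (std::system(cmd.c_str()) != 0) return "";  // ptxas not found or PTX rejected

  std::ifstream cubin_file(cubin_path, std::ios::binary);
  std::ostringstream cubin;
  cubin << cubin_file.rdbuf();  // SASS bytes, ready to hand to the module loader
  return cubin.str();
}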

tensorflow/compiler/xla/service/gpu/gpu_compiler.cc

@@ -188,41 +188,52 @@ tensorflow::Status PrepareHloModuleForIrEmitting(
return pipeline.Run(hlo_module).status();
}
// Invokes the ptxas tool on the given PTX string, and dumps its output.
void DumpPtxasInfo(const string& ptx) {
// Invokes the ptxas tool on the given PTX string, and stores the resulting
// SASS in *cubin. If the VLOG level is 2 or greater, runs ptxas with -v and
// dumps the resulting stderr (which contains register allocation info, etc.)
// to VLOG(2). If the ptxas binary is not found, *cubin is set to "" and an
// error status is returned.
Status CompilePTX(const string& ptx, int cc_major, int cc_minor,
string* cubin) {
*cubin = "";
const string ptxas_path =
tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin/ptxas");
// Do not log PTX stats if ptxas is not found at the given path.
if (!tensorflow::Env::Default()->FileExists(ptxas_path).ok()) {
LOG(WARNING)
<< "Failed to dump PTX stats because ptxas is not found at path \""
<< ptxas_path << "\".";
return;
}
LOG(INFO) << "Invoking ptxas at path \"" << ptxas_path << "\".";
TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(ptxas_path));
// Write `ptx` into a temporary file.
char tempdir_template[] = "/tmp/ptxXXXXXX";
char* tempdir_name = mkdtemp(tempdir_template);
CHECK_NOTNULL(tempdir_name);
string ptx_path = tensorflow::io::JoinPath(tempdir_name, "ptx");
TF_CHECK_OK(
tensorflow::WriteStringToFile(tensorflow::Env::Default(), ptx_path, ptx));
LOG(INFO) << "ptx file written to: " << ptx_path;
// Invoke ptxas and collect its output.
tensorflow::SubProcess ptxas_info_dumper;
ptxas_info_dumper.SetProgram(ptxas_path, {ptxas_path, ptx_path, "-o",
"/dev/null", "-v", "-arch=sm_35"});
ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
tensorflow::ACTION_PIPE);
CHECK(ptxas_info_dumper.Start());
string stderr_output;
int exit_status = ptxas_info_dumper.Communicate(
/*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
XLA_LOG_LINES(tensorflow::INFO, stderr_output);
if (exit_status != 0) {
LOG(FATAL) << "Invalid PTX. See the error message above for reasons.";
tensorflow::SubProcess ptxas_info;
string arch = tensorflow::strings::StrCat("sm_", cc_major, cc_minor);
string cubin_path = tensorflow::io::JoinPath(tempdir_name, "cubin");
if (VLOG_IS_ON(2)) {
ptxas_info.SetProgram(ptxas_path, {ptxas_path, "-v", "-o", cubin_path,
"-arch", arch, ptx_path});
} else {
ptxas_info.SetProgram(
ptxas_path, {ptxas_path, "-o", cubin_path, "-arch", arch, ptx_path});
}
ptxas_info.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
CHECK(ptxas_info.Start());
string stderr_output;
int ptxas_exit_status = ptxas_info.Communicate(
/*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
TF_RET_CHECK(ptxas_exit_status == 0);
return tensorflow::ReadFileToString(tensorflow::Env::Default(), cubin_path,
cubin);
}
} // namespace
@@ -298,10 +309,14 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
// Reserve space for the PTX to be generated for this module.
string* ptx;
string* cubin;
{
tensorflow::mutex_lock lock(mutex_);
generated_ptxes_.emplace_back(MakeUnique<string>());
ptx = generated_ptxes_.back().get();
generated_cubins_.emplace_back(MakeUnique<string>());
cubin = generated_cubins_.back().get();
}
int cc_major, cc_minor;
if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major,
@@ -318,9 +333,6 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(llvm_module));
VLOG(2) << "PTX:";
XLA_VLOG_LINES(2, *ptx);
if (VLOG_IS_ON(2)) {
DumpPtxasInfo(*ptx);
}
auto thunk_schedule = MakeUnique<ThunkSchedule>(
ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
@@ -328,9 +340,13 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
VLOG(2) << "Printing the thunk schedule...";
XLA_VLOG_LINES(2, thunk_schedule->ToString());
TF_RET_CHECK(CompilePTX(*ptx, cc_major, cc_minor, cubin).ok());
auto* gpu_executable =
new GpuExecutable(*ptx, std::move(thunk_schedule), std::move(hlo_module),
new GpuExecutable(*cubin, *ptx, {cc_major, cc_minor},
std::move(thunk_schedule), std::move(hlo_module),
std::move(module_config), std::move(buffer_assignment));
if (flags->xla_gpu_embed_ir) {
DCHECK_NE("", ir_module_string_before_opt);
gpu_executable->set_ir_module_string(ir_module_string_before_opt);
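
One small detail in this file's first hunk: the -arch value handed to ptxas is built by concatenating the major and minor compute-capability digits. A toy example with assumed values (e.g. a Kepler-class sm_35 device):

#include <iostream>
#include <string>

int main() {
  int cc_major = 3, cc_minor = 5;  // assumed example capability
  // Mirrors StrCat("sm_", cc_major, cc_minor) in the hunk above.
  std::string arch = "sm_" + std::to_string(cc_major) + std::to_string(cc_minor);
  std::cout << arch << "\n";  // prints "sm_35"
  return 0;
}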

tensorflow/compiler/xla/service/gpu/gpu_compiler.h

@@ -71,6 +71,7 @@ class GpuCompiler : public Compiler {
// StreamExecutor (b/24776264).
tensorflow::mutex mutex_;
std::vector<std::unique_ptr<string>> generated_ptxes_ GUARDED_BY(mutex_);
std::vector<std::unique_ptr<string>> generated_cubins_ GUARDED_BY(mutex_);
// The size in bytes of a pointer. Used for computing ShapeSizeBytes.
int64 pointer_size_;
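
The new generated_cubins_ vector exists for the same reason as generated_ptxes_: GpuExecutable only holds StringPiece views, so the compiler must keep the backing strings alive, at stable addresses, for as long as its executables may run. A minimal sketch of that ownership pattern using standard-library types only (the Owner/AddGenerated names are illustrative, not TensorFlow APIs):

#include <memory>
#include <mutex>
#include <string>
#include <vector>

class Owner {
 public:
  // Returns a pointer that stays valid for the Owner's lifetime: the string is
  // heap-allocated and never moved, even when the vector itself reallocates.
  std::string* AddGenerated(std::string text) {
    std::lock_guard<std::mutex> lock(mu_);
    generated_.push_back(std::make_unique<std::string>(std::move(text)));
    return generated_.back().get();
  }

 private:
  std::mutex mu_;
  std::vector<std::unique_ptr<std::string>> generated_;
};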

tensorflow/compiler/xla/service/gpu/gpu_executable.cc

@@ -107,13 +107,17 @@ class HloExecutionProfiler {
// Implementation note: HLO profiling is always enabled for GPU executables,
// since we can use timers around thunks.
GpuExecutable::GpuExecutable(tensorflow::StringPiece ptx,
GpuExecutable::GpuExecutable(tensorflow::StringPiece cubin,
tensorflow::StringPiece ptx,
std::pair<int, int> compute_capability,
std::unique_ptr<ThunkSchedule> thunk_schedule,
std::unique_ptr<HloModule> hlo_module,
std::unique_ptr<HloModuleConfig> module_config,
std::unique_ptr<BufferAssignment> assignment)
: Executable(std::move(hlo_module), std::move(module_config)),
cubin_(cubin),
ptx_(ptx),
compute_capability_(compute_capability),
thunk_schedule_(std::move(thunk_schedule)),
assignment_(std::move(assignment)) {}
@@ -186,6 +190,13 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteOnStream(
// false.
TF_RET_CHECK(!module_config().has_hybrid_result());
// Ensure the compute capability of the cubin and the stream match.
std::pair<int, int> stream_compute_compatibility;
stream->parent()->GetDeviceDescription().cuda_compute_capability(
&stream_compute_compatibility.first,
&stream_compute_compatibility.second);
TF_RET_CHECK(stream_compute_compatibility == compute_capability_);
BufferAllocations::Builder buffer_allocations_builder;
for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
++i) {
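
The new check compares the compute capability the cubin was assembled for with the capability of the device behind the stream; because SASS is architecture-specific, a mismatch has to be rejected rather than silently loaded. A tiny sketch of the comparison, with assumed capability values:

#include <iostream>
#include <utility>

int main() {
  std::pair<int, int> cubin_cc{3, 5};   // capability the cubin was built for
  std::pair<int, int> device_cc{6, 0};  // capability reported for the stream's device
  // std::pair's operator== compares both members, which is all the
  // TF_RET_CHECK in the hunk above relies on.
  if (cubin_cc != device_cc) {
    std::cout << "cubin/device compute capability mismatch\n";
  }
  return 0;
}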

tensorflow/compiler/xla/service/gpu/gpu_executable.h

@@ -40,15 +40,17 @@ limitations under the License.
namespace xla {
namespace gpu {
// GPU-targeting implementation of the XLA Executable interface.
//
// Launches the given CUDA kernel via the StreamExecutor.
//
// This is an immutable data type after initialization, and thus thread safe.
// GpuExecutable should eventually be updated to associate a compute
// capability with the PTX and store multiple cubins, each with its own
// associated CC, rather than including the CC as a property of GpuExecutable.
class GpuExecutable : public Executable {
public:
GpuExecutable(tensorflow::StringPiece ptx,
GpuExecutable(tensorflow::StringPiece cubin, tensorflow::StringPiece ptx,
std::pair<int, int> compute_capability,
std::unique_ptr<ThunkSchedule> thunk_schedule,
std::unique_ptr<HloModule> hlo_module,
std::unique_ptr<HloModuleConfig> module_config,
@@ -62,7 +64,8 @@ class GpuExecutable : public Executable {
ir_module_string_ = ir_module_string;
}
// Returns the compiled PTX for the computation.
// Returns the compiled CUDA binary for the computation.
tensorflow::StringPiece cubin() const { return cubin_; }
tensorflow::StringPiece ptx() const { return ptx_; }
StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
@@ -104,8 +107,10 @@ class GpuExecutable : public Executable {
// This string should be modified only before ExecuteOnStream.
string ir_module_string_;
// The reference to the compiled PTX for the computation.
const tensorflow::StringPiece ptx_;
// The reference to the compiled PTX & CUDA binary for the computation.
tensorflow::StringPiece cubin_;
tensorflow::StringPiece ptx_;
std::pair<int, int> compute_capability_;
// The thunks to be invoked by this GpuExecutable. They are generated by the
// IrEmitter.
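
The new class comment points at a future shape for this type: one executable carrying several cubins keyed by compute capability, instead of a single cubin plus a single CC pair. A purely hypothetical sketch of such a container (none of these names are part of the GpuExecutable API):

#include <map>
#include <string>
#include <utility>

class MultiArchBinary {
 public:
  void AddCubin(std::pair<int, int> cc, std::string cubin) {
    cubins_[cc] = std::move(cubin);
  }
  // Returns the cubin built for exactly this capability, or nullptr if the
  // caller would have to fall back to JIT-compiling the PTX instead.
  const std::string* FindCubin(std::pair<int, int> cc) const {
    auto it = cubins_.find(cc);
    return it == cubins_.end() ? nullptr : &it->second;
  }

 private:
  std::map<std::pair<int, int>, std::string> cubins_;
};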

tensorflow/compiler/xla/service/gpu/kernel_thunk.cc

@@ -41,13 +41,10 @@ tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) {
// Already initialized by another thread.
return tensorflow::Status::OK();
}
loader_spec_.reset(new se::MultiKernelLoaderSpec(io_buffers_.size() + 1));
tensorflow::StringPiece ptx = executable.ptx();
// Convert tensorflow::StringPiece to se::port::StringPiece because
// StreamExecutor uses the latter.
loader_spec_->AddCudaPtxInMemory(
se::port::StringPiece(ptx.data(), ptx.size()), kernel_name_);
tensorflow::StringPiece cubin = executable.cubin();
loader_spec_->AddCudaCubinInMemory(cubin.data(), kernel_name_);
return tensorflow::Status::OK();
}
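
For context on why the loader spec now receives a cubin instead of PTX: at module-load time the CUDA driver can take either form, but PTX has to go through the driver's built-in JIT compiler, while an already-assembled cubin is simply loaded. A minimal sketch with the plain CUDA driver API (not StreamExecutor; it assumes a CUDA context is already current and reduces error handling to a null return):

#include <cuda.h>

// Assumes cuInit(0) has been called and a CUcontext is current on this thread.
CUmodule LoadKernelImage(const void* image) {
  CUmodule module = nullptr;
  // cuModuleLoadData accepts either an ELF cubin or NUL-terminated PTX text.
  // Handing it a cubin (as KernelThunk now does via StreamExecutor) skips the
  // driver JIT; handing it PTX would trigger a JIT compile at load time.
  if (cuModuleLoadData(&module, image) != CUDA_SUCCESS) {
    return nullptr;
  }
  return module;
}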