[TF:XLA] Instead of dumping ptxas output, write it to a temp file and use it to build the GPU executable. Switch to ptxas to avoid relying on the JIT in the NVIDIA driver for compilation.
Change: 155305963
parent 1968b8b3f9
commit 9d7e9b0e6b
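
Sketch of the mechanism (illustration only, not part of this change): the compiler writes the generated PTX to a temporary directory, shells out to ptxas to assemble it into a cubin, and reads the cubin back so it can be embedded in the GPU executable. The self-contained approximation below uses only plain C++ and POSIX; the helper name CompilePtxWithPtxas, the file names under the temp directory, and the placeholder PTX are illustrative assumptions, and ptxas is assumed to be on PATH (the real code resolves it under the CUDA install root).

// Standalone sketch of the ptxas round-trip: write PTX to a temp directory,
// run ptxas on it, and read the resulting cubin back into memory.
#include <cstdio>
#include <cstdlib>    // std::system; mkdtemp on Linux
#include <fstream>
#include <sstream>
#include <string>
#include <unistd.h>   // mkdtemp on some platforms

// Compiles `ptx` for compute capability cc_major.cc_minor and fills *cubin
// with the SASS binary on success. Assumes ptxas is on PATH.
bool CompilePtxWithPtxas(const std::string& ptx, int cc_major, int cc_minor,
                         std::string* cubin) {
  char tempdir_template[] = "/tmp/ptxXXXXXX";
  char* tempdir = mkdtemp(tempdir_template);
  if (tempdir == nullptr) return false;

  const std::string ptx_path = std::string(tempdir) + "/kernel.ptx";
  const std::string cubin_path = std::string(tempdir) + "/kernel.cubin";
  std::ofstream(ptx_path) << ptx;  // write the PTX to disk

  // Invoke ptxas, targeting the requested compute capability.
  std::ostringstream cmd;
  cmd << "ptxas -o " << cubin_path << " -arch=sm_" << cc_major << cc_minor
      << " " << ptx_path;
  if (std::system(cmd.str().c_str()) != 0) return false;

  // Read the assembled cubin back into memory.
  std::ifstream in(cubin_path, std::ios::binary);
  std::ostringstream buf;
  buf << in.rdbuf();
  *cubin = buf.str();
  return !cubin->empty();
}

int main() {
  // Placeholder PTX with a single empty kernel; real input would come from
  // the LLVM NVPTX backend that XLA runs earlier in its pipeline.
  const std::string ptx =
      ".version 4.2\n.target sm_35\n.address_size 64\n"
      ".visible .entry noop() { ret; }\n";
  std::string cubin;
  if (CompilePtxWithPtxas(ptx, 3, 5, &cubin)) {
    std::printf("cubin size: %zu bytes\n", cubin.size());
  } else {
    std::printf("ptxas compilation failed\n");
  }
  return 0;
}

Because SASS produced this way is specific to one compute capability, the change also records {cc_major, cc_minor} on the GpuExecutable and verifies it against the stream's device at execution time.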
@@ -188,41 +188,52 @@ tensorflow::Status PrepareHloModuleForIrEmitting(
   return pipeline.Run(hlo_module).status();
 }
 
-// Invokes the ptxas tool on the given PTX string, and dumps its output.
-void DumpPtxasInfo(const string& ptx) {
+// Invokes the ptxas tool on the given PTX string, and stores the resulting
+// SASS in *cubin. If -v 2 or greater, runs ptxas with -v and dumps the
+// resulting stderr (which contains register allocation info, etc.)
+// to VLOG(2). If ptxas binary is not found *sass is set to "".
+Status CompilePTX(const string& ptx, int cc_major, int cc_minor,
+                  string* cubin) {
+  *cubin = "";
   const string ptxas_path =
       tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin/ptxas");
 
   // Do not log PTX stats if ptxas is not found at the given path.
-  if (!tensorflow::Env::Default()->FileExists(ptxas_path).ok()) {
-    LOG(WARNING)
-        << "Failed to dump PTX stats because ptxas is not found at path \""
-        << ptxas_path << "\".";
-    return;
-  }
+  LOG(INFO) << "Invoking ptxas at path \"" << ptxas_path << "\".";
+  TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(ptxas_path));
 
   // Write `ptx` into a temporary file.
   char tempdir_template[] = "/tmp/ptxXXXXXX";
   char* tempdir_name = mkdtemp(tempdir_template);
   CHECK_NOTNULL(tempdir_name);
   string ptx_path = tensorflow::io::JoinPath(tempdir_name, "ptx");
 
   TF_CHECK_OK(
       tensorflow::WriteStringToFile(tensorflow::Env::Default(), ptx_path, ptx));
   LOG(INFO) << "ptx file written to: " << ptx_path;
 
   // Invoke ptxas and collect its output.
-  tensorflow::SubProcess ptxas_info_dumper;
-  ptxas_info_dumper.SetProgram(ptxas_path, {ptxas_path, ptx_path, "-o",
-                                            "/dev/null", "-v", "-arch=sm_35"});
-  ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
-                                     tensorflow::ACTION_PIPE);
-  CHECK(ptxas_info_dumper.Start());
-  string stderr_output;
-  int exit_status = ptxas_info_dumper.Communicate(
-      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
-  XLA_LOG_LINES(tensorflow::INFO, stderr_output);
-  if (exit_status != 0) {
-    LOG(FATAL) << "Invalid PTX. See the error message above for reasons.";
-  }
+  tensorflow::SubProcess ptxas_info;
+  string arch = tensorflow::strings::StrCat("sm_", cc_major, cc_minor);
+  string cubin_path = tensorflow::io::JoinPath(tempdir_name, "cubin");
+
+  if (VLOG_IS_ON(2)) {
+    ptxas_info.SetProgram(ptxas_path, {ptxas_path, "-v", "-o", cubin_path,
+                                       "-arch", arch, ptx_path});
+  } else {
+    ptxas_info.SetProgram(
+        ptxas_path, {ptxas_path, "-o", cubin_path, "-arch", arch, ptx_path});
+  }
+  ptxas_info.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
+  CHECK(ptxas_info.Start());
+  string stderr_output;
+  int ptxas_exit_status = ptxas_info.Communicate(
+      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
+
+  TF_RET_CHECK(ptxas_exit_status == 0);
+  return tensorflow::ReadFileToString(tensorflow::Env::Default(), cubin_path,
+                                      cubin);
 }
 
 }  // namespace
@@ -298,10 +309,14 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
 
   // Reserve space for the PTX to be generated for this module.
   string* ptx;
+  string* cubin;
   {
     tensorflow::mutex_lock lock(mutex_);
     generated_ptxes_.emplace_back(MakeUnique<string>());
     ptx = generated_ptxes_.back().get();
+
+    generated_cubins_.emplace_back(MakeUnique<string>());
+    cubin = generated_cubins_.back().get();
   }
   int cc_major, cc_minor;
   if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major,
@@ -318,9 +333,6 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(llvm_module));
   VLOG(2) << "PTX:";
   XLA_VLOG_LINES(2, *ptx);
-  if (VLOG_IS_ON(2)) {
-    DumpPtxasInfo(*ptx);
-  }
 
   auto thunk_schedule = MakeUnique<ThunkSchedule>(
       ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
@@ -328,9 +340,13 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   VLOG(2) << "Printing the thunk schedule...";
   XLA_VLOG_LINES(2, thunk_schedule->ToString());
 
+  TF_RET_CHECK(CompilePTX(*ptx, cc_major, cc_minor, cubin).ok());
+
   auto* gpu_executable =
-      new GpuExecutable(*ptx, std::move(thunk_schedule), std::move(hlo_module),
+      new GpuExecutable(*cubin, *ptx, {cc_major, cc_minor},
+                        std::move(thunk_schedule), std::move(hlo_module),
                         std::move(module_config), std::move(buffer_assignment));
 
   if (flags->xla_gpu_embed_ir) {
     DCHECK_NE("", ir_module_string_before_opt);
     gpu_executable->set_ir_module_string(ir_module_string_before_opt);
@@ -71,6 +71,7 @@ class GpuCompiler : public Compiler {
   // StreamExecutor (b/24776264).
   tensorflow::mutex mutex_;
   std::vector<std::unique_ptr<string>> generated_ptxes_ GUARDED_BY(mutex_);
+  std::vector<std::unique_ptr<string>> generated_cubins_ GUARDED_BY(mutex_);
 
   // The size in bytes of a pointer. Used for computing ShapeSizeBytes.
   int64 pointer_size_;
@@ -107,13 +107,17 @@ class HloExecutionProfiler {
 
 // Implementation note: HLO profiling is always enabled for GPU executables,
 // since we can use timers around thunks.
-GpuExecutable::GpuExecutable(tensorflow::StringPiece ptx,
+GpuExecutable::GpuExecutable(tensorflow::StringPiece cubin,
+                             tensorflow::StringPiece ptx,
+                             std::pair<int, int> compute_capability,
                              std::unique_ptr<ThunkSchedule> thunk_schedule,
                              std::unique_ptr<HloModule> hlo_module,
                              std::unique_ptr<HloModuleConfig> module_config,
                              std::unique_ptr<BufferAssignment> assignment)
     : Executable(std::move(hlo_module), std::move(module_config)),
+      cubin_(cubin),
       ptx_(ptx),
+      compute_capability_(compute_capability),
       thunk_schedule_(std::move(thunk_schedule)),
       assignment_(std::move(assignment)) {}
 
@@ -186,6 +190,13 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteOnStream(
   // false.
   TF_RET_CHECK(!module_config().has_hybrid_result());
 
+  // Ensure the compute capability of the cubin and the stream match.
+  std::pair<int, int> stream_compute_compatibility;
+  stream->parent()->GetDeviceDescription().cuda_compute_capability(
+      &stream_compute_compatibility.first,
+      &stream_compute_compatibility.second);
+  TF_RET_CHECK(stream_compute_compatibility == compute_capability_);
+
   BufferAllocations::Builder buffer_allocations_builder;
   for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
        ++i) {
@@ -40,15 +40,17 @@ limitations under the License.
 
 namespace xla {
 namespace gpu {
 
 // GPU-targeting implementation of the XLA Executable interface.
 //
 // Launches the given CUDA kernel via the StreamExecutor.
-//
-// This is an immutable data type after initialization, and thus thread safe.
+// GPUExecutable should eventually be updated to associate a compute
+// capability with the PTX and store multiple cubins, each with their own
+// associated CC's, rather than including CC as a property of GpuExecutable.
 class GpuExecutable : public Executable {
  public:
-  GpuExecutable(tensorflow::StringPiece ptx,
+  GpuExecutable(tensorflow::StringPiece cubin, tensorflow::StringPiece ptx,
+                std::pair<int, int> compute_capability,
                 std::unique_ptr<ThunkSchedule> thunk_schedule,
                 std::unique_ptr<HloModule> hlo_module,
                 std::unique_ptr<HloModuleConfig> module_config,
@@ -62,7 +64,8 @@ class GpuExecutable : public Executable {
     ir_module_string_ = ir_module_string;
   }
 
-  // Returns the compiled PTX for the computation.
+  // Returns the compiled CUDA binary for the computation.
+  tensorflow::StringPiece cubin() const { return cubin_; }
   tensorflow::StringPiece ptx() const { return ptx_; }
 
   StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
@@ -104,8 +107,10 @@ class GpuExecutable : public Executable {
   // This string should be modified only before ExecuteOnStream.
   string ir_module_string_;
 
-  // The reference to the compiled PTX for the computation.
-  const tensorflow::StringPiece ptx_;
+  // The reference to the compiled PTX & CUDA binary for the computation.
+  tensorflow::StringPiece cubin_;
+  tensorflow::StringPiece ptx_;
+  std::pair<int, int> compute_capability_;
 
   // The thunks to be invoked by this GpuExecutable. They are generated by the
   // IrEmitter.
@@ -41,13 +41,10 @@ tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) {
     // Already initialized by another thread.
     return tensorflow::Status::OK();
   }
 
   loader_spec_.reset(new se::MultiKernelLoaderSpec(io_buffers_.size() + 1));
-  tensorflow::StringPiece ptx = executable.ptx();
-  // Convert tensorflow::StringPiece to se::port::StringPiece because
-  // StreamExecutor uses the latter.
-  loader_spec_->AddCudaPtxInMemory(
-      se::port::StringPiece(ptx.data(), ptx.size()), kernel_name_);
+  tensorflow::StringPiece cubin = executable.cubin();
+  loader_spec_->AddCudaCubinInMemory(cubin.data(), kernel_name_);
 
   return tensorflow::Status::OK();
 }
|