[XLA/GPU] Add parallel compilation for LLVM and ptxas.
PiperOrigin-RevId: 346340049 Change-Id: I09a23a345d60390677167b6f67695912e3aa419e
parent ba81a9f785 · commit 05baa49b08
@@ -115,6 +115,16 @@ class ExecutableBuildOptions {
    return *this;
  }

  // Thread pool for parallel compilation.
  tensorflow::thread::ThreadPool* compile_thread_pool() const {
    return compile_thread_pool_;
  }
  ExecutableBuildOptions& set_compile_thread_pool(
      tensorflow::thread::ThreadPool* compile_thread_pool) {
    compile_thread_pool_ = compile_thread_pool;
    return *this;
  }

 private:
  int device_ordinal_ = -1;
  Shape result_layout_;
@@ -128,6 +138,7 @@ class ExecutableBuildOptions {
  absl::optional<DeviceAssignment> device_assignment_;
  bool alias_passthrough_params_ = false;
  bool run_backend_only_ = false;
  tensorflow::thread::ThreadPool* compile_thread_pool_ = nullptr;
};

}  // namespace xla
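With the new accessor pair in place, a client opts into parallel backend compilation by attaching a thread pool to its build options. A minimal sketch, assuming the header path shown below and the setter name set_compile_thread_pool from the hunk above; the pool is borrowed, not owned, so it must outlive the Compile() call:

#include "tensorflow/compiler/xla/client/executable_build_options.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/threadpool.h"

// Attach a borrowed worker pool to the build options.
xla::ExecutableBuildOptions MakeParallelBuildOptions(
    tensorflow::thread::ThreadPool* compile_pool) {
  xla::ExecutableBuildOptions build_options;
  build_options.set_device_ordinal(0);
  build_options.set_compile_thread_pool(compile_pool);  // not owned
  return build_options;
}

// Caller side (sketch):
//   tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(),
//                                       "xla_compile", /*num_threads=*/8);
//   auto build_options = MakeParallelBuildOptions(&pool);
//   // build_options is then passed to LocalClient::Compile() as usual.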
@@ -42,6 +42,7 @@ limitations under the License.
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/threadpool.h"

namespace xla {

@@ -165,6 +166,9 @@ class Compiler {
    // given algorithm over those buffers, to see which variant is fastest. Any
    // space allocated will be deallocated before the compilation returns.
    se::DeviceMemoryAllocator* device_allocator = nullptr;

    // An optional thread pool for parallel compilation.
    tensorflow::thread::ThreadPool* thread_pool = nullptr;
  };

  virtual ~Compiler() {}
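CompileOptions replaces the bare device_allocator argument on the compiler entry points (see the service changes further down), so call sites group the two fields with aggregate initialization. A rough sketch, inside a function returning a Status; allocator, compile_pool, compiler, stream_exec, and module are assumed to already exist at the call site:

// Both fields are optional: leaving thread_pool null keeps the old
// single-threaded backend compilation path.
Compiler::CompileOptions compile_options{/*device_allocator=*/allocator,
                                         /*thread_pool=*/&compile_pool};

TF_ASSIGN_OR_RETURN(module, compiler->RunHloPasses(std::move(module),
                                                   stream_exec, compile_options));
TF_ASSIGN_OR_RETURN(
    std::unique_ptr<Executable> executable,
    compiler->RunBackend(std::move(module), stream_exec, compile_options));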
@@ -1452,7 +1452,11 @@ cc_library(
        "//tensorflow/stream_executor:stream_executor_headers",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/strings",
        "@llvm-project//llvm:AsmParser",
        "@llvm-project//llvm:BitReader",
        "@llvm-project//llvm:BitWriter",
        "@llvm-project//llvm:Core",
        "@llvm-project//llvm:TransformUtils",
        "@llvm-project//mlir:AllPassesAndDialectsNoRegistration",
        "@llvm-project//mlir:IR",
    ],
@@ -1517,7 +1521,7 @@ cc_library(
        "//tensorflow/stream_executor:stream_executor_headers",
        "//tensorflow/stream_executor/cuda:cuda_diagnostics",
        "//tensorflow/stream_executor/gpu:asm_compiler",
    ]),
    ]) + ["//tensorflow/stream_executor/gpu:gpu_driver_header"],
)

cc_library(
@@ -108,12 +108,17 @@ StatusOr<std::pair<std::string, std::vector<uint8>>>
AMDGPUCompiler::CompileTargetBinary(const HloModule* module,
                                    llvm::Module* llvm_module,
                                    GpuVersion gpu_version,
                                    se::StreamExecutor* stream_exec) {
                                    se::StreamExecutor* stream_exec,
                                    bool relocatable) {
  if (rocdl_dir_.empty()) {
    // Compute rocdl_dir_ just once and cache it in this member.
    rocdl_dir_ = GetROCDLDir(module->config());
  }

  if (relocatable) {
    return Unimplemented("relocatable target binary is not implemented");
  }

  std::vector<uint8> hsaco;
  {
    XLA_SCOPED_LOGGING_TIMER(
@@ -41,7 +41,8 @@ class AMDGPUCompiler : public GpuCompiler {

  StatusOr<std::pair<std::string, std::vector<uint8>>> CompileTargetBinary(
      const HloModule* hlo_module, llvm::Module* llvm_module,
      GpuVersion gpu_version, se::StreamExecutor* stream_exec) override;
      GpuVersion gpu_version, se::StreamExecutor* stream_exec,
      bool relocatable) override;

 private:
  // The parent directory of ROCm-Device-Libs IR libraries.
@@ -24,11 +24,15 @@ limitations under the License.
#include "absl/memory/memory.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Transforms/Utils/SplitModule.h"
#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
#include "mlir/InitAllDialects.h"  // from @llvm-project
#include "tensorflow/compiler/xla/protobuf_util.h"
@@ -114,11 +118,13 @@ limitations under the License.
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/blocking_counter.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/regexp.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/subprocess.h"
#include "tensorflow/core/platform/threadpool.h"
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/core/util/env_var.h"
@@ -641,6 +647,124 @@ static Status CompileModuleToLlvmIrImpl(
  return Status::OK();
}

StatusOr<std::pair<std::string, std::vector<uint8>>>
GpuCompiler::CompileToTargetBinary(const HloModule& module,
                                   std::unique_ptr<llvm::Module> llvm_module,
                                   se::StreamExecutor* stream_exec,
                                   const CompileOptions& options) {
  using BackendCompileResult = std::pair<std::string, std::vector<uint8>>;

  const auto compile_single_module =
      [this, stream_exec, &module](
          llvm::Module* llvm_module,
          bool relocatable) -> StatusOr<BackendCompileResult> {
    {
      XLA_SCOPED_LOGGING_TIMER(
          "GpuCompiler::RunBackend - Running LLVM verifier");

      std::string err;
      llvm::raw_string_ostream err_stream(err);

      // verifyModule() returns true if the module is broken.
      TF_RET_CHECK(!llvm::verifyModule(*llvm_module, &err_stream))
          << "Invalid LLVM IR before optimizations:\n"
          << err_stream.str()
          << "\nThis probably indicates a bug in the HLO -> LLVM IR "
             "lowering. "
             "Rerun with --xla_dump_to to get the IR and look for files "
             "with "
             "name containing: *"
          << FilenameFor(module, "", "") << "*";
    }
    GpuVersion gpu_version = GetGpuVersion(stream_exec);
    return CompileTargetBinary(&module, llvm_module, gpu_version, stream_exec,
                               relocatable);
  };

  tensorflow::thread::ThreadPool* thread_pool = options.thread_pool;
  if (!thread_pool) {
    return compile_single_module(llvm_module.get(), /*relocatable=*/false);
  }

  // Test whether LinkModules is supported.
  if (this->LinkModules(stream_exec, {}).status().code() ==
      tensorflow::error::Code::UNIMPLEMENTED) {
    return compile_single_module(llvm_module.get(), /*relocatable=*/false);
  }

  std::vector<std::unique_ptr<llvm::Module>> llvm_modules;
  int num_functions = 0;
  for (llvm::Function& func : llvm_module->functions()) {
    if (!func.isDeclaration() &&
        func.getLinkage() == llvm::GlobalValue::LinkageTypes::ExternalLinkage) {
      num_functions++;
    }
  }

  llvm::SplitModule(
      std::move(llvm_module),
      std::max<unsigned>(
          1, std::min<unsigned>(thread_pool->NumThreads(), num_functions)),
      [&](std::unique_ptr<llvm::Module> module) {
        llvm_modules.push_back(std::move(module));
      },
      /*PreserveLocals=*/true);

  std::vector<StatusOr<BackendCompileResult>> compile_results(
      llvm_modules.size());
  tensorflow::BlockingCounter counter(llvm_modules.size());
  for (int i = 0; i < llvm_modules.size(); i++) {
    thread_pool->Schedule([&compile_results, compile_single_module, i,
                           &llvm_modules, &counter] {
      llvm::Module* original_module = llvm_modules[i].get();
      llvm::LLVMContext context;
      std::string buffer;
      llvm::raw_string_ostream error(buffer);
      llvm::DiagnosticPrinterRawOStream printer(error);
      auto DiagnosticHandler = [](const llvm::DiagnosticInfo& diag_info,
                                  void* Context) {
        auto printer = static_cast<llvm::DiagnosticPrinterRawOStream*>(Context);
        diag_info.print(*printer);
      };
      context.setDiagnosticHandlerCallBack(DiagnosticHandler, &printer);

      std::unique_ptr<llvm::Module> new_llvm_module;
      {
        std::string ir;
        {
          llvm::raw_string_ostream os(ir);
          original_module->print(os, nullptr);
        }
        llvm::SMDiagnostic err;
        new_llvm_module = llvm::parseAssemblyString(ir, err, context);
      }

      compile_results[i] =
          compile_single_module(new_llvm_module.get(), /*relocatable=*/true);
      counter.DecrementCount();
    });
  }
  counter.Wait();

  std::string ptx_snippets;
  std::vector<std::vector<uint8>> submodule_compile_results;
  for (auto& maybe_result : compile_results) {
    TF_ASSIGN_OR_RETURN(auto result, maybe_result);
    if (result.second.empty()) {
      continue;
    }
    ptx_snippets += result.first;
    ptx_snippets += "\n";
    submodule_compile_results.push_back(result.second);
  }

  TF_ASSIGN_OR_RETURN(
      std::vector<uint8> backend_result,
      this->LinkModules(stream_exec, std::move(submodule_compile_results)));

  return std::make_pair(ptx_snippets, backend_result);
}

StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
    const CompileOptions& options) {
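CompileToTargetBinary above is the core of the change: it splits the optimized LLVM module into roughly one piece per worker, compiles each piece as relocatable code on the thread pool, and hands the resulting cubins to LinkModules. Because llvm::LLVMContext is not thread-safe, each task round-trips its piece through textual IR into a private context before compiling. A condensed, self-contained sketch of that pattern; CompileOnePart is a hypothetical stand-in for the CompileTargetBinary call:

#include <algorithm>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/SplitModule.h"
#include "tensorflow/core/platform/blocking_counter.h"
#include "tensorflow/core/platform/threadpool.h"

// Hypothetical per-piece backend step (stands in for CompileTargetBinary).
std::vector<uint8_t> CompileOnePart(llvm::Module* part);

std::vector<std::vector<uint8_t>> CompileInParallel(
    std::unique_ptr<llvm::Module> module, tensorflow::thread::ThreadPool* pool) {
  // Split into at most NumThreads() pieces; PreserveLocals keeps module-local
  // symbols from being shared (and thus broken) across pieces.
  std::vector<std::unique_ptr<llvm::Module>> parts;
  llvm::SplitModule(
      std::move(module),
      std::max<unsigned>(1u, static_cast<unsigned>(pool->NumThreads())),
      [&](std::unique_ptr<llvm::Module> part) { parts.push_back(std::move(part)); },
      /*PreserveLocals=*/true);

  std::vector<std::vector<uint8_t>> results(parts.size());
  tensorflow::BlockingCounter counter(parts.size());
  for (int i = 0; i < static_cast<int>(parts.size()); ++i) {
    pool->Schedule([&, i] {
      // Re-parse this piece into a private LLVMContext, since a context must
      // not be touched from two threads at once.
      std::string ir;
      {
        llvm::raw_string_ostream os(ir);
        parts[i]->print(os, /*AAW=*/nullptr);
      }
      llvm::LLVMContext context;
      llvm::SMDiagnostic err;
      std::unique_ptr<llvm::Module> local =
          llvm::parseAssemblyString(ir, err, context);
      results[i] = CompileOnePart(local.get());
      counter.DecrementCount();
    });
  }
  counter.Wait();  // all relocatable results are ready to be linked
  return results;
}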
@@ -650,15 +774,6 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
  TF_RET_CHECK(stream_exec != nullptr);

  llvm::LLVMContext llvm_context;
  std::string buffer;
  llvm::raw_string_ostream error(buffer);
  llvm::DiagnosticPrinterRawOStream printer(error);
  auto DiagnosticHandler = [](const llvm::DiagnosticInfo& diag_info,
                              void* Context) {
    auto printer = static_cast<llvm::DiagnosticPrinterRawOStream*>(Context);
    diag_info.print(*printer);
  };
  llvm_context.setDiagnosticHandlerCallBack(DiagnosticHandler, &printer);

  GpuDeviceInfo gpu_device_info;
  gpu_device_info.threads_per_block_limit =
@@ -724,34 +839,16 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(

  llvm_ir::DumpIrIfEnabled(*module, *llvm_module, /*optimized=*/false);

  {
    XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - Running LLVM verifier");

    std::string err;
    llvm::raw_string_ostream err_stream(err);

    // verifyModule() returns true if the module is broken.
    TF_RET_CHECK(!llvm::verifyModule(*llvm_module, &err_stream))
        << "Invalid LLVM IR before optimizations:\n"
        << err_stream.str()
        << "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. "
           "Rerun with --xla_dump_to to get the IR and look for files with "
           "name containing: *"
        << FilenameFor(*module, "", "") << "*";
  }

  GpuVersion gpu_version = GetGpuVersion(stream_exec);

  using BackendCompileResult = std::pair<std::string, std::vector<uint8>>;
  TF_ASSIGN_OR_RETURN(BackendCompileResult backend_result,
                      CompileTargetBinary(module.get(), llvm_module.get(),
                                          gpu_version, stream_exec));
                      CompileToTargetBinary(*module, std::move(llvm_module),
                                            stream_exec, options));
  if (DumpingEnabledForHloModule(*module)) {
    DumpToFileInDirOrStdout(*module, "", "thunk_schedule",
                            thunk_schedule->ToString());
  }

  GpuVersion gpu_version = GetGpuVersion(stream_exec);
  auto* gpu_executable = new GpuExecutable(
      backend_result.first, backend_result.second, gpu_version,
      std::move(thunk_schedule), std::move(module),
@@ -83,8 +83,8 @@ class GpuCompiler : public LLVMCompiler {

  virtual StatusOr<std::pair<std::string, std::vector<uint8>>>
  CompileTargetBinary(const HloModule* hlo_module, llvm::Module* llvm_module,
                      GpuVersion gpu_version,
                      se::StreamExecutor* stream_exec) = 0;
                      GpuVersion gpu_version, se::StreamExecutor* stream_exec,
                      bool relocatable) = 0;

  Status PrepareHloModuleForIrEmitting(HloModule* hlo_module);

@@ -96,6 +96,10 @@ class GpuCompiler : public LLVMCompiler {
  CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
                     AotCompilationOptions const& options) override;

  StatusOr<std::pair<std::string, std::vector<uint8>>> CompileToTargetBinary(
      const HloModule& module, std::unique_ptr<llvm::Module> llvm_module,
      se::StreamExecutor* stream_exec, const CompileOptions& options);

  se::Platform::Id PlatformId() const override { return platform_id_; }

  HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override {
@@ -115,6 +119,12 @@ class GpuCompiler : public LLVMCompiler {
  }

 private:
  virtual StatusOr<std::vector<uint8>> LinkModules(
      se::StreamExecutor* stream_exec,
      std::vector<std::vector<uint8>> modules) {
    return Unimplemented("LinkModules is not implemented.");
  }

  se::Platform::Id platform_id_;

  // The triple that represents our target.
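Together these two hooks define the contract a concrete backend implements after this change: CompileTargetBinary gains the relocatable flag, and overriding LinkModules is what opts the backend into the parallel path (the default Unimplemented answer doubles as the capability probe used by CompileToTargetBinary). A schematic subclass under assumed names; constructors and the other overrides are omitted:

class MyGpuCompiler : public GpuCompiler {  // hypothetical backend
 public:
  // Must now honor `relocatable`, e.g. by asking the device assembler for a
  // linkable object instead of a fully linked image.
  StatusOr<std::pair<std::string, std::vector<uint8>>> CompileTargetBinary(
      const HloModule* hlo_module, llvm::Module* llvm_module,
      GpuVersion gpu_version, se::StreamExecutor* stream_exec,
      bool relocatable) override;

 private:
  // Omit this override (or keep returning Unimplemented) and GpuCompiler
  // silently falls back to compiling the whole module on one thread.
  StatusOr<std::vector<uint8>> LinkModules(
      se::StreamExecutor* stream_exec,
      std::vector<std::vector<uint8>> modules) override;
};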
@@ -51,6 +51,7 @@ limitations under the License.
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"

namespace xla {
namespace gpu {
@@ -299,7 +300,8 @@ StatusOr<std::pair<std::string, std::vector<uint8>>>
NVPTXCompiler::CompileTargetBinary(const HloModule* module,
                                   llvm::Module* llvm_module,
                                   GpuVersion gpu_version,
                                   se::StreamExecutor* stream_exec) {
                                   se::StreamExecutor* stream_exec,
                                   bool relocatable) {
  std::pair<int, int> compute_capability =
      absl::get<std::pair<int, int>>(gpu_version);

@@ -338,7 +340,7 @@ NVPTXCompiler::CompileTargetBinary(const HloModule* module,

  std::vector<uint8> cubin = CompileGpuAsmOrGetCachedResult(
      stream_exec, ptx, compute_capability.first, compute_capability.second,
      module->config());
      module->config(), relocatable);

  return std::pair<std::string, std::vector<uint8>>(std::move(ptx),
                                                    std::move(cubin));
@@ -346,7 +348,7 @@ NVPTXCompiler::CompileTargetBinary(const HloModule* module,

std::vector<uint8> NVPTXCompiler::CompileGpuAsmOrGetCachedResult(
    se::StreamExecutor* stream_exec, const string& ptx, int cc_major,
    int cc_minor, const HloModuleConfig& hlo_module_config) {
    int cc_minor, const HloModuleConfig& hlo_module_config, bool relocatable) {
  XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::CompileGpuAsmOrGetCachedResult");
  tensorflow::profiler::TraceMe activity(
      "PTX->CUBIN", tensorflow::profiler::TraceMeLevel::kInfo);
@@ -361,7 +363,7 @@ std::vector<uint8> NVPTXCompiler::CompileGpuAsmOrGetCachedResult(
    tensorflow::mutex_lock lock(mutex_);
    std::tie(iter, inserted) = compilation_cache_.emplace(
        std::piecewise_construct,
        std::forward_as_tuple(ptx, cc_major, cc_minor),
        std::forward_as_tuple(ptx, cc_major, cc_minor, relocatable),
        std::forward_as_tuple());
    cache_ptx = &iter->first.ptx;
    cache_value = &iter->second;
@@ -375,9 +377,13 @@ std::vector<uint8> NVPTXCompiler::CompileGpuAsmOrGetCachedResult(
  if (inserted) {
    CHECK(!cache_value->compilation_done);
    if (!ptx.empty()) {
      StatusOr<std::vector<uint8>> maybe_cubin =
          se::CompileGpuAsm(stream_exec->device_ordinal(), cache_ptx->c_str(),
                            PtxOptsFromConfig(hlo_module_config));
      auto ptxas_config = PtxOptsFromConfig(hlo_module_config);
      if (relocatable) {
        ptxas_config.extra_flags.push_back("-c");
      }
      StatusOr<std::vector<uint8>> maybe_cubin = se::CompileGpuAsm(
          stream_exec->device_ordinal(), cache_ptx->c_str(), ptxas_config);

      if (maybe_cubin.ok()) {
        cache_value->cubin_data = std::move(maybe_cubin).ValueOrDie();
        VLOG(2) << "Compiled PTX size:" << ptx.size()
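The relocatable variant differs from the whole-program path only in the extra "-c" flag handed to ptxas, the device-code analogue of a host compiler's -c: keep relocations so the pieces can be linked afterwards. A standalone sketch of that branch, assuming se::GpuAsmOpts is the type PtxOptsFromConfig returns:

StatusOr<std::vector<uint8>> CompilePtx(se::StreamExecutor* stream_exec,
                                        const string& ptx,
                                        se::GpuAsmOpts ptxas_config,
                                        bool relocatable) {
  if (relocatable) {
    // Emit a linkable cubin instead of a fully linked image.
    ptxas_config.extra_flags.push_back("-c");
  }
  return se::CompileGpuAsm(stream_exec->device_ordinal(), ptx.c_str(),
                           ptxas_config);
}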
@@ -445,5 +451,17 @@ std::vector<uint8> NVPTXCompiler::CompileGpuAsmOrGetCachedResult(
  return cache_value->cubin_data;
}

StatusOr<std::vector<uint8>> NVPTXCompiler::LinkModules(
    se::StreamExecutor* stream_exec, std::vector<std::vector<uint8>> modules) {
  std::vector<stream_executor::CubinOrPTXImage> images;
  images.reserve(modules.size());
  for (auto& module : modules) {
    images.push_back({"", std::move(module)});
  }
  return LinkGpuAsm(static_cast<se::gpu::GpuContext*>(
                        stream_exec->implementation()->GpuContextHack()),
                    images);
}

}  // namespace gpu
}  // namespace xla
@@ -52,9 +52,14 @@ class NVPTXCompiler : public GpuCompiler {

  StatusOr<std::pair<std::string, std::vector<uint8>>> CompileTargetBinary(
      const HloModule* hlo_module, llvm::Module* llvm_module,
      GpuVersion gpu_version, se::StreamExecutor* stream_exec) override;
      GpuVersion gpu_version, se::StreamExecutor* stream_exec,
      bool relocatable) override;

 private:
  StatusOr<std::vector<uint8>> LinkModules(
      se::StreamExecutor* stream_exec,
      std::vector<std::vector<uint8>> modules) override;

  tensorflow::mutex mutex_;

  // When compiling an HLO module, we need to find a path to the nvvm libdevice
@@ -71,7 +76,7 @@ class NVPTXCompiler : public GpuCompiler {
  // compiled cubin. If compilation was unsuccessful, returns an empty vector.
  std::vector<uint8> CompileGpuAsmOrGetCachedResult(
      se::StreamExecutor* stream_exec, const string& ptx, int cc_major,
      int cc_minor, const HloModuleConfig& hlo_module_config);
      int cc_minor, const HloModuleConfig& hlo_module_config, bool relocatable);

  // The compilation_cache_ map is a cache from {ptx string, cc_major, cc_minor}
  // -> cubin so we don't recompile the same ptx twice. This is important for
@@ -86,24 +91,32 @@ class NVPTXCompiler : public GpuCompiler {
  // If compiling the ptx fails, we return an empty cubin, cross our fingers,
  // and leave compilation up to the driver.
  struct CompilationCacheKey {
    CompilationCacheKey(std::string ptx, int cc_major, int cc_minor)
        : ptx(std::move(ptx)), cc_major(cc_major), cc_minor(cc_minor) {}
    CompilationCacheKey(std::string ptx, int cc_major, int cc_minor,
                        bool relocatable)
        : ptx(std::move(ptx)),
          cc_major(cc_major),
          cc_minor(cc_minor),
          relocatable(relocatable) {}
    string ptx;
    int cc_major;
    int cc_minor;
    bool relocatable;
  };
  struct CompilationCacheHash {
    size_t operator()(const CompilationCacheKey& key) const {
      return tensorflow::Hash64Combine(
          tensorflow::Hash64Combine(tensorflow::Hash64(key.ptx), key.cc_major),
          key.cc_minor);
          tensorflow::Hash64Combine(
              tensorflow::Hash64Combine(tensorflow::Hash64(key.ptx),
                                        key.cc_major),
              key.cc_minor),
          key.relocatable);
    }
  };
  struct CompilationCacheEq {
    size_t operator()(const CompilationCacheKey& a,
                      const CompilationCacheKey& b) const {
      return a.cc_major == b.cc_major && a.cc_minor == b.cc_minor &&
             a.ptx == b.ptx;
             a.ptx == b.ptx && a.relocatable == b.relocatable;
    }
  };
  struct CompilationCacheValue {
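The new key field matters because the same PTX compiled with and without -c yields different artifacts, so both the hash and the equality predicate must see it; otherwise a relocatable cubin could be handed back to a whole-program request (or vice versa). Illustrative only, since these structs are private members of NVPTXCompiler and `ptx` is assumed to be an existing string:

CompilationCacheKey whole_program(ptx, /*cc_major=*/7, /*cc_minor=*/0,
                                  /*relocatable=*/false);
CompilationCacheKey relocatable(ptx, /*cc_major=*/7, /*cc_minor=*/0,
                                /*relocatable=*/true);
// Different cache slots: unequal under CompilationCacheEq, and (with
// overwhelming probability) different CompilationCacheHash values.
CHECK(!CompilationCacheEq()(whole_program, relocatable));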
@@ -190,11 +190,12 @@ LocalService::CompileExecutables(
  // single partition computations are built using `BuildExecutables`, fix it,
  // and remove this special case (provided the performance is similar).
  if (build_options.num_partitions() == 1) {
    TF_ASSIGN_OR_RETURN(
        std::unique_ptr<Executable> executable,
        BuildExecutable(proto, std::move(module_config), execute_backend_.get(),
                        executor, build_options.device_allocator(),
                        build_options.run_backend_only()));
    TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
                        BuildExecutable(proto, std::move(module_config),
                                        execute_backend_.get(), executor,
                                        {build_options.device_allocator(),
                                         build_options.compile_thread_pool()},
                                        build_options.run_backend_only()));
    std::vector<std::unique_ptr<Executable>> executables;
    executables.push_back(std::move(executable));
    return executables;
@@ -206,10 +207,12 @@ LocalService::CompileExecutables(
  std::vector<se::StreamExecutor*> executors(build_options.num_partitions(),
                                             executor);

  return BuildExecutables({&proto}, std::move(module_configs),
                          execute_backend_.get(), {executors},
                          build_options.device_allocator(),
                          build_options.run_backend_only());
  return BuildExecutables(
      /*module_protos=*/{&proto}, std::move(module_configs),
      execute_backend_.get(), {executors},
      Compiler::CompileOptions{build_options.device_allocator(),
                               build_options.compile_thread_pool()},
      build_options.run_backend_only());
}
}
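The brace initializers at these call sites are plain aggregate initialization of Compiler::CompileOptions: first the allocator, then (optionally) the thread pool. When the second element is omitted, as in the service.cc call sites below, thread_pool stays nullptr and the GPU backend keeps its serial path. Schematically:

// Equivalent spellings of the options LocalService forwards above.
Compiler::CompileOptions with_pool{build_options.device_allocator(),
                                   build_options.compile_thread_pool()};
Compiler::CompileOptions serial_only{/*device_allocator=*/nullptr};
// serial_only.thread_pool == nullptr, so backend compilation stays
// single-threaded.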
@@ -357,7 +357,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
    const std::vector<const HloModuleProto*>& module_protos,
    std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
    Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
    se::DeviceMemoryAllocator* device_allocator, bool run_backend_only) {
    const Compiler::CompileOptions& options, bool run_backend_only) {
  VLOG(1) << StrFormat("BuildExecutable on service %p", this);

  // Dump computation proto state if flag is set.
@@ -387,17 +387,15 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(

  std::vector<std::unique_ptr<Executable>> executables;
  if (!run_backend_only) {
    TF_ASSIGN_OR_RETURN(
        executables,
        backend->compiler()->Compile(std::move(module_group),
                                     std::move(executors), device_allocator));
    TF_ASSIGN_OR_RETURN(executables, backend->compiler()->Compile(
                                         std::move(module_group),
                                         std::move(executors), options));
  } else {
    auto modules = module_group->ConsumeModules();
    for (std::unique_ptr<HloModule>& module : modules) {
      TF_ASSIGN_OR_RETURN(
          std::unique_ptr<Executable> executable,
          backend->compiler()->RunBackend(std::move(module), executors[0][0],
                                          device_allocator));
      TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
                          backend->compiler()->RunBackend(
                              std::move(module), executors[0][0], options));
      executables.push_back(std::move(executable));
    }
  }
@@ -710,7 +708,7 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
  TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<Executable>> executables,
                      BuildExecutables(module_protos, std::move(module_configs),
                                       execute_backend_.get(), all_executors,
                                       /*device_allocator=*/nullptr));
                                       {/*device_allocator=*/nullptr}));
  std::vector<Executable*> executable_ptrs;
  executable_ptrs.reserve(executables.size());
  for (const auto& executable : executables) {
@@ -810,7 +808,7 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
    const HloModuleProto& module_proto,
    std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
    se::StreamExecutor* executor, se::DeviceMemoryAllocator* device_allocator,
    se::StreamExecutor* executor, const Compiler::CompileOptions& options,
    bool run_backend_only) {
  VLOG(1) << StrFormat(
      "BuildExecutable on service %p with serialized module proto: %s", this,
@@ -822,14 +820,13 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
  DumpHloModuleIfEnabled(*module, kBeforeOptimizationsDumpName);

  if (!run_backend_only) {
    TF_ASSIGN_OR_RETURN(
        module, backend->compiler()->RunHloPasses(std::move(module), executor,
                                                  device_allocator));
    TF_ASSIGN_OR_RETURN(module, backend->compiler()->RunHloPasses(
                                    std::move(module), executor, options));
  }

  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
                      backend->compiler()->RunBackend(
                          std::move(module), executor, device_allocator));
  TF_ASSIGN_OR_RETURN(
      std::unique_ptr<Executable> executable,
      backend->compiler()->RunBackend(std::move(module), executor, options));

  const auto& debug_opts = module_config->debug_options();
  if (DumpingEnabledForHloModule(module_proto.name(), debug_opts) &&
@@ -875,7 +872,7 @@ Status Service::Compile(const CompileRequest* arg, CompileResponse* result) {
      BuildExecutable(arg->computation(), std::move(module_config),
                      execute_backend_.get(),
                      execute_backend_->default_stream_executor(),
                      /*device_allocator=*/nullptr));
                      {/*device_allocator=*/nullptr}));

  *result->mutable_handle() = compilation_cache_.Insert(std::move(executable));
@@ -235,8 +235,7 @@ class Service : public ServiceInterface {
  StatusOr<std::unique_ptr<Executable>> BuildExecutable(
      const HloModuleProto& module_proto,
      std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
      se::StreamExecutor* executor,
      se::DeviceMemoryAllocator* device_allocator = nullptr,
      se::StreamExecutor* executor, const Compiler::CompileOptions& options,
      bool run_backend_only = false);

  // Same as BuildExecutable() above, but builds a list of Executables for the
@@ -245,8 +244,7 @@ class Service : public ServiceInterface {
      const std::vector<const HloModuleProto*>& module_protos,
      std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
      Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
      se::DeviceMemoryAllocator* device_allocator,
      bool run_backend_only = false);
      const Compiler::CompileOptions& options, bool run_backend_only = false);

  // Runs the given executable with the given arguments and register the result
  // in the allocation tracker. The handle of the result from the tracker is
@@ -57,7 +57,8 @@ class GpuDummyCompiler : public GpuCompiler {

  StatusOr<std::pair<std::string, std::vector<uint8>>> CompileTargetBinary(
      const HloModule* hlo_module, llvm::Module* llvm_module,
      GpuVersion gpu_version, se::StreamExecutor* stream_exec) {
      GpuVersion gpu_version, se::StreamExecutor* stream_exec,
      bool relocatable) {
    if (user_post_optimization_hook_) {
      user_post_optimization_hook_(*llvm_module);
    }
@@ -598,6 +598,16 @@ cc_library(
    ]),
)

cc_library(
    name = "cuda_asm_compiler",
    srcs = if_cuda_is_configured(["cuda_asm_compiler.cc"]),
    deps = if_cuda_is_configured([
        "//tensorflow/core:lib_proto_parsing",
        "//tensorflow/stream_executor/gpu:asm_compiler",
        "//tensorflow/stream_executor/gpu:gpu_driver_header",
    ]),
)

cc_library(
    name = "cuda_gpu_executor",
    srcs = if_cuda_is_configured(["cuda_gpu_executor.cc"]),
@@ -611,6 +621,7 @@ cc_library(
        ":cuda_platform_id",
        ":cuda_stream",
        ":cuda_timer",
        ":cuda_asm_compiler",
        "@com_google_absl//absl/strings",
        "//tensorflow/stream_executor:event",
        "//tensorflow/stream_executor:plugin_registry",
tensorflow/stream_executor/cuda/cuda_asm_compiler.cc (new file, 55 lines)
@@ -0,0 +1,55 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"

namespace stream_executor {

#define RETURN_IF_CUDA_ERROR(expr)                                            \
  do {                                                                        \
    CUresult _status = expr;                                                  \
    if (!SE_PREDICT_TRUE(_status == CUDA_SUCCESS)) {                          \
      const char* error_string;                                               \
      cuGetErrorString(_status, &error_string);                               \
      std::ostringstream oss;                                                 \
      oss << error_string << "\nin " << __FILE__ << "(" << __LINE__ << "): '" \
          << #expr << "'";                                                    \
      return port::Status(port::error::UNKNOWN, oss.str().c_str());           \
    }                                                                         \
  } while (false)

port::StatusOr<std::vector<uint8>> LinkGpuAsm(
    gpu::GpuContext* context, std::vector<CubinOrPTXImage> images) {
  gpu::ScopedActivateContext activation(context);

  CUlinkState link_state;
  RETURN_IF_CUDA_ERROR(cuLinkCreate(0, nullptr, nullptr, &link_state));
  for (auto& image : images) {
    RETURN_IF_CUDA_ERROR(cuLinkAddData(
        link_state, CU_JIT_INPUT_CUBIN, static_cast<void*>(image.bytes.data()),
        image.bytes.size(), "", 0, nullptr, nullptr));
  }
  void* cubin_out;
  size_t cubin_size;
  RETURN_IF_CUDA_ERROR(cuLinkComplete(link_state, &cubin_out, &cubin_size));
  std::vector<uint8> cubin(static_cast<uint8*>(cubin_out),
                           static_cast<uint8*>(cubin_out) + cubin_size);
  RETURN_IF_CUDA_ERROR(cuLinkDestroy(link_state));
  return std::move(cubin);
}

}  // namespace stream_executor
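End to end, the new pieces compose as "ptxas -c per submodule, then a driver-level link". A hedged sketch in the same namespace as the file above (asm_compiler.h and gpu_driver.h included); GpuAsmOpts is assumed to be the options type CompileGpuAsm accepts, and executor, ptx_a, ptx_b are assumed inputs:

port::StatusOr<std::vector<uint8>> CompileAndLinkTwoSnippets(
    StreamExecutor* executor, const std::string& ptx_a,
    const std::string& ptx_b) {
  GpuAsmOpts opts;
  opts.extra_flags.push_back("-c");  // relocatable device code

  std::vector<CubinOrPTXImage> images;
  for (const std::string* ptx : {&ptx_a, &ptx_b}) {
    port::StatusOr<std::vector<uint8>> cubin =
        CompileGpuAsm(executor->device_ordinal(), ptx->c_str(), opts);
    if (!cubin.ok()) return cubin.status();
    images.push_back({"", std::move(cubin).ValueOrDie()});
  }

  // Same context plumbing as NVPTXCompiler::LinkModules above.
  return LinkGpuAsm(static_cast<gpu::GpuContext*>(
                        executor->implementation()->GpuContextHack()),
                    std::move(images));
}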
@@ -24,6 +24,9 @@ limitations under the License.
#include "tensorflow/stream_executor/platform/port.h"

namespace stream_executor {
namespace gpu {
class GpuContext;
}

// Compiles the given PTX string using ptxas and returns the resulting machine
// code (i.e. a cubin) as a byte array. The generated cubin matches the compute
@@ -72,6 +75,11 @@ struct HsacoImage {
port::StatusOr<std::vector<uint8>> BundleGpuAsm(
    std::vector<HsacoImage> images, const std::string rocm_root_dir);

// Links multiple relocatable GPU images (e.g. results of ptxas -c) into a
// single image.
port::StatusOr<std::vector<uint8>> LinkGpuAsm(
    gpu::GpuContext* context, std::vector<CubinOrPTXImage> images);

}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_