[XLA/GPU] Add parallel compilation for LLVM and ptxas.

PiperOrigin-RevId: 346340049
Change-Id: I09a23a345d60390677167b6f67695912e3aa419e
commit 05baa49b08
parent ba81a9f785
Author: Tim Shen (committed by TensorFlower Gardener)
Date: 2020-12-08 09:15:47 -08:00
16 changed files with 317 additions and 81 deletions
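Editor's note: a minimal, hedged usage sketch of the new option — not part of the commit. It assumes the corrected setter name `set_compile_thread_pool` shown below and the usual header path for `ExecutableBuildOptions`; the helper name is invented for illustration.

#include "tensorflow/compiler/xla/client/executable_build_options.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/threadpool.h"

// Builds ExecutableBuildOptions that request parallel device compilation.
// A null pool keeps the existing single-threaded path; a non-null pool lets
// the GPU backend split the LLVM module and run LLVM/ptxas in parallel.
// The pool is not owned and must outlive compilation.
xla::ExecutableBuildOptions MakeParallelBuildOptions(
    tensorflow::thread::ThreadPool* pool) {
  xla::ExecutableBuildOptions options;
  options.set_compile_thread_pool(pool);
  return options;
}

// Example caller-side setup (assumed, for illustration only):
//   tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(),
//                                       "xla_gpu_compile", /*num_threads=*/8);
//   auto options = MakeParallelBuildOptions(&pool);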

View File

@ -115,6 +115,16 @@ class ExecutableBuildOptions {
return *this;
}
// Thread pool for parallel compilation.
tensorflow::thread::ThreadPool* compile_thread_pool() const {
return compile_thread_pool_;
}
ExecutableBuildOptions& set_compile_thread_pool(
tensorflow::thread::ThreadPool* compile_thread_pool) {
compile_thread_pool_ = compile_thread_pool;
return *this;
}
private:
int device_ordinal_ = -1;
Shape result_layout_;
@ -128,6 +138,7 @@ class ExecutableBuildOptions {
absl::optional<DeviceAssignment> device_assignment_;
bool alias_passthrough_params_ = false;
bool run_backend_only_ = false;
tensorflow::thread::ThreadPool* compile_thread_pool_ = nullptr;
};
} // namespace xla

View File

@ -42,6 +42,7 @@ limitations under the License.
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/threadpool.h"
namespace xla {
@ -165,6 +166,9 @@ class Compiler {
// given algorithm over those buffers, to see which variant is fastest. Any
// space allocated will be deallocated before the compilation returns.
se::DeviceMemoryAllocator* device_allocator = nullptr;
// An optional thread pool for parallel compilation.
tensorflow::thread::ThreadPool* thread_pool = nullptr;
};
virtual ~Compiler() {}
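Editor's note: a minimal sketch of how the widened CompileOptions is meant to be populated, mirroring the LocalService call sites further down in this change; the helper name is invented for illustration.

#include "tensorflow/compiler/xla/service/compiler.h"

// Both fields may be null: a null allocator falls back to the platform's
// default allocator, and a null thread pool keeps compilation single-threaded.
xla::Compiler::CompileOptions MakeCompileOptions(
    stream_executor::DeviceMemoryAllocator* allocator,
    tensorflow::thread::ThreadPool* pool) {
  return xla::Compiler::CompileOptions{allocator, pool};
}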

View File

@ -1452,7 +1452,11 @@ cc_library(
"//tensorflow/stream_executor:stream_executor_headers",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings",
"@llvm-project//llvm:AsmParser",
"@llvm-project//llvm:BitReader",
"@llvm-project//llvm:BitWriter",
"@llvm-project//llvm:Core",
"@llvm-project//llvm:TransformUtils",
"@llvm-project//mlir:AllPassesAndDialectsNoRegistration",
"@llvm-project//mlir:IR",
],
@ -1517,7 +1521,7 @@ cc_library(
"//tensorflow/stream_executor:stream_executor_headers",
"//tensorflow/stream_executor/cuda:cuda_diagnostics",
"//tensorflow/stream_executor/gpu:asm_compiler",
]),
]) + ["//tensorflow/stream_executor/gpu:gpu_driver_header"],
)
cc_library(

View File

@ -108,12 +108,17 @@ StatusOr<std::pair<std::string, std::vector<uint8>>>
AMDGPUCompiler::CompileTargetBinary(const HloModule* module,
llvm::Module* llvm_module,
GpuVersion gpu_version,
se::StreamExecutor* stream_exec) {
se::StreamExecutor* stream_exec,
bool relocatable) {
if (rocdl_dir_.empty()) {
// Compute rocdl_dir_ just once and cache it in this member.
rocdl_dir_ = GetROCDLDir(module->config());
}
if (relocatable) {
return Unimplemented("relocatable target binary is not implemented");
}
std::vector<uint8> hsaco;
{
XLA_SCOPED_LOGGING_TIMER(

View File

@ -41,7 +41,8 @@ class AMDGPUCompiler : public GpuCompiler {
StatusOr<std::pair<std::string, std::vector<uint8>>> CompileTargetBinary(
const HloModule* hlo_module, llvm::Module* llvm_module,
GpuVersion gpu_version, se::StreamExecutor* stream_exec) override;
GpuVersion gpu_version, se::StreamExecutor* stream_exec,
bool relocatable) override;
private:
// The parent directory of ROCm-Device-Libs IR libraries.

View File

@ -24,11 +24,15 @@ limitations under the License.
#include "absl/memory/memory.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Transforms/Utils/SplitModule.h"
#include "mlir/IR/BuiltinOps.h" // from @llvm-project
#include "mlir/InitAllDialects.h" // from @llvm-project
#include "tensorflow/compiler/xla/protobuf_util.h"
@ -114,11 +118,13 @@ limitations under the License.
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/blocking_counter.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/regexp.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/subprocess.h"
#include "tensorflow/core/platform/threadpool.h"
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/core/util/env_var.h"
@ -641,6 +647,124 @@ static Status CompileModuleToLlvmIrImpl(
return Status::OK();
}
StatusOr<std::pair<std::string, std::vector<uint8>>>
GpuCompiler::CompileToTargetBinary(const HloModule& module,
std::unique_ptr<llvm::Module> llvm_module,
se::StreamExecutor* stream_exec,
const CompileOptions& options) {
using BackendCompileResult = std::pair<std::string, std::vector<uint8>>;
const auto compile_single_module =
[this, stream_exec, &module](
llvm::Module* llvm_module,
bool relocatable) -> StatusOr<BackendCompileResult> {
{
XLA_SCOPED_LOGGING_TIMER(
"GpuCompiler::RunBackend - Running LLVM verifier");
std::string err;
llvm::raw_string_ostream err_stream(err);
// verifyModule() returns true if the module is broken.
TF_RET_CHECK(!llvm::verifyModule(*llvm_module, &err_stream))
<< "Invalid LLVM IR before optimizations:\n"
<< err_stream.str()
<< "\nThis probably indicates a bug in the HLO -> LLVM IR "
"lowering. "
"Rerun with --xla_dump_to to get the IR and looks for files "
"with "
"name containing: *"
<< FilenameFor(module, "", "") << "*";
}
GpuVersion gpu_version = GetGpuVersion(stream_exec);
return CompileTargetBinary(&module, llvm_module, gpu_version, stream_exec,
relocatable);
};
tensorflow::thread::ThreadPool* thread_pool = options.thread_pool;
if (!thread_pool) {
return compile_single_module(llvm_module.get(), /*relocatable=*/false);
}
// Test whether LinkModules is supported.
if (this->LinkModules(stream_exec, {}).status().code() ==
tensorflow::error::Code::UNIMPLEMENTED) {
return compile_single_module(llvm_module.get(), /*relocatable=*/false);
}
std::vector<std::unique_ptr<llvm::Module>> llvm_modules;
int num_functions = 0;
for (llvm::Function& func : llvm_module->functions()) {
if (!func.isDeclaration() &&
func.getLinkage() == llvm::GlobalValue::LinkageTypes::ExternalLinkage) {
num_functions++;
}
}
llvm::SplitModule(
std::move(llvm_module),
std::max<unsigned>(
1, std::min<unsigned>(thread_pool->NumThreads(), num_functions)),
[&](std::unique_ptr<llvm::Module> module) {
llvm_modules.push_back(std::move(module));
},
/*PreserveLocals=*/true);
std::vector<StatusOr<BackendCompileResult>> compile_results(
llvm_modules.size());
tensorflow::BlockingCounter counter(llvm_modules.size());
for (int i = 0; i < llvm_modules.size(); i++) {
thread_pool->Schedule([&compile_results, compile_single_module, i,
&llvm_modules, &counter] {
llvm::Module* original_module = llvm_modules[i].get();
llvm::LLVMContext context;
std::string buffer;
llvm::raw_string_ostream error(buffer);
llvm::DiagnosticPrinterRawOStream printer(error);
auto DiagnosticHandler = [](const llvm::DiagnosticInfo& diag_info,
void* Context) {
auto printer = static_cast<llvm::DiagnosticPrinterRawOStream*>(Context);
diag_info.print(*printer);
};
context.setDiagnosticHandlerCallBack(DiagnosticHandler, &printer);
std::unique_ptr<llvm::Module> new_llvm_module;
{
std::string ir;
{
llvm::raw_string_ostream os(ir);
original_module->print(os, nullptr);
}
llvm::SMDiagnostic err;
new_llvm_module = llvm::parseAssemblyString(ir, err, context);
}
compile_results[i] =
compile_single_module(new_llvm_module.get(), /*relocatable=*/true);
counter.DecrementCount();
});
}
counter.Wait();
std::string ptx_snippets;
std::vector<std::vector<uint8>> submodule_compile_results;
for (auto& maybe_result : compile_results) {
TF_ASSIGN_OR_RETURN(auto result, maybe_result);
if (result.second.empty()) {
continue;
}
ptx_snippets += result.first;
ptx_snippets += "\n";
submodule_compile_results.push_back(result.second);
}
TF_ASSIGN_OR_RETURN(
std::vector<uint8> backend_result,
this->LinkModules(stream_exec, std::move(submodule_compile_results)));
return std::make_pair(ptx_snippets, backend_result);
}
StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
const CompileOptions& options) {
@ -650,15 +774,6 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
TF_RET_CHECK(stream_exec != nullptr);
llvm::LLVMContext llvm_context;
std::string buffer;
llvm::raw_string_ostream error(buffer);
llvm::DiagnosticPrinterRawOStream printer(error);
auto DiagnosticHandler = [](const llvm::DiagnosticInfo& diag_info,
void* Context) {
auto printer = static_cast<llvm::DiagnosticPrinterRawOStream*>(Context);
diag_info.print(*printer);
};
llvm_context.setDiagnosticHandlerCallBack(DiagnosticHandler, &printer);
GpuDeviceInfo gpu_device_info;
gpu_device_info.threads_per_block_limit =
@ -724,34 +839,16 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
llvm_ir::DumpIrIfEnabled(*module, *llvm_module, /*optimized=*/false);
{
XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - Running LLVM verifier");
std::string err;
llvm::raw_string_ostream err_stream(err);
// verifyModule() returns true if the module is broken.
TF_RET_CHECK(!llvm::verifyModule(*llvm_module, &err_stream))
<< "Invalid LLVM IR before optimizations:\n"
<< err_stream.str()
<< "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. "
"Rerun with --xla_dump_to to get the IR and looks for files with "
"name containing: *"
<< FilenameFor(*module, "", "") << "*";
}
GpuVersion gpu_version = GetGpuVersion(stream_exec);
using BackendCompileResult = std::pair<std::string, std::vector<uint8>>;
TF_ASSIGN_OR_RETURN(BackendCompileResult backend_result,
CompileTargetBinary(module.get(), llvm_module.get(),
gpu_version, stream_exec));
CompileToTargetBinary(*module, std::move(llvm_module),
stream_exec, options));
if (DumpingEnabledForHloModule(*module)) {
DumpToFileInDirOrStdout(*module, "", "thunk_schedule",
thunk_schedule->ToString());
}
GpuVersion gpu_version = GetGpuVersion(stream_exec);
auto* gpu_executable = new GpuExecutable(
backend_result.first, backend_result.second, gpu_version,
std::move(thunk_schedule), std::move(module),

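Editor's note on a design point in CompileToTargetBinary above: llvm::LLVMContext is not thread-safe, so each scheduled task round-trips its split module through textual IR into a private context before invoking the backend. A standalone sketch of that cloning idiom (the helper name is invented for illustration, not an API in this change):

#include <memory>
#include <string>

#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"

// Clones `src` into `ctx` by printing it to textual IR and re-parsing it.
// Returns nullptr if the IR fails to parse.
std::unique_ptr<llvm::Module> CloneIntoContext(const llvm::Module& src,
                                               llvm::LLVMContext& ctx) {
  std::string ir;
  llvm::raw_string_ostream os(ir);
  src.print(os, /*AAW=*/nullptr);
  os.flush();
  llvm::SMDiagnostic err;
  return llvm::parseAssemblyString(ir, err, ctx);
}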
View File

@ -83,8 +83,8 @@ class GpuCompiler : public LLVMCompiler {
virtual StatusOr<std::pair<std::string, std::vector<uint8>>>
CompileTargetBinary(const HloModule* hlo_module, llvm::Module* llvm_module,
GpuVersion gpu_version,
se::StreamExecutor* stream_exec) = 0;
GpuVersion gpu_version, se::StreamExecutor* stream_exec,
bool relocatable) = 0;
Status PrepareHloModuleForIrEmitting(HloModule* hlo_module);
@ -96,6 +96,10 @@ class GpuCompiler : public LLVMCompiler {
CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
AotCompilationOptions const& options) override;
StatusOr<std::pair<std::string, std::vector<uint8>>> CompileToTargetBinary(
const HloModule& module, std::unique_ptr<llvm::Module> llvm_module,
se::StreamExecutor* stream_exec, const CompileOptions& options);
se::Platform::Id PlatformId() const override { return platform_id_; }
HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override {
@ -115,6 +119,12 @@ class GpuCompiler : public LLVMCompiler {
}
private:
virtual StatusOr<std::vector<uint8>> LinkModules(
se::StreamExecutor* stream_exec,
std::vector<std::vector<uint8>> modules) {
return Unimplemented("LinkModules is not implemented.");
}
se::Platform::Id platform_id_;
// The triple that represents our target.
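Editor's note: the probe at the top of CompileToTargetBinary means a backend opts in to module splitting simply by overriding LinkModules; the default above returns UNIMPLEMENTED, which preserves the single-module path. A hedged sketch of a hypothetical backend doing so (MyGpuCompiler and LinkImagesForMyTarget are invented names, not part of this change):

#include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h"

namespace xla {
namespace gpu {

// Hypothetical target-specific linker, declared only to keep the sketch
// self-contained.
StatusOr<std::vector<uint8>> LinkImagesForMyTarget(
    se::StreamExecutor* stream_exec, std::vector<std::vector<uint8>> modules);

// Still abstract (CompileTargetBinary etc. are not shown); the point is only
// the LinkModules override that enables the parallel compilation path.
class MyGpuCompiler : public GpuCompiler {
 public:
  using GpuCompiler::GpuCompiler;

 private:
  StatusOr<std::vector<uint8>> LinkModules(
      se::StreamExecutor* stream_exec,
      std::vector<std::vector<uint8>> modules) override {
    return LinkImagesForMyTarget(stream_exec, std::move(modules));
  }
};

}  // namespace gpu
}  // namespace xla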

View File

@ -51,6 +51,7 @@ limitations under the License.
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
namespace xla {
namespace gpu {
@ -299,7 +300,8 @@ StatusOr<std::pair<std::string, std::vector<uint8>>>
NVPTXCompiler::CompileTargetBinary(const HloModule* module,
llvm::Module* llvm_module,
GpuVersion gpu_version,
se::StreamExecutor* stream_exec) {
se::StreamExecutor* stream_exec,
bool relocatable) {
std::pair<int, int> compute_capability =
absl::get<std::pair<int, int>>(gpu_version);
@ -338,7 +340,7 @@ NVPTXCompiler::CompileTargetBinary(const HloModule* module,
std::vector<uint8> cubin = CompileGpuAsmOrGetCachedResult(
stream_exec, ptx, compute_capability.first, compute_capability.second,
module->config());
module->config(), relocatable);
return std::pair<std::string, std::vector<uint8>>(std::move(ptx),
std::move(cubin));
@ -346,7 +348,7 @@ NVPTXCompiler::CompileTargetBinary(const HloModule* module,
std::vector<uint8> NVPTXCompiler::CompileGpuAsmOrGetCachedResult(
se::StreamExecutor* stream_exec, const string& ptx, int cc_major,
int cc_minor, const HloModuleConfig& hlo_module_config) {
int cc_minor, const HloModuleConfig& hlo_module_config, bool relocatable) {
XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::CompileGpuAsmOrGetCachedResult");
tensorflow::profiler::TraceMe activity(
"PTX->CUBIN", tensorflow::profiler::TraceMeLevel::kInfo);
@ -361,7 +363,7 @@ std::vector<uint8> NVPTXCompiler::CompileGpuAsmOrGetCachedResult(
tensorflow::mutex_lock lock(mutex_);
std::tie(iter, inserted) = compilation_cache_.emplace(
std::piecewise_construct,
std::forward_as_tuple(ptx, cc_major, cc_minor),
std::forward_as_tuple(ptx, cc_major, cc_minor, relocatable),
std::forward_as_tuple());
cache_ptx = &iter->first.ptx;
cache_value = &iter->second;
@ -375,9 +377,13 @@ std::vector<uint8> NVPTXCompiler::CompileGpuAsmOrGetCachedResult(
if (inserted) {
CHECK(!cache_value->compilation_done);
if (!ptx.empty()) {
StatusOr<std::vector<uint8>> maybe_cubin =
se::CompileGpuAsm(stream_exec->device_ordinal(), cache_ptx->c_str(),
PtxOptsFromConfig(hlo_module_config));
auto ptxas_config = PtxOptsFromConfig(hlo_module_config);
if (relocatable) {
ptxas_config.extra_flags.push_back("-c");
}
StatusOr<std::vector<uint8>> maybe_cubin = se::CompileGpuAsm(
stream_exec->device_ordinal(), cache_ptx->c_str(), ptxas_config);
if (maybe_cubin.ok()) {
cache_value->cubin_data = std::move(maybe_cubin).ValueOrDie();
VLOG(2) << "Compiled PTX size:" << ptx.size()
@ -445,5 +451,17 @@ std::vector<uint8> NVPTXCompiler::CompileGpuAsmOrGetCachedResult(
return cache_value->cubin_data;
}
StatusOr<std::vector<uint8>> NVPTXCompiler::LinkModules(
se::StreamExecutor* stream_exec, std::vector<std::vector<uint8>> modules) {
std::vector<stream_executor::CubinOrPTXImage> images;
images.reserve(modules.size());
for (auto& module : modules) {
images.push_back({"", std::move(module)});
}
return LinkGpuAsm(static_cast<se::gpu::GpuContext*>(
stream_exec->implementation()->GpuContextHack()),
images);
}
} // namespace gpu
} // namespace xla

View File

@ -52,9 +52,14 @@ class NVPTXCompiler : public GpuCompiler {
StatusOr<std::pair<std::string, std::vector<uint8>>> CompileTargetBinary(
const HloModule* hlo_module, llvm::Module* llvm_module,
GpuVersion gpu_version, se::StreamExecutor* stream_exec) override;
GpuVersion gpu_version, se::StreamExecutor* stream_exec,
bool relocatable) override;
private:
StatusOr<std::vector<uint8>> LinkModules(
se::StreamExecutor* stream_exec,
std::vector<std::vector<uint8>> modules) override;
tensorflow::mutex mutex_;
// When compiling an HLO module, we need to find a path to the nvvm libdevice
@ -71,7 +76,7 @@ class NVPTXCompiler : public GpuCompiler {
// compiled cubin. If compilation was unsuccessful, returns an empty vector.
std::vector<uint8> CompileGpuAsmOrGetCachedResult(
se::StreamExecutor* stream_exec, const string& ptx, int cc_major,
int cc_minor, const HloModuleConfig& hlo_module_config);
int cc_minor, const HloModuleConfig& hlo_module_config, bool relocatable);
// The compilation_cache_ map is a cache from {ptx string, cc_major, cc_minor}
// -> cubin so we don't recompile the same ptx twice. This is important for
@ -86,24 +91,32 @@ class NVPTXCompiler : public GpuCompiler {
// If compiling the ptx fails, we return an empty cubin, cross our fingers,
// and leave compilation up to the driver.
struct CompilationCacheKey {
CompilationCacheKey(std::string ptx, int cc_major, int cc_minor)
: ptx(std::move(ptx)), cc_major(cc_major), cc_minor(cc_minor) {}
CompilationCacheKey(std::string ptx, int cc_major, int cc_minor,
bool relocatable)
: ptx(std::move(ptx)),
cc_major(cc_major),
cc_minor(cc_minor),
relocatable(relocatable) {}
string ptx;
int cc_major;
int cc_minor;
bool relocatable;
};
struct CompilationCacheHash {
size_t operator()(const CompilationCacheKey& key) const {
return tensorflow::Hash64Combine(
tensorflow::Hash64Combine(tensorflow::Hash64(key.ptx), key.cc_major),
key.cc_minor);
tensorflow::Hash64Combine(
tensorflow::Hash64Combine(tensorflow::Hash64(key.ptx),
key.cc_major),
key.cc_minor),
key.relocatable);
}
};
struct CompilationCacheEq {
size_t operator()(const CompilationCacheKey& a,
const CompilationCacheKey& b) const {
return a.cc_major == b.cc_major && a.cc_minor == b.cc_minor &&
a.ptx == b.ptx;
a.ptx == b.ptx && a.relocatable == b.relocatable;
}
};
struct CompilationCacheValue {

View File

@ -190,11 +190,12 @@ LocalService::CompileExecutables(
// single partition computations are built using `BuildExecutables`, fix it,
// and remove this special case (provided the performance is similar).
if (build_options.num_partitions() == 1) {
TF_ASSIGN_OR_RETURN(
std::unique_ptr<Executable> executable,
BuildExecutable(proto, std::move(module_config), execute_backend_.get(),
executor, build_options.device_allocator(),
build_options.run_backend_only()));
TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
BuildExecutable(proto, std::move(module_config),
execute_backend_.get(), executor,
{build_options.device_allocator(),
build_options.compile_thread_pool()},
build_options.run_backend_only()));
std::vector<std::unique_ptr<Executable>> executables;
executables.push_back(std::move(executable));
return executables;
@ -206,10 +207,12 @@ LocalService::CompileExecutables(
std::vector<se::StreamExecutor*> executors(build_options.num_partitions(),
executor);
return BuildExecutables({&proto}, std::move(module_configs),
execute_backend_.get(), {executors},
build_options.device_allocator(),
build_options.run_backend_only());
return BuildExecutables(
/*module_protos=*/{&proto}, std::move(module_configs),
execute_backend_.get(), {executors},
Compiler::CompileOptions{build_options.device_allocator(),
build_options.compile_thread_pool()},
build_options.run_backend_only());
}
}

View File

@ -357,7 +357,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
const std::vector<const HloModuleProto*>& module_protos,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
se::DeviceMemoryAllocator* device_allocator, bool run_backend_only) {
const Compiler::CompileOptions& options, bool run_backend_only) {
VLOG(1) << StrFormat("BuildExecutable on service %p", this);
// Dump computation proto state if flag is set.
@ -387,17 +387,15 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
std::vector<std::unique_ptr<Executable>> executables;
if (!run_backend_only) {
TF_ASSIGN_OR_RETURN(
executables,
backend->compiler()->Compile(std::move(module_group),
std::move(executors), device_allocator));
TF_ASSIGN_OR_RETURN(executables, backend->compiler()->Compile(
std::move(module_group),
std::move(executors), options));
} else {
auto modules = module_group->ConsumeModules();
for (std::unique_ptr<HloModule>& module : modules) {
TF_ASSIGN_OR_RETURN(
std::unique_ptr<Executable> executable,
backend->compiler()->RunBackend(std::move(module), executors[0][0],
device_allocator));
TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
backend->compiler()->RunBackend(
std::move(module), executors[0][0], options));
executables.push_back(std::move(executable));
}
}
@ -710,7 +708,7 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<Executable>> executables,
BuildExecutables(module_protos, std::move(module_configs),
execute_backend_.get(), all_executors,
/*device_allocator=*/nullptr));
{/*device_allocator=*/nullptr}));
std::vector<Executable*> executable_ptrs;
executable_ptrs.reserve(executables.size());
for (const auto& executable : executables) {
@ -810,7 +808,7 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
const HloModuleProto& module_proto,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
se::StreamExecutor* executor, se::DeviceMemoryAllocator* device_allocator,
se::StreamExecutor* executor, const Compiler::CompileOptions& options,
bool run_backend_only) {
VLOG(1) << StrFormat(
"BuildExecutable on service %p with serialized module proto: %s", this,
@ -822,14 +820,13 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
DumpHloModuleIfEnabled(*module, kBeforeOptimizationsDumpName);
if (!run_backend_only) {
TF_ASSIGN_OR_RETURN(
module, backend->compiler()->RunHloPasses(std::move(module), executor,
device_allocator));
TF_ASSIGN_OR_RETURN(module, backend->compiler()->RunHloPasses(
std::move(module), executor, options));
}
TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
backend->compiler()->RunBackend(
std::move(module), executor, device_allocator));
TF_ASSIGN_OR_RETURN(
std::unique_ptr<Executable> executable,
backend->compiler()->RunBackend(std::move(module), executor, options));
const auto& debug_opts = module_config->debug_options();
if (DumpingEnabledForHloModule(module_proto.name(), debug_opts) &&
@ -875,7 +872,7 @@ Status Service::Compile(const CompileRequest* arg, CompileResponse* result) {
BuildExecutable(arg->computation(), std::move(module_config),
execute_backend_.get(),
execute_backend_->default_stream_executor(),
/*device_allocator=*/nullptr));
{/*device_allocator=*/nullptr}));
*result->mutable_handle() = compilation_cache_.Insert(std::move(executable));

View File

@ -235,8 +235,7 @@ class Service : public ServiceInterface {
StatusOr<std::unique_ptr<Executable>> BuildExecutable(
const HloModuleProto& module_proto,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
se::StreamExecutor* executor,
se::DeviceMemoryAllocator* device_allocator = nullptr,
se::StreamExecutor* executor, const Compiler::CompileOptions& options,
bool run_backend_only = false);
// Same as BuildExecutable() above, but builds a list of Executables for the
@ -245,8 +244,7 @@ class Service : public ServiceInterface {
const std::vector<const HloModuleProto*>& module_protos,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
se::DeviceMemoryAllocator* device_allocator,
bool run_backend_only = false);
const Compiler::CompileOptions& options, bool run_backend_only = false);
// Runs the given executable with the given arguments and register the result
// in the allocation tracker. The handle of the result from the tracker is

View File

@ -57,7 +57,8 @@ class GpuDummyCompiler : public GpuCompiler {
StatusOr<std::pair<std::string, std::vector<uint8>>> CompileTargetBinary(
const HloModule* hlo_module, llvm::Module* llvm_module,
GpuVersion gpu_version, se::StreamExecutor* stream_exec) {
GpuVersion gpu_version, se::StreamExecutor* stream_exec,
bool relocatable) {
if (user_post_optimization_hook_) {
user_post_optimization_hook_(*llvm_module);
}

View File

@ -598,6 +598,16 @@ cc_library(
]),
)
cc_library(
name = "cuda_asm_compiler",
srcs = if_cuda_is_configured(["cuda_asm_compiler.cc"]),
deps = if_cuda_is_configured([
"//tensorflow/core:lib_proto_parsing",
"//tensorflow/stream_executor/gpu:asm_compiler",
"//tensorflow/stream_executor/gpu:gpu_driver_header",
]),
)
cc_library(
name = "cuda_gpu_executor",
srcs = if_cuda_is_configured(["cuda_gpu_executor.cc"]),
@ -611,6 +621,7 @@ cc_library(
":cuda_platform_id",
":cuda_stream",
":cuda_timer",
":cuda_asm_compiler",
"@com_google_absl//absl/strings",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:plugin_registry",

View File

@ -0,0 +1,55 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
namespace stream_executor {
#define RETURN_IF_CUDA_ERROR(expr) \
do { \
CUresult _status = expr; \
if (!SE_PREDICT_TRUE(_status == CUDA_SUCCESS)) { \
const char* error_string; \
cuGetErrorString(_status, &error_string); \
std::ostringstream oss; \
oss << error_string << "\nin " << __FILE__ << "(" << __LINE__ << "): '" \
<< #expr << "'"; \
return port::Status(port::error::UNKNOWN, oss.str().c_str()); \
} \
} while (false)
port::StatusOr<std::vector<uint8>> LinkGpuAsm(
gpu::GpuContext* context, std::vector<CubinOrPTXImage> images) {
gpu::ScopedActivateContext activation(context);
CUlinkState link_state;
RETURN_IF_CUDA_ERROR(cuLinkCreate(0, nullptr, nullptr, &link_state));
for (auto& image : images) {
RETURN_IF_CUDA_ERROR(cuLinkAddData(
link_state, CU_JIT_INPUT_CUBIN, static_cast<void*>(image.bytes.data()),
image.bytes.size(), "", 0, nullptr, nullptr));
}
void* cubin_out;
size_t cubin_size;
RETURN_IF_CUDA_ERROR(cuLinkComplete(link_state, &cubin_out, &cubin_size));
std::vector<uint8> cubin(static_cast<uint8*>(cubin_out),
static_cast<uint8*>(cubin_out) + cubin_size);
RETURN_IF_CUDA_ERROR(cuLinkDestroy(link_state));
return std::move(cubin);
}
} // namespace stream_executor

View File

@ -24,6 +24,9 @@ limitations under the License.
#include "tensorflow/stream_executor/platform/port.h"
namespace stream_executor {
namespace gpu {
class GpuContext;
}
// Compiles the given PTX string using ptxas and returns the resulting machine
// code (i.e. a cubin) as a byte array. The generated cubin matches the compute
@ -72,6 +75,11 @@ struct HsacoImage {
port::StatusOr<std::vector<uint8>> BundleGpuAsm(
std::vector<HsacoImage> images, const std::string rocm_root_dir);
// Links multiple relocatable GPU images (e.g. results of ptxas -c) into a
// single image.
port::StatusOr<std::vector<uint8>> LinkGpuAsm(
gpu::GpuContext* context, std::vector<CubinOrPTXImage> images);
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_