[XLA/GPU] Emit constants computation by computation. This helps with incrementally transitioning to MLIR.

It also explicitly passes emitted constants to GpuExecutable.

PiperOrigin-RevId: 331678876
Change-Id: Ia855e16416b8b1a81e13b162d6503f8b5512c777
This commit is contained in:
Tim Shen 2020-09-14 19:09:10 -07:00 committed by TensorFlower Gardener
parent c56af7e249
commit 2f20efb58d
12 changed files with 149 additions and 91 deletions

View File

@ -307,6 +307,7 @@ Status GpuCompiler::OptimizeHloModule(
horizontal_fusion.AddPass<HloDCE>();
TF_RETURN_IF_ERROR(horizontal_fusion.Run(hlo_module).status());
}
{
HloPassPipeline pipeline("all_reduce_combiner");
pipeline.AddPass<AllReduceCombiner>(
@ -483,7 +484,8 @@ static Status CompileModuleToLlvmIrImpl(
int pointer_size, const HloProfileIndexMap* profile_index_map,
std::unique_ptr<llvm::Module>* llvm_module,
std::unique_ptr<BufferAssignment>* buffer_assignment,
std::unique_ptr<ThunkSchedule>* thunk_schedule) {
std::unique_ptr<ThunkSchedule>* thunk_schedule,
std::vector<GpuExecutable::ConstantInfo>* constants) {
*llvm_module = absl::make_unique<llvm::Module>("", *llvm_context);
(*llvm_module)->setTargetTriple(target_triple);
@ -530,8 +532,6 @@ static Status CompileModuleToLlvmIrImpl(
IrEmitterUnnested::Create(hlo_module->config(), entry_computation,
&ir_emitter_context));
TF_RETURN_IF_ERROR(ir_emitter->EmitConstantGlobals());
{
XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - IR emission");
@ -580,6 +580,10 @@ static Status CompileModuleToLlvmIrImpl(
*thunk_schedule = absl::make_unique<ThunkSchedule>(
std::make_unique<ThunkSequence>(std::move(thunk_sequence)),
std::move(stream_assignment), std::move(thunk_to_hlo));
if (constants) {
*constants = std::move(ir_emitter_context.constants());
}
}
return Status::OK();
@ -645,12 +649,13 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
std::unique_ptr<llvm::Module> llvm_module;
std::unique_ptr<BufferAssignment> buffer_assignment;
std::unique_ptr<ThunkSchedule> thunk_schedule;
std::vector<GpuExecutable::ConstantInfo> constants;
TF_RETURN_IF_ERROR(CompileModuleToLlvmIrImpl(
module.get(), &llvm_context, target_triple_, data_layout_,
stream_exec->platform()->Name(), gpu_device_info, cuda_compute_capability,
GetCanShareBuffer(), pointer_size_, profile_index_map.get(), &llvm_module,
&buffer_assignment, &thunk_schedule));
&buffer_assignment, &thunk_schedule, &constants));
if (user_pre_optimization_hook_) {
user_pre_optimization_hook_(*llvm_module);
@ -696,7 +701,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
backend_result.first, backend_result.second, gpu_version,
std::move(thunk_schedule), std::move(module),
std::move(buffer_assignment), std::move(profile_printer),
std::move(profile_index_map));
std::move(profile_index_map), std::move(constants));
if (embed_ir_in_executable) {
DCHECK_NE("", ir_module_string_before_opt);
gpu_executable->set_ir_module_string(ir_module_string_before_opt);
@ -730,7 +735,7 @@ StatusOr<std::unique_ptr<llvm::Module>> CompileModuleToLlvmIr(
hlo_module, llvm_context, target_triple, data_layout, platform_name,
gpu_device_info, cuda_compute_capability, DummyCanShareBufferFunction,
pointer_size, /*profile_index_map=*/nullptr, &llvm_module,
&buffer_assignment, &thunk_schedule));
&buffer_assignment, &thunk_schedule, nullptr));
return llvm_module;
}
} // namespace gpu

View File

@ -60,14 +60,16 @@ GpuExecutable::GpuExecutable(
std::shared_ptr<HloModule> hlo_module,
std::shared_ptr<const BufferAssignment> assignment,
std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map,
std::vector<ConstantInfo> globals)
: Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
std::move(hlo_profile_index_map)),
text_(text),
binary_(binary),
gpu_version_(gpu_version),
thunk_schedule_(std::move(thunk_schedule)),
assignment_(std::move(assignment)) {
assignment_(std::move(assignment)),
constants_(std::move(globals)) {
CHECK(has_module() && assignment_);
GpuDebugInfoManager::Get()->RegisterModule(module().name(), shared_module(),
assignment_);
@ -280,28 +282,23 @@ GpuExecutable::ResolveConstantGlobals(se::Stream* stream) {
se::ModuleHandle module_handle;
TF_RETURN_IF_ERROR(executor->LoadModule(module_spec, &module_handle));
for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
++i) {
const BufferAllocation& allocation = assignment_->GetAllocation(i);
if (allocation.is_constant()) {
TF_ASSIGN_OR_RETURN(
se::DeviceMemoryBase global,
executor->GetUntypedSymbol(
llvm_ir::ConstantBufferAllocationToGlobalName(allocation),
module_handle));
VLOG(3) << "Resolved global "
<< llvm_ir::ConstantBufferAllocationToGlobalName(allocation)
<< " to " << global.opaque();
InsertOrDie(&globals, i, global);
for (const auto& info : constants_) {
const Literal& literal = info.content;
const Literal& literal =
llvm_ir::LiteralForConstantAllocation(allocation);
CHECK(literal.shape().IsArray());
if (!ShouldEmitLiteralInLlvmIr(literal)) {
VLOG(3) << "H2D memcpy for constant with shape "
<< ShapeUtil::HumanString(literal.shape());
stream->ThenMemcpy(&global, literal.untyped_data(), allocation.size());
}
TF_ASSIGN_OR_RETURN(auto global, executor->GetUntypedSymbol(
info.symbol_name, module_handle));
VLOG(3) << "Resolved global " << info.symbol_name << " to "
<< global.opaque();
CHECK(literal.shape().IsArray());
if (!ShouldEmitLiteralInLlvmIr(literal)) {
VLOG(3) << "H2D memcpy for constant with shape "
<< ShapeUtil::HumanString(literal.shape());
stream->ThenMemcpy(&global, literal.untyped_data(), literal.size_bytes());
}
if (info.allocation_index != -1) {
InsertOrDie(&globals, info.allocation_index, global);
}
}
@ -334,7 +331,11 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::BufferForAllocation(
}
return registered_buffer;
} else if (allocation.is_constant()) {
return FindOrDie(*globals, arg_idx);
auto it = globals->find(arg_idx);
if (it == globals->end()) {
return se::DeviceMemoryBase();
}
return it->second;
} else {
// Allocate each allocation that might escape, or is the temp buffer.
CHECK(allocation.maybe_live_out() || allocation.IsPreallocatedTempBuffer());

View File

@ -49,6 +49,12 @@ namespace gpu {
// This is an immutable data type after initialization, and thus thread safe.
class GpuExecutable : public Executable {
public:
// Describes one constant emitted as a global into the generated module.
// GpuExecutable resolves these symbols to device addresses at load time
// (see ResolveConstantGlobals).
struct ConstantInfo {
  std::string symbol_name;  // Name of the emitted global symbol.
  xla::Literal content;     // Host-side copy of the constant's value.
  // Index of the BufferAllocation backing this constant, or -1 when the
  // constant has no associated allocation (e.g. constants emitted for
  // nested computations, which skip the allocation-index lookup).
  int allocation_index = -1;
};
// We need to share ownership of hlo_module and assignment with profiler to
// safely keep a reference to these objects during tracing period, thus they
// are passed as shared pointers.
@ -58,7 +64,8 @@ class GpuExecutable : public Executable {
std::shared_ptr<HloModule> hlo_module,
std::shared_ptr<const BufferAssignment> assignment,
std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map,
std::vector<ConstantInfo> constants);
~GpuExecutable() override;
int64 SizeOfGeneratedCodeInBytes() const override;
@ -169,6 +176,8 @@ class GpuExecutable : public Executable {
std::map<stream_executor::StreamExecutor*, BufferAllocToDeviceMemoryMap>
module_globals_ TF_GUARDED_BY(module_handle_mutex_);
std::vector<ConstantInfo> constants_;
TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
};

View File

@ -83,6 +83,8 @@ void HloToIrBindings::EmitBasePointersForHlos(
if (non_io_hlo->opcode() == HloOpcode::kConstant) {
llvm::Value* global_for_constant = module_->getGlobalVariable(
llvm_ir::ConstantHloToGlobalName(*non_io_hlo));
CHECK(global_for_constant)
<< llvm_ir::ConstantHloToGlobalName(*non_io_hlo);
BindHloToIrValue(*non_io_hlo, global_for_constant);
} else {
llvm::Type* pointee_type =

View File

@ -30,12 +30,14 @@ limitations under the License.
#include "tensorflow/compiler/xla/primitive_util.h"
#include "tensorflow/compiler/xla/service/elemental_ir_emitter.h"
#include "tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
#include "tensorflow/compiler/xla/service/gpu/launch_dimensions.h"
#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
@ -98,6 +100,64 @@ Status IrEmitter::DefaultAction(HloInstruction* hlo) {
.MakeElementGenerator(hlo, operand_to_generator));
}
// Emits one LLVM global per kConstant instruction in `computation`, and
// records a GpuExecutable::ConstantInfo for each in ir_emitter_context_.
// If `lookup_indices` is true, also records the buffer-allocation index
// associated with each constant (when buffer assignment has a unique
// slice for it).
Status IrEmitter::EmitConstants(const HloComputation& computation,
                                bool lookup_indices) {
  for (HloInstruction* instr : computation.instructions()) {
    if (instr->opcode() != HloOpcode::kConstant) {
      continue;
    }
    Literal& literal = *Cast<HloConstantInstruction>(instr)->mutable_literal();
    // Constants that should not be baked into the IR get a zero
    // initializer here; GpuExecutable copies their content to the device
    // at load time instead (see ResolveConstantGlobals).
    const bool should_emit_initializer = ShouldEmitLiteralInLlvmIr(literal);
    llvm::ArrayType* global_type =
        llvm::ArrayType::get(b_.getInt8Ty(), literal.size_bytes());
    llvm::Constant* initializer =
        should_emit_initializer
            ? llvm_ir::ConvertLiteralToIrConstant(literal, module_)
            : llvm::ConstantAggregateZero::get(global_type);
    if (should_emit_initializer) {
      VLOG(3) << "Emitted initializer for constant with shape "
              << ShapeUtil::HumanString(literal.shape());
    }
    // These globals will be looked up by name by GpuExecutable so we need to
    // give them an external linkage. Not all of their uses are visible in
    // the LLVM IR (e.g. TupleThunk) so we can't give them a linkage that
    // merely preserves their names (like available_externally), we also need
    // to ensure that they stick around even if they're "unused".
    //
    // We may have to be more clever here in the future if we notice that
    // we're keeping around too many globals because of their linkage.
    unsigned global_address_space = llvm_ir::GetGlobalMemoryAddressSpace(
        *ir_emitter_context_->llvm_module());
    std::string global_name = llvm_ir::ConstantHloToGlobalName(*instr);
    llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable(
        global_type, /*isConstant=*/should_emit_initializer,
        llvm::GlobalValue::ExternalLinkage,
        /*Initializer=*/initializer, global_name,
        /*TLMode=*/llvm::GlobalValue::NotThreadLocal,
        /*AddressSpace=*/global_address_space,
        /*isExternallyInitialized=*/false);
    global_for_const->setAlignment(llvm::Align(kConstantBufferAlignBytes));
    // The module takes ownership of the global.
    ir_emitter_context_->llvm_module()->getGlobalList().push_back(
        global_for_const);

    // Record the constant so GpuExecutable can resolve it at load time.
    GpuExecutable::ConstantInfo info;
    info.symbol_name = global_name;
    info.content = literal.Clone();
    if (lookup_indices) {
      auto maybe_slice =
          ir_emitter_context_->buffer_assignment().GetUniqueSlice(instr, {});
      if (maybe_slice.ok()) {
        info.allocation_index = maybe_slice.ValueOrDie().index();
      }
    }
    ir_emitter_context_->constants().push_back(std::move(info));
  }
  return Status::OK();
}
Status IrEmitter::HandleConstant(HloInstruction* constant) {
return Status::OK();
}
@ -175,10 +235,12 @@ Status IrEmitter::EmitCallToNestedComputation(
llvm::Function*& emitted_function =
computation_to_ir_function_[&nested_computation];
if (emitted_function == nullptr) {
IrEmitterNested ir_emitter_nested(hlo_module_config_, nested_computation,
ir_emitter_context_);
TF_RETURN_IF_ERROR(ir_emitter_nested.CodegenNestedComputation());
emitted_function = ir_emitter_nested.GetEmittedFunction();
TF_ASSIGN_OR_RETURN(
auto ir_emitter_nested,
IrEmitterNested::Create(hlo_module_config_, nested_computation,
ir_emitter_context_));
TF_RETURN_IF_ERROR(ir_emitter_nested->CodegenNestedComputation());
emitted_function = ir_emitter_nested->GetEmittedFunction();
}
// Operands are in default address space for non-AMDGPU target.

View File

@ -105,6 +105,12 @@ class IrEmitter : public DfsHloVisitorWithDefault,
llvm::IRBuilder<>* builder() { return &b_; }
// Emits constants to the generated LLVM IR, and also populates related
// information in ir_emitter_context for large-constant initializations. If
// `lookup_indices` is true, the allocation index associated with the constant
// is also populated.
Status EmitConstants(const HloComputation& computation, bool lookup_indices);
protected:
// Constructs an IrEmitter with the given IrEmitter context.
// ir_emitter_context is owned by the caller and should outlive the IrEmitter

View File

@ -22,12 +22,14 @@ limitations under the License.
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
#include "tensorflow/compiler/xla/service/gpu/launch_dimensions.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/name_uniquer.h"
namespace xla {
namespace gpu {
// IrEmitterContext encapsulates common (mutable and immutable) data structures
// used by both IrEmitterNested and IrEmitterUnnested, such as the buffer
// assignment and the name uniquer.
@ -71,6 +73,8 @@ class IrEmitterContext {
llvm::Module* llvm_module() { return llvm_module_; }
NameUniquer* name_uniquer() { return &name_uniquer_; }
std::vector<GpuExecutable::ConstantInfo>& constants() { return constants_; }
private:
const HloModule* hlo_module_;
const BufferAssignment* buffer_assignment_;
@ -81,6 +85,7 @@ class IrEmitterContext {
mlir::MLIRContext* mlir_context_;
llvm::Module* llvm_module_;
NameUniquer name_uniquer_;
std::vector<GpuExecutable::ConstantInfo> constants_;
};
} // namespace gpu

View File

@ -41,6 +41,16 @@ IrEmitterNested::IrEmitterNested(const HloModuleConfig& hlo_module_config,
: IrEmitter(hlo_module_config, ir_emitter_context, /*is_nested=*/true),
nested_computation_(nested_computation) {}
// Factory for IrEmitterNested. Builds the emitter and eagerly emits the
// constants of `nested_computation` into the module.
StatusOr<std::unique_ptr<IrEmitterNested>> IrEmitterNested::Create(
    const HloModuleConfig& hlo_module_config,
    const HloComputation& nested_computation,
    IrEmitterContext* ir_emitter_context) {
  // The constructor is private, so absl::make_unique cannot be used here.
  auto emitter = std::unique_ptr<IrEmitterNested>(new IrEmitterNested(
      hlo_module_config, nested_computation, ir_emitter_context));
  // Constants of nested computations are not looked up by allocation index.
  TF_RETURN_IF_ERROR(
      emitter->EmitConstants(nested_computation, /*lookup_indices=*/false));
  return emitter;
}
// Nested function serves the same purpose on GPU as a thread-local function on
// a CPU.
Status IrEmitterNested::CodegenNestedComputation() {

View File

@ -39,12 +39,11 @@ namespace gpu {
//
class IrEmitterNested : public IrEmitter {
public:
// Constructs an LLVM IR emitter for a nested HLO computation. `function` is
// the containing IR function this emitter produces IR to. See
// IrEmitter::IrEmitter for the meanings of other arguments.
IrEmitterNested(const HloModuleConfig& hlo_module_config,
const HloComputation& nested_computation,
IrEmitterContext* ir_emitter_context);
static StatusOr<std::unique_ptr<IrEmitterNested>> Create(
const HloModuleConfig& hlo_module_config,
const HloComputation& nested_computation,
IrEmitterContext* ir_emitter_context);
IrEmitterNested(const IrEmitterNested&) = delete;
IrEmitterNested& operator=(const IrEmitterNested&) = delete;
@ -62,6 +61,13 @@ class IrEmitterNested : public IrEmitter {
Status CodegenNestedComputation();
private:
// Constructs an LLVM IR emitter for a nested HLO computation. See
// IrEmitter::IrEmitter for the meanings of the arguments.
IrEmitterNested(const HloModuleConfig& hlo_module_config,
const HloComputation& nested_computation,
IrEmitterContext* ir_emitter_context);
const HloComputation& nested_computation_;
llvm::Function* emitted_function_;
};

View File

@ -228,6 +228,7 @@ StatusOr<std::unique_ptr<IrEmitterUnnested>> IrEmitterUnnested::Create(
auto emitter = std::unique_ptr<IrEmitterUnnested>(new IrEmitterUnnested(
hlo_module_config, hlo_computation, ir_emitter_context));
TF_RETURN_IF_ERROR(emitter->lhlo_scratch_emitter_.Initialize());
TF_RETURN_IF_ERROR(emitter->EmitConstants(*hlo_computation, true));
return std::move(emitter);
}
@ -3992,52 +3993,6 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions(
return Status::OK();
}
// Emits one LLVM global per constant BufferAllocation in the module's
// buffer assignment. GpuExecutable later resolves each global by its
// allocation-derived name (see ConstantBufferAllocationToGlobalName).
Status IrEmitterUnnested::EmitConstantGlobals() {
  for (const BufferAllocation& allocation :
       ir_emitter_context_->buffer_assignment().Allocations()) {
    if (!allocation.is_constant()) {
      continue;
    }

    const Literal& literal = llvm_ir::LiteralForConstantAllocation(allocation);
    // Constants that should not be baked into the IR get a zero
    // initializer; their content is copied to the device at load time
    // instead.
    const bool should_emit_initializer = ShouldEmitLiteralInLlvmIr(literal);
    llvm::ArrayType* global_type =
        llvm::ArrayType::get(b_.getInt8Ty(), allocation.size());
    llvm::Constant* initializer =
        should_emit_initializer
            ? llvm_ir::ConvertLiteralToIrConstant(literal, module_)
            : llvm::ConstantAggregateZero::get(global_type);
    if (should_emit_initializer) {
      VLOG(3) << "Emitted initializer for constant with shape "
              << ShapeUtil::HumanString(literal.shape());
    }

    // These globals will be looked up by name by GpuExecutable so we need to
    // give them an external linkage. Not all of their uses are visible in
    // the LLVM IR (e.g. TupleThunk) so we can't give them a linkage that
    // merely preserves their names (like available_externally), we also need
    // to ensure that they stick around even if they're "unused".
    //
    // We may have to be more clever here in the future if we notice that
    // we're keeping around too many globals because of their linkage.
    unsigned global_address_space = llvm_ir::GetGlobalMemoryAddressSpace(
        *ir_emitter_context_->llvm_module());
    llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable(
        global_type, /*isConstant=*/should_emit_initializer,
        llvm::GlobalValue::ExternalLinkage,
        /*Initializer=*/initializer,
        llvm_ir::ConstantBufferAllocationToGlobalName(allocation),
        /*TLMode=*/llvm::GlobalValue::NotThreadLocal,
        /*AddressSpace=*/global_address_space,
        /*isExternallyInitialized=*/false);
    global_for_const->setAlignment(llvm::Align(kConstantBufferAlignBytes));
    // The module takes ownership of the global.
    ir_emitter_context_->llvm_module()->getGlobalList().push_back(
        global_for_const);
  }

  return Status::OK();
}
// Emits code for slices based on the below structure. An if statement with
// a guarding condition is generated for each ROOT slice.
//

View File

@ -180,9 +180,6 @@ class IrEmitterUnnested : public IrEmitter,
const HloInstruction& hlo, const llvm_ir::ElementGenerator& body_emitter,
KernelThunk* thunk, int unroll_factor);
// Emits LLVM global variables corresponding to constant instructions.
Status EmitConstantGlobals();
Status Postprocess(HloInstruction* hlo) override;
private:

View File

@ -580,7 +580,7 @@ StatusOr<std::unique_ptr<Executable>> MlirCompilerImpl::RunBackend(
return {absl::make_unique<GpuExecutable>(
ptx, cubin, GetGpuVersion(stream_exec), std::move(thunk_schedule),
emission_context.releaseHloModule(), std::move(buffer_assignment),
nullptr, nullptr)};
nullptr, nullptr, std::vector<GpuExecutable::ConstantInfo>())};
}
StatusOr<std::vector<std::unique_ptr<Executable>>> MlirCompilerImpl::Compile(