[XLA/GPU] Emit constants computation by computation. This helps with incrementally transitioning to MLIR.
It also explicitly passes emitted constants to GpuExecutable. PiperOrigin-RevId: 331678876 Change-Id: Ia855e16416b8b1a81e13b162d6503f8b5512c777
This commit is contained in:
parent
c56af7e249
commit
2f20efb58d
@ -307,6 +307,7 @@ Status GpuCompiler::OptimizeHloModule(
|
||||
horizontal_fusion.AddPass<HloDCE>();
|
||||
TF_RETURN_IF_ERROR(horizontal_fusion.Run(hlo_module).status());
|
||||
}
|
||||
|
||||
{
|
||||
HloPassPipeline pipeline("all_reduce_combiner");
|
||||
pipeline.AddPass<AllReduceCombiner>(
|
||||
@ -483,7 +484,8 @@ static Status CompileModuleToLlvmIrImpl(
|
||||
int pointer_size, const HloProfileIndexMap* profile_index_map,
|
||||
std::unique_ptr<llvm::Module>* llvm_module,
|
||||
std::unique_ptr<BufferAssignment>* buffer_assignment,
|
||||
std::unique_ptr<ThunkSchedule>* thunk_schedule) {
|
||||
std::unique_ptr<ThunkSchedule>* thunk_schedule,
|
||||
std::vector<GpuExecutable::ConstantInfo>* constants) {
|
||||
*llvm_module = absl::make_unique<llvm::Module>("", *llvm_context);
|
||||
|
||||
(*llvm_module)->setTargetTriple(target_triple);
|
||||
@ -530,8 +532,6 @@ static Status CompileModuleToLlvmIrImpl(
|
||||
IrEmitterUnnested::Create(hlo_module->config(), entry_computation,
|
||||
&ir_emitter_context));
|
||||
|
||||
TF_RETURN_IF_ERROR(ir_emitter->EmitConstantGlobals());
|
||||
|
||||
{
|
||||
XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - IR emission");
|
||||
|
||||
@ -580,6 +580,10 @@ static Status CompileModuleToLlvmIrImpl(
|
||||
*thunk_schedule = absl::make_unique<ThunkSchedule>(
|
||||
std::make_unique<ThunkSequence>(std::move(thunk_sequence)),
|
||||
std::move(stream_assignment), std::move(thunk_to_hlo));
|
||||
|
||||
if (constants) {
|
||||
*constants = std::move(ir_emitter_context.constants());
|
||||
}
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
@ -645,12 +649,13 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
|
||||
std::unique_ptr<llvm::Module> llvm_module;
|
||||
std::unique_ptr<BufferAssignment> buffer_assignment;
|
||||
std::unique_ptr<ThunkSchedule> thunk_schedule;
|
||||
std::vector<GpuExecutable::ConstantInfo> constants;
|
||||
|
||||
TF_RETURN_IF_ERROR(CompileModuleToLlvmIrImpl(
|
||||
module.get(), &llvm_context, target_triple_, data_layout_,
|
||||
stream_exec->platform()->Name(), gpu_device_info, cuda_compute_capability,
|
||||
GetCanShareBuffer(), pointer_size_, profile_index_map.get(), &llvm_module,
|
||||
&buffer_assignment, &thunk_schedule));
|
||||
&buffer_assignment, &thunk_schedule, &constants));
|
||||
|
||||
if (user_pre_optimization_hook_) {
|
||||
user_pre_optimization_hook_(*llvm_module);
|
||||
@ -696,7 +701,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
|
||||
backend_result.first, backend_result.second, gpu_version,
|
||||
std::move(thunk_schedule), std::move(module),
|
||||
std::move(buffer_assignment), std::move(profile_printer),
|
||||
std::move(profile_index_map));
|
||||
std::move(profile_index_map), std::move(constants));
|
||||
if (embed_ir_in_executable) {
|
||||
DCHECK_NE("", ir_module_string_before_opt);
|
||||
gpu_executable->set_ir_module_string(ir_module_string_before_opt);
|
||||
@ -730,7 +735,7 @@ StatusOr<std::unique_ptr<llvm::Module>> CompileModuleToLlvmIr(
|
||||
hlo_module, llvm_context, target_triple, data_layout, platform_name,
|
||||
gpu_device_info, cuda_compute_capability, DummyCanShareBufferFunction,
|
||||
pointer_size, /*profile_index_map=*/nullptr, &llvm_module,
|
||||
&buffer_assignment, &thunk_schedule));
|
||||
&buffer_assignment, &thunk_schedule, nullptr));
|
||||
return llvm_module;
|
||||
}
|
||||
} // namespace gpu
|
||||
|
@ -60,14 +60,16 @@ GpuExecutable::GpuExecutable(
|
||||
std::shared_ptr<HloModule> hlo_module,
|
||||
std::shared_ptr<const BufferAssignment> assignment,
|
||||
std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
|
||||
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
|
||||
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map,
|
||||
std::vector<ConstantInfo> globals)
|
||||
: Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
|
||||
std::move(hlo_profile_index_map)),
|
||||
text_(text),
|
||||
binary_(binary),
|
||||
gpu_version_(gpu_version),
|
||||
thunk_schedule_(std::move(thunk_schedule)),
|
||||
assignment_(std::move(assignment)) {
|
||||
assignment_(std::move(assignment)),
|
||||
constants_(std::move(globals)) {
|
||||
CHECK(has_module() && assignment_);
|
||||
GpuDebugInfoManager::Get()->RegisterModule(module().name(), shared_module(),
|
||||
assignment_);
|
||||
@ -280,28 +282,23 @@ GpuExecutable::ResolveConstantGlobals(se::Stream* stream) {
|
||||
se::ModuleHandle module_handle;
|
||||
TF_RETURN_IF_ERROR(executor->LoadModule(module_spec, &module_handle));
|
||||
|
||||
for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
|
||||
++i) {
|
||||
const BufferAllocation& allocation = assignment_->GetAllocation(i);
|
||||
if (allocation.is_constant()) {
|
||||
TF_ASSIGN_OR_RETURN(
|
||||
se::DeviceMemoryBase global,
|
||||
executor->GetUntypedSymbol(
|
||||
llvm_ir::ConstantBufferAllocationToGlobalName(allocation),
|
||||
module_handle));
|
||||
VLOG(3) << "Resolved global "
|
||||
<< llvm_ir::ConstantBufferAllocationToGlobalName(allocation)
|
||||
<< " to " << global.opaque();
|
||||
InsertOrDie(&globals, i, global);
|
||||
for (const auto& info : constants_) {
|
||||
const Literal& literal = info.content;
|
||||
|
||||
const Literal& literal =
|
||||
llvm_ir::LiteralForConstantAllocation(allocation);
|
||||
CHECK(literal.shape().IsArray());
|
||||
if (!ShouldEmitLiteralInLlvmIr(literal)) {
|
||||
VLOG(3) << "H2D memcpy for constant with shape "
|
||||
<< ShapeUtil::HumanString(literal.shape());
|
||||
stream->ThenMemcpy(&global, literal.untyped_data(), allocation.size());
|
||||
}
|
||||
TF_ASSIGN_OR_RETURN(auto global, executor->GetUntypedSymbol(
|
||||
info.symbol_name, module_handle));
|
||||
VLOG(3) << "Resolved global " << info.symbol_name << " to "
|
||||
<< global.opaque();
|
||||
|
||||
CHECK(literal.shape().IsArray());
|
||||
if (!ShouldEmitLiteralInLlvmIr(literal)) {
|
||||
VLOG(3) << "H2D memcpy for constant with shape "
|
||||
<< ShapeUtil::HumanString(literal.shape());
|
||||
stream->ThenMemcpy(&global, literal.untyped_data(), literal.size_bytes());
|
||||
}
|
||||
|
||||
if (info.allocation_index != -1) {
|
||||
InsertOrDie(&globals, info.allocation_index, global);
|
||||
}
|
||||
}
|
||||
|
||||
@ -334,7 +331,11 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::BufferForAllocation(
|
||||
}
|
||||
return registered_buffer;
|
||||
} else if (allocation.is_constant()) {
|
||||
return FindOrDie(*globals, arg_idx);
|
||||
auto it = globals->find(arg_idx);
|
||||
if (it == globals->end()) {
|
||||
return se::DeviceMemoryBase();
|
||||
}
|
||||
return it->second;
|
||||
} else {
|
||||
// Allocate each allocation that might escape, or is the temp buffer.
|
||||
CHECK(allocation.maybe_live_out() || allocation.IsPreallocatedTempBuffer());
|
||||
|
@ -49,6 +49,12 @@ namespace gpu {
|
||||
// This is an immutable data type after initialization, and thus thread safe.
|
||||
class GpuExecutable : public Executable {
|
||||
public:
|
||||
// Describes one constant that was emitted as a named global in the generated
// module, so that the executable can resolve and (if needed) initialize it.
struct ConstantInfo {
|
||||
// Name of the global symbol; looked up in the loaded module by name.
std::string symbol_name;
|
||||
// Literal value of the constant. Copied to the device when the literal was
// not embedded in the LLVM IR.
xla::Literal content;
|
||||
// Index of the buffer allocation backing this constant, or -1 when no
// allocation index was recorded for it.
int allocation_index = -1;
|
||||
};
|
||||
|
||||
// We need to share ownership of hlo_module and assignment with profiler to
|
||||
// safely keep a reference to these objects during tracing period, thus they
|
||||
// are passed as shared pointers.
|
||||
@ -58,7 +64,8 @@ class GpuExecutable : public Executable {
|
||||
std::shared_ptr<HloModule> hlo_module,
|
||||
std::shared_ptr<const BufferAssignment> assignment,
|
||||
std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
|
||||
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
|
||||
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map,
|
||||
std::vector<ConstantInfo> constants);
|
||||
~GpuExecutable() override;
|
||||
|
||||
int64 SizeOfGeneratedCodeInBytes() const override;
|
||||
@ -169,6 +176,8 @@ class GpuExecutable : public Executable {
|
||||
std::map<stream_executor::StreamExecutor*, BufferAllocToDeviceMemoryMap>
|
||||
module_globals_ TF_GUARDED_BY(module_handle_mutex_);
|
||||
|
||||
std::vector<ConstantInfo> constants_;
|
||||
|
||||
TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
|
||||
};
|
||||
|
||||
|
@ -83,6 +83,8 @@ void HloToIrBindings::EmitBasePointersForHlos(
|
||||
if (non_io_hlo->opcode() == HloOpcode::kConstant) {
|
||||
llvm::Value* global_for_constant = module_->getGlobalVariable(
|
||||
llvm_ir::ConstantHloToGlobalName(*non_io_hlo));
|
||||
CHECK(global_for_constant)
|
||||
<< llvm_ir::ConstantHloToGlobalName(*non_io_hlo);
|
||||
BindHloToIrValue(*non_io_hlo, global_for_constant);
|
||||
} else {
|
||||
llvm::Type* pointee_type =
|
||||
|
@ -30,12 +30,14 @@ limitations under the License.
|
||||
#include "tensorflow/compiler/xla/primitive_util.h"
|
||||
#include "tensorflow/compiler/xla/service/elemental_ir_emitter.h"
|
||||
#include "tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h"
|
||||
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
|
||||
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h"
|
||||
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
|
||||
#include "tensorflow/compiler/xla/service/gpu/launch_dimensions.h"
|
||||
#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
|
||||
#include "tensorflow/compiler/xla/service/hlo_computation.h"
|
||||
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
|
||||
#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
|
||||
#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
|
||||
#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
|
||||
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
|
||||
@ -98,6 +100,64 @@ Status IrEmitter::DefaultAction(HloInstruction* hlo) {
|
||||
.MakeElementGenerator(hlo, operand_to_generator));
|
||||
}
|
||||
|
||||
// Emits one LLVM global for every kConstant instruction in `computation` and
// appends a matching GpuExecutable::ConstantInfo (symbol name plus a clone of
// the literal) to ir_emitter_context_->constants().
//
// When ShouldEmitLiteralInLlvmIr() is true for a literal, its bytes become the
// global's initializer and the global is marked constant. Otherwise the global
// is zero-initialized and left mutable.
//
// If `lookup_indices` is true and buffer assignment reports a unique top-level
// slice for the instruction, ConstantInfo::allocation_index is set to that
// slice's allocation index; otherwise it keeps its default value.
//
// Always returns Status::OK().
Status IrEmitter::EmitConstants(const HloComputation& computation,
                                bool lookup_indices) {
  llvm::Module* const llvm_module = ir_emitter_context_->llvm_module();
  // The address space depends only on the module, so look it up once instead
  // of once per constant.
  const unsigned global_address_space =
      llvm_ir::GetGlobalMemoryAddressSpace(*llvm_module);

  for (HloInstruction* instr : computation.instructions()) {
    if (instr->opcode() != HloOpcode::kConstant) {
      continue;
    }

    // The literal is only read here, so use the const accessor.
    const Literal& literal = Cast<HloConstantInstruction>(instr)->literal();
    const bool should_emit_initializer = ShouldEmitLiteralInLlvmIr(literal);
    llvm::ArrayType* global_type =
        llvm::ArrayType::get(b_.getInt8Ty(), literal.size_bytes());
    llvm::Constant* initializer =
        should_emit_initializer
            ? llvm_ir::ConvertLiteralToIrConstant(literal, module_)
            : llvm::ConstantAggregateZero::get(global_type);
    if (should_emit_initializer) {
      VLOG(3) << "Emitted initializer for constant with shape "
              << ShapeUtil::HumanString(literal.shape());
    }

    // These globals will be looked up by name by GpuExecutable so we need to
    // give them an external linkage.  Not all of their uses are visible in
    // the LLVM IR (e.g. TupleThunk) so we can't give them a linkage that
    // merely preserves their names (like available_externally), we also need
    // to ensure that they stick around even if they're "unused".
    //
    // We may have to be more clever here in the future if we notice that
    // we're keeping around too many globals because of their linkage.
    std::string global_name = llvm_ir::ConstantHloToGlobalName(*instr);

    llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable(
        global_type, /*isConstant=*/should_emit_initializer,
        llvm::GlobalValue::ExternalLinkage,
        /*Initializer=*/initializer, global_name,
        /*TLMode=*/llvm::GlobalValue::NotThreadLocal,
        /*AddressSpace=*/global_address_space,
        /*isExternallyInitialized=*/false);
    global_for_const->setAlignment(llvm::Align(kConstantBufferAlignBytes));
    // The module's global list takes ownership of the new global.
    llvm_module->getGlobalList().push_back(global_for_const);

    GpuExecutable::ConstantInfo info;
    // `global_name` is not used past this point, so hand it over.
    info.symbol_name = std::move(global_name);
    info.content = literal.Clone();
    if (lookup_indices) {
      auto maybe_slice =
          ir_emitter_context_->buffer_assignment().GetUniqueSlice(instr, {});
      if (maybe_slice.ok()) {
        info.allocation_index = maybe_slice.ValueOrDie().index();
      }
    }
    ir_emitter_context_->constants().push_back(std::move(info));
  }
  return Status::OK();
}
|
||||
|
||||
// Visitor hook for constant instructions. Performs no work for `constant` and
// reports success.
Status IrEmitter::HandleConstant(HloInstruction* constant) {
|
||||
return Status::OK();
|
||||
}
|
||||
@ -175,10 +235,12 @@ Status IrEmitter::EmitCallToNestedComputation(
|
||||
llvm::Function*& emitted_function =
|
||||
computation_to_ir_function_[&nested_computation];
|
||||
if (emitted_function == nullptr) {
|
||||
IrEmitterNested ir_emitter_nested(hlo_module_config_, nested_computation,
|
||||
ir_emitter_context_);
|
||||
TF_RETURN_IF_ERROR(ir_emitter_nested.CodegenNestedComputation());
|
||||
emitted_function = ir_emitter_nested.GetEmittedFunction();
|
||||
TF_ASSIGN_OR_RETURN(
|
||||
auto ir_emitter_nested,
|
||||
IrEmitterNested::Create(hlo_module_config_, nested_computation,
|
||||
ir_emitter_context_));
|
||||
TF_RETURN_IF_ERROR(ir_emitter_nested->CodegenNestedComputation());
|
||||
emitted_function = ir_emitter_nested->GetEmittedFunction();
|
||||
}
|
||||
|
||||
// Operands are in default address space for non-AMDGPU target.
|
||||
|
@ -105,6 +105,12 @@ class IrEmitter : public DfsHloVisitorWithDefault,
|
||||
|
||||
llvm::IRBuilder<>* builder() { return &b_; }
|
||||
|
||||
// Emits constants to generated LLVM IR, and also populates related
|
||||
// information to ir_emitter_context for large-constant initializations. If
|
||||
// `lookup_indices` is true, the allocation index associated with the constant
|
||||
// is also populated.
|
||||
Status EmitConstants(const HloComputation& computation, bool lookup_indices);
|
||||
|
||||
protected:
|
||||
// Constructs an IrEmitter with the given IrEmitter context.
|
||||
// ir_emitter_context is owned by the caller and should outlive the IrEmitter
|
||||
|
@ -22,12 +22,14 @@ limitations under the License.
|
||||
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
|
||||
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
|
||||
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
|
||||
#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
|
||||
#include "tensorflow/compiler/xla/service/gpu/launch_dimensions.h"
|
||||
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
|
||||
#include "tensorflow/compiler/xla/service/name_uniquer.h"
|
||||
|
||||
namespace xla {
|
||||
namespace gpu {
|
||||
|
||||
// IrEmitterContext encapsulates common (mutable and immutable) data structures
|
||||
// used by both IrEmitterNested and IrEmitterUnnested, such as the buffer
|
||||
// assignment and the name uniquer.
|
||||
@ -71,6 +73,8 @@ class IrEmitterContext {
|
||||
llvm::Module* llvm_module() { return llvm_module_; }
|
||||
NameUniquer* name_uniquer() { return &name_uniquer_; }
|
||||
|
||||
std::vector<GpuExecutable::ConstantInfo>& constants() { return constants_; }
|
||||
|
||||
private:
|
||||
const HloModule* hlo_module_;
|
||||
const BufferAssignment* buffer_assignment_;
|
||||
@ -81,6 +85,7 @@ class IrEmitterContext {
|
||||
mlir::MLIRContext* mlir_context_;
|
||||
llvm::Module* llvm_module_;
|
||||
NameUniquer name_uniquer_;
|
||||
std::vector<GpuExecutable::ConstantInfo> constants_;
|
||||
};
|
||||
|
||||
} // namespace gpu
|
||||
|
@ -41,6 +41,16 @@ IrEmitterNested::IrEmitterNested(const HloModuleConfig& hlo_module_config,
|
||||
: IrEmitter(hlo_module_config, ir_emitter_context, /*is_nested=*/true),
|
||||
nested_computation_(nested_computation) {}
|
||||
|
||||
// Factory for IrEmitterNested. Constructs the emitter for
// `nested_computation`, then emits that computation's constants (without
// looking up allocation indices) before handing the emitter back. If constant
// emission fails, its error status is returned instead.
StatusOr<std::unique_ptr<IrEmitterNested>> IrEmitterNested::Create(
    const HloModuleConfig& hlo_module_config,
    const HloComputation& nested_computation,
    IrEmitterContext* ir_emitter_context) {
  auto nested_emitter = std::unique_ptr<IrEmitterNested>(new IrEmitterNested(
      hlo_module_config, nested_computation, ir_emitter_context));
  TF_RETURN_IF_ERROR(nested_emitter->EmitConstants(nested_computation,
                                                   /*lookup_indices=*/false));
  return nested_emitter;
}
|
||||
|
||||
// Nested function serves the same purpose on GPU as a thread-local function on
|
||||
// a CPU.
|
||||
Status IrEmitterNested::CodegenNestedComputation() {
|
||||
|
@ -39,12 +39,11 @@ namespace gpu {
|
||||
//
|
||||
class IrEmitterNested : public IrEmitter {
|
||||
public:
|
||||
// Constructs an LLVM IR emitter for a nested HLO computation. `function` is
|
||||
// the containing IR function this emitter produces IR to. See
|
||||
// IrEmitter::IrEmitter for the meanings of other arguments.
|
||||
IrEmitterNested(const HloModuleConfig& hlo_module_config,
|
||||
const HloComputation& nested_computation,
|
||||
IrEmitterContext* ir_emitter_context);
|
||||
static StatusOr<std::unique_ptr<IrEmitterNested>> Create(
|
||||
const HloModuleConfig& hlo_module_config,
|
||||
const HloComputation& nested_computation,
|
||||
IrEmitterContext* ir_emitter_context);
|
||||
|
||||
IrEmitterNested(const IrEmitterNested&) = delete;
|
||||
IrEmitterNested& operator=(const IrEmitterNested&) = delete;
|
||||
|
||||
@ -62,6 +61,13 @@ class IrEmitterNested : public IrEmitter {
|
||||
Status CodegenNestedComputation();
|
||||
|
||||
private:
|
||||
// Constructs an LLVM IR emitter for a nested HLO computation. `function` is
|
||||
// the containing IR function this emitter produces IR to. See
|
||||
// IrEmitter::IrEmitter for the meanings of other arguments.
|
||||
IrEmitterNested(const HloModuleConfig& hlo_module_config,
|
||||
const HloComputation& nested_computation,
|
||||
IrEmitterContext* ir_emitter_context);
|
||||
|
||||
const HloComputation& nested_computation_;
|
||||
llvm::Function* emitted_function_;
|
||||
};
|
||||
|
@ -228,6 +228,7 @@ StatusOr<std::unique_ptr<IrEmitterUnnested>> IrEmitterUnnested::Create(
|
||||
auto emitter = std::unique_ptr<IrEmitterUnnested>(new IrEmitterUnnested(
|
||||
hlo_module_config, hlo_computation, ir_emitter_context));
|
||||
TF_RETURN_IF_ERROR(emitter->lhlo_scratch_emitter_.Initialize());
|
||||
TF_RETURN_IF_ERROR(emitter->EmitConstants(*hlo_computation, true));
|
||||
return std::move(emitter);
|
||||
}
|
||||
|
||||
@ -3992,52 +3993,6 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions(
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status IrEmitterUnnested::EmitConstantGlobals() {
|
||||
for (const BufferAllocation& allocation :
|
||||
ir_emitter_context_->buffer_assignment().Allocations()) {
|
||||
if (!allocation.is_constant()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const Literal& literal = llvm_ir::LiteralForConstantAllocation(allocation);
|
||||
const bool should_emit_initializer = ShouldEmitLiteralInLlvmIr(literal);
|
||||
llvm::ArrayType* global_type =
|
||||
llvm::ArrayType::get(b_.getInt8Ty(), allocation.size());
|
||||
llvm::Constant* initializer =
|
||||
should_emit_initializer
|
||||
? llvm_ir::ConvertLiteralToIrConstant(literal, module_)
|
||||
: llvm::ConstantAggregateZero::get(global_type);
|
||||
if (should_emit_initializer) {
|
||||
VLOG(3) << "Emitted initializer for constant with shape "
|
||||
<< ShapeUtil::HumanString(literal.shape());
|
||||
}
|
||||
|
||||
// These globals will be looked up by name by GpuExecutable so we need to
|
||||
// give them an external linkage. Not all of their uses are visible in
|
||||
// the LLVM IR (e.g. TupleThunk) so we can't give them a linkage that
|
||||
// merely preserves their names (like available_externally), we also need
|
||||
// to ensure that they stick around even if they're "unused".
|
||||
//
|
||||
// We may have to be more clever here in the future if we notice that
|
||||
// we're keeping around too many globals because of their linkage.
|
||||
unsigned global_address_space = llvm_ir::GetGlobalMemoryAddressSpace(
|
||||
*ir_emitter_context_->llvm_module());
|
||||
llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable(
|
||||
global_type, /*isConstant=*/should_emit_initializer,
|
||||
llvm::GlobalValue::ExternalLinkage,
|
||||
/*Initializer=*/initializer,
|
||||
llvm_ir::ConstantBufferAllocationToGlobalName(allocation),
|
||||
/*TLMode=*/llvm::GlobalValue::NotThreadLocal,
|
||||
/*AddressSpace=*/global_address_space,
|
||||
/*isExternallyInitialized=*/false);
|
||||
global_for_const->setAlignment(llvm::Align(kConstantBufferAlignBytes));
|
||||
ir_emitter_context_->llvm_module()->getGlobalList().push_back(
|
||||
global_for_const);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Emits code for slices based on the below structure. An if statement with
|
||||
// a guarding condition is generated for each ROOT slice.
|
||||
//
|
||||
|
@ -180,9 +180,6 @@ class IrEmitterUnnested : public IrEmitter,
|
||||
const HloInstruction& hlo, const llvm_ir::ElementGenerator& body_emitter,
|
||||
KernelThunk* thunk, int unroll_factor);
|
||||
|
||||
// Emits LLVM global variables corresponding to constant instructions.
|
||||
Status EmitConstantGlobals();
|
||||
|
||||
Status Postprocess(HloInstruction* hlo) override;
|
||||
|
||||
private:
|
||||
|
@ -580,7 +580,7 @@ StatusOr<std::unique_ptr<Executable>> MlirCompilerImpl::RunBackend(
|
||||
return {absl::make_unique<GpuExecutable>(
|
||||
ptx, cubin, GetGpuVersion(stream_exec), std::move(thunk_schedule),
|
||||
emission_context.releaseHloModule(), std::move(buffer_assignment),
|
||||
nullptr, nullptr)};
|
||||
nullptr, nullptr, std::vector<GpuExecutable::ConstantInfo>())};
|
||||
}
|
||||
|
||||
StatusOr<std::vector<std::unique_ptr<Executable>>> MlirCompilerImpl::Compile(
|
||||
|
Loading…
x
Reference in New Issue
Block a user