[XLA/GPU] Emit constants computation by computation. This helps with incrementally transitioning to MLIR.

It also explicitly passes emitted constants to GpuExecutable.

PiperOrigin-RevId: 331678876
Change-Id: Ia855e16416b8b1a81e13b162d6503f8b5512c777
This commit is contained in:
Tim Shen 2020-09-14 19:09:10 -07:00 committed by TensorFlower Gardener
parent c56af7e249
commit 2f20efb58d
12 changed files with 149 additions and 91 deletions

View File

@ -307,6 +307,7 @@ Status GpuCompiler::OptimizeHloModule(
horizontal_fusion.AddPass<HloDCE>();
TF_RETURN_IF_ERROR(horizontal_fusion.Run(hlo_module).status());
}
{
HloPassPipeline pipeline("all_reduce_combiner");
pipeline.AddPass<AllReduceCombiner>(
@ -483,7 +484,8 @@ static Status CompileModuleToLlvmIrImpl(
int pointer_size, const HloProfileIndexMap* profile_index_map,
std::unique_ptr<llvm::Module>* llvm_module,
std::unique_ptr<BufferAssignment>* buffer_assignment,
std::unique_ptr<ThunkSchedule>* thunk_schedule) {
std::unique_ptr<ThunkSchedule>* thunk_schedule,
std::vector<GpuExecutable::ConstantInfo>* constants) {
*llvm_module = absl::make_unique<llvm::Module>("", *llvm_context);
(*llvm_module)->setTargetTriple(target_triple);
@ -530,8 +532,6 @@ static Status CompileModuleToLlvmIrImpl(
IrEmitterUnnested::Create(hlo_module->config(), entry_computation,
&ir_emitter_context));
TF_RETURN_IF_ERROR(ir_emitter->EmitConstantGlobals());
{
XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - IR emission");
@ -580,6 +580,10 @@ static Status CompileModuleToLlvmIrImpl(
*thunk_schedule = absl::make_unique<ThunkSchedule>(
std::make_unique<ThunkSequence>(std::move(thunk_sequence)),
std::move(stream_assignment), std::move(thunk_to_hlo));
if (constants) {
*constants = std::move(ir_emitter_context.constants());
}
}
return Status::OK();
@ -645,12 +649,13 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
std::unique_ptr<llvm::Module> llvm_module;
std::unique_ptr<BufferAssignment> buffer_assignment;
std::unique_ptr<ThunkSchedule> thunk_schedule;
std::vector<GpuExecutable::ConstantInfo> constants;
TF_RETURN_IF_ERROR(CompileModuleToLlvmIrImpl(
module.get(), &llvm_context, target_triple_, data_layout_,
stream_exec->platform()->Name(), gpu_device_info, cuda_compute_capability,
GetCanShareBuffer(), pointer_size_, profile_index_map.get(), &llvm_module,
&buffer_assignment, &thunk_schedule));
&buffer_assignment, &thunk_schedule, &constants));
if (user_pre_optimization_hook_) {
user_pre_optimization_hook_(*llvm_module);
@ -696,7 +701,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
backend_result.first, backend_result.second, gpu_version,
std::move(thunk_schedule), std::move(module),
std::move(buffer_assignment), std::move(profile_printer),
std::move(profile_index_map));
std::move(profile_index_map), std::move(constants));
if (embed_ir_in_executable) {
DCHECK_NE("", ir_module_string_before_opt);
gpu_executable->set_ir_module_string(ir_module_string_before_opt);
@ -730,7 +735,7 @@ StatusOr<std::unique_ptr<llvm::Module>> CompileModuleToLlvmIr(
hlo_module, llvm_context, target_triple, data_layout, platform_name,
gpu_device_info, cuda_compute_capability, DummyCanShareBufferFunction,
pointer_size, /*profile_index_map=*/nullptr, &llvm_module,
&buffer_assignment, &thunk_schedule));
&buffer_assignment, &thunk_schedule, nullptr));
return llvm_module;
}
} // namespace gpu

View File

@ -60,14 +60,16 @@ GpuExecutable::GpuExecutable(
std::shared_ptr<HloModule> hlo_module,
std::shared_ptr<const BufferAssignment> assignment,
std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map,
std::vector<ConstantInfo> globals)
: Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
std::move(hlo_profile_index_map)),
text_(text),
binary_(binary),
gpu_version_(gpu_version),
thunk_schedule_(std::move(thunk_schedule)),
assignment_(std::move(assignment)) {
assignment_(std::move(assignment)),
constants_(std::move(globals)) {
CHECK(has_module() && assignment_);
GpuDebugInfoManager::Get()->RegisterModule(module().name(), shared_module(),
assignment_);
@ -280,28 +282,23 @@ GpuExecutable::ResolveConstantGlobals(se::Stream* stream) {
se::ModuleHandle module_handle;
TF_RETURN_IF_ERROR(executor->LoadModule(module_spec, &module_handle));
for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
++i) {
const BufferAllocation& allocation = assignment_->GetAllocation(i);
if (allocation.is_constant()) {
TF_ASSIGN_OR_RETURN(
se::DeviceMemoryBase global,
executor->GetUntypedSymbol(
llvm_ir::ConstantBufferAllocationToGlobalName(allocation),
module_handle));
VLOG(3) << "Resolved global "
<< llvm_ir::ConstantBufferAllocationToGlobalName(allocation)
<< " to " << global.opaque();
InsertOrDie(&globals, i, global);
for (const auto& info : constants_) {
const Literal& literal = info.content;
const Literal& literal =
llvm_ir::LiteralForConstantAllocation(allocation);
CHECK(literal.shape().IsArray());
if (!ShouldEmitLiteralInLlvmIr(literal)) {
VLOG(3) << "H2D memcpy for constant with shape "
<< ShapeUtil::HumanString(literal.shape());
stream->ThenMemcpy(&global, literal.untyped_data(), allocation.size());
}
TF_ASSIGN_OR_RETURN(auto global, executor->GetUntypedSymbol(
info.symbol_name, module_handle));
VLOG(3) << "Resolved global " << info.symbol_name << " to "
<< global.opaque();
CHECK(literal.shape().IsArray());
if (!ShouldEmitLiteralInLlvmIr(literal)) {
VLOG(3) << "H2D memcpy for constant with shape "
<< ShapeUtil::HumanString(literal.shape());
stream->ThenMemcpy(&global, literal.untyped_data(), literal.size_bytes());
}
if (info.allocation_index != -1) {
InsertOrDie(&globals, info.allocation_index, global);
}
}
@ -334,7 +331,11 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::BufferForAllocation(
}
return registered_buffer;
} else if (allocation.is_constant()) {
return FindOrDie(*globals, arg_idx);
auto it = globals->find(arg_idx);
if (it == globals->end()) {
return se::DeviceMemoryBase();
}
return it->second;
} else {
// Allocate each allocation that might escape, or is the temp buffer.
CHECK(allocation.maybe_live_out() || allocation.IsPreallocatedTempBuffer());

View File

@ -49,6 +49,12 @@ namespace gpu {
// This is an immutable data type after initialization, and thus thread safe.
class GpuExecutable : public Executable {
public:
// Describes one constant emitted as a global into the generated module.
// GpuExecutable resolves these symbols to device addresses at load time
// (see ResolveConstantGlobals).
struct ConstantInfo {
  std::string symbol_name;  // Name of the emitted global symbol.
  xla::Literal content;     // Host-side copy of the constant's value.
  // Index of the BufferAllocation backing this constant, or -1 when the
  // constant has no associated allocation (e.g. constants emitted for
  // nested computations, which skip the allocation-index lookup).
  int allocation_index = -1;
};
// We need to share ownership of hlo_module and assignment with profiler to
// safely keep a reference to these objects during tracing period, thus they
// are passed as shared pointers.
@ -58,7 +64,8 @@ class GpuExecutable : public Executable {
std::shared_ptr<HloModule> hlo_module,
std::shared_ptr<const BufferAssignment> assignment,
std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map,
std::vector<ConstantInfo> constants);
~GpuExecutable() override;
int64 SizeOfGeneratedCodeInBytes() const override;
@ -169,6 +176,8 @@ class GpuExecutable : public Executable {
std::map<stream_executor::StreamExecutor*, BufferAllocToDeviceMemoryMap>
module_globals_ TF_GUARDED_BY(module_handle_mutex_);
std::vector<ConstantInfo> constants_;
TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
};

View File

@ -83,6 +83,8 @@ void HloToIrBindings::EmitBasePointersForHlos(
if (non_io_hlo->opcode() == HloOpcode::kConstant) {
llvm::Value* global_for_constant = module_->getGlobalVariable(
llvm_ir::ConstantHloToGlobalName(*non_io_hlo));
CHECK(global_for_constant)
<< llvm_ir::ConstantHloToGlobalName(*non_io_hlo);
BindHloToIrValue(*non_io_hlo, global_for_constant);
} else {
llvm::Type* pointee_type =

View File

@ -30,12 +30,14 @@ limitations under the License.
#include "tensorflow/compiler/xla/primitive_util.h"
#include "tensorflow/compiler/xla/service/elemental_ir_emitter.h"
#include "tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
#include "tensorflow/compiler/xla/service/gpu/launch_dimensions.h"
#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
@ -98,6 +100,64 @@ Status IrEmitter::DefaultAction(HloInstruction* hlo) {
.MakeElementGenerator(hlo, operand_to_generator));
}
// Emits one LLVM global per kConstant instruction in `computation`, and
// records a GpuExecutable::ConstantInfo for each in ir_emitter_context_.
// If `lookup_indices` is true, also records the buffer-allocation index
// associated with each constant (when buffer assignment has a unique
// slice for it).
Status IrEmitter::EmitConstants(const HloComputation& computation,
                                bool lookup_indices) {
  for (HloInstruction* instr : computation.instructions()) {
    if (instr->opcode() != HloOpcode::kConstant) {
      continue;
    }
    Literal& literal = *Cast<HloConstantInstruction>(instr)->mutable_literal();
    // Constants that should not be baked into the IR get a zero
    // initializer here; GpuExecutable copies their content to the device
    // at load time instead (see ResolveConstantGlobals).
    const bool should_emit_initializer = ShouldEmitLiteralInLlvmIr(literal);
    llvm::ArrayType* global_type =
        llvm::ArrayType::get(b_.getInt8Ty(), literal.size_bytes());
    llvm::Constant* initializer =
        should_emit_initializer
            ? llvm_ir::ConvertLiteralToIrConstant(literal, module_)
            : llvm::ConstantAggregateZero::get(global_type);
    if (should_emit_initializer) {
      VLOG(3) << "Emitted initializer for constant with shape "
              << ShapeUtil::HumanString(literal.shape());
    }
    // These globals will be looked up by name by GpuExecutable so we need to
    // give them an external linkage. Not all of their uses are visible in
    // the LLVM IR (e.g. TupleThunk) so we can't give them a linkage that
    // merely preserves their names (like available_externally), we also need
    // to ensure that they stick around even if they're "unused".
    //
    // We may have to be more clever here in the future if we notice that
    // we're keeping around too many globals because of their linkage.
    unsigned global_address_space = llvm_ir::GetGlobalMemoryAddressSpace(
        *ir_emitter_context_->llvm_module());
    std::string global_name = llvm_ir::ConstantHloToGlobalName(*instr);
    llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable(
        global_type, /*isConstant=*/should_emit_initializer,
        llvm::GlobalValue::ExternalLinkage,
        /*Initializer=*/initializer, global_name,
        /*TLMode=*/llvm::GlobalValue::NotThreadLocal,
        /*AddressSpace=*/global_address_space,
        /*isExternallyInitialized=*/false);
    global_for_const->setAlignment(llvm::Align(kConstantBufferAlignBytes));
    // The module takes ownership of the global.
    ir_emitter_context_->llvm_module()->getGlobalList().push_back(
        global_for_const);

    // Record the constant so GpuExecutable can resolve it at load time.
    GpuExecutable::ConstantInfo info;
    info.symbol_name = global_name;
    info.content = literal.Clone();
    if (lookup_indices) {
      auto maybe_slice =
          ir_emitter_context_->buffer_assignment().GetUniqueSlice(instr, {});
      if (maybe_slice.ok()) {
        info.allocation_index = maybe_slice.ValueOrDie().index();
      }
    }
    ir_emitter_context_->constants().push_back(std::move(info));
  }
  return Status::OK();
}
Status IrEmitter::HandleConstant(HloInstruction* constant) {
return Status::OK();
}
@ -175,10 +235,12 @@ Status IrEmitter::EmitCallToNestedComputation(
llvm::Function*& emitted_function =
computation_to_ir_function_[&nested_computation];
if (emitted_function == nullptr) {
IrEmitterNested ir_emitter_nested(hlo_module_config_, nested_computation,
ir_emitter_context_);
TF_RETURN_IF_ERROR(ir_emitter_nested.CodegenNestedComputation());
emitted_function = ir_emitter_nested.GetEmittedFunction();
TF_ASSIGN_OR_RETURN(
auto ir_emitter_nested,
IrEmitterNested::Create(hlo_module_config_, nested_computation,
ir_emitter_context_));
TF_RETURN_IF_ERROR(ir_emitter_nested->CodegenNestedComputation());
emitted_function = ir_emitter_nested->GetEmittedFunction();
}
// Operands are in default address space for non-AMDGPU target.

View File

@ -105,6 +105,12 @@ class IrEmitter : public DfsHloVisitorWithDefault,
llvm::IRBuilder<>* builder() { return &b_; }
// Emits constants to the generated LLVM IR, and also populates related
// information in ir_emitter_context for large-constant initializations. If
// `lookup_indices` is true, the allocation index associated with the constant
// is also populated.
Status EmitConstants(const HloComputation& computation, bool lookup_indices);
protected:
// Constructs an IrEmitter with the given IrEmitter context.
// ir_emitter_context is owned by the caller and should outlive the IrEmitter

View File

@ -22,12 +22,14 @@ limitations under the License.
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
#include "tensorflow/compiler/xla/service/gpu/launch_dimensions.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/name_uniquer.h"
namespace xla {
namespace gpu {
// IrEmitterContext encapsulates common (mutable and immutable) data structures
// used by both IrEmitterNested and IrEmitterUnnested, such as the buffer
// assignment and the name uniquer.
@ -71,6 +73,8 @@ class IrEmitterContext {
llvm::Module* llvm_module() { return llvm_module_; }
NameUniquer* name_uniquer() { return &name_uniquer_; }
std::vector<GpuExecutable::ConstantInfo>& constants() { return constants_; }
private:
const HloModule* hlo_module_;
const BufferAssignment* buffer_assignment_;
@ -81,6 +85,7 @@ class IrEmitterContext {
mlir::MLIRContext* mlir_context_;
llvm::Module* llvm_module_;
NameUniquer name_uniquer_;
std::vector<GpuExecutable::ConstantInfo> constants_;
};
} // namespace gpu

View File

@ -41,6 +41,16 @@ IrEmitterNested::IrEmitterNested(const HloModuleConfig& hlo_module_config,
: IrEmitter(hlo_module_config, ir_emitter_context, /*is_nested=*/true),
nested_computation_(nested_computation) {}
// Factory for IrEmitterNested. Builds the emitter and eagerly emits the
// constants of `nested_computation` into the module.
StatusOr<std::unique_ptr<IrEmitterNested>> IrEmitterNested::Create(
    const HloModuleConfig& hlo_module_config,
    const HloComputation& nested_computation,
    IrEmitterContext* ir_emitter_context) {
  // The constructor is private, so absl::make_unique cannot be used here.
  auto emitter = std::unique_ptr<IrEmitterNested>(new IrEmitterNested(
      hlo_module_config, nested_computation, ir_emitter_context));
  // Constants of nested computations are not looked up by allocation index.
  TF_RETURN_IF_ERROR(
      emitter->EmitConstants(nested_computation, /*lookup_indices=*/false));
  return emitter;
}
// Nested function serves the same purpose on GPU as a thread-local function on
// a CPU.
Status IrEmitterNested::CodegenNestedComputation() {

View File

@ -39,12 +39,11 @@ namespace gpu {
//
class IrEmitterNested : public IrEmitter {
public:
// Constructs an LLVM IR emitter for a nested HLO computation. `function` is
// the containing IR function this emitter produces IR to. See
// IrEmitter::IrEmitter for the meanings of other arguments.
IrEmitterNested(const HloModuleConfig& hlo_module_config,
const HloComputation& nested_computation,
IrEmitterContext* ir_emitter_context);
static StatusOr<std::unique_ptr<IrEmitterNested>> Create(
const HloModuleConfig& hlo_module_config,
const HloComputation& nested_computation,
IrEmitterContext* ir_emitter_context);
IrEmitterNested(const IrEmitterNested&) = delete;
IrEmitterNested& operator=(const IrEmitterNested&) = delete;
@ -62,6 +61,13 @@ class IrEmitterNested : public IrEmitter {
Status CodegenNestedComputation();
private:
// Constructs an LLVM IR emitter for a nested HLO computation. See
// IrEmitter::IrEmitter for the meanings of the arguments.
IrEmitterNested(const HloModuleConfig& hlo_module_config,
const HloComputation& nested_computation,
IrEmitterContext* ir_emitter_context);
const HloComputation& nested_computation_;
llvm::Function* emitted_function_;
};

View File

@ -228,6 +228,7 @@ StatusOr<std::unique_ptr<IrEmitterUnnested>> IrEmitterUnnested::Create(
auto emitter = std::unique_ptr<IrEmitterUnnested>(new IrEmitterUnnested(
hlo_module_config, hlo_computation, ir_emitter_context));
TF_RETURN_IF_ERROR(emitter->lhlo_scratch_emitter_.Initialize());
TF_RETURN_IF_ERROR(emitter->EmitConstants(*hlo_computation, true));
return std::move(emitter);
}
@ -3992,52 +3993,6 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions(
return Status::OK();
}
// Emits one LLVM global per constant BufferAllocation in the module's
// buffer assignment. GpuExecutable later resolves each global by its
// allocation-derived name (see ConstantBufferAllocationToGlobalName).
Status IrEmitterUnnested::EmitConstantGlobals() {
  for (const BufferAllocation& allocation :
       ir_emitter_context_->buffer_assignment().Allocations()) {
    if (!allocation.is_constant()) {
      continue;
    }

    const Literal& literal = llvm_ir::LiteralForConstantAllocation(allocation);
    // Constants that should not be baked into the IR get a zero
    // initializer; their content is copied to the device at load time
    // instead.
    const bool should_emit_initializer = ShouldEmitLiteralInLlvmIr(literal);
    llvm::ArrayType* global_type =
        llvm::ArrayType::get(b_.getInt8Ty(), allocation.size());
    llvm::Constant* initializer =
        should_emit_initializer
            ? llvm_ir::ConvertLiteralToIrConstant(literal, module_)
            : llvm::ConstantAggregateZero::get(global_type);
    if (should_emit_initializer) {
      VLOG(3) << "Emitted initializer for constant with shape "
              << ShapeUtil::HumanString(literal.shape());
    }

    // These globals will be looked up by name by GpuExecutable so we need to
    // give them an external linkage. Not all of their uses are visible in
    // the LLVM IR (e.g. TupleThunk) so we can't give them a linkage that
    // merely preserves their names (like available_externally), we also need
    // to ensure that they stick around even if they're "unused".
    //
    // We may have to be more clever here in the future if we notice that
    // we're keeping around too many globals because of their linkage.
    unsigned global_address_space = llvm_ir::GetGlobalMemoryAddressSpace(
        *ir_emitter_context_->llvm_module());
    llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable(
        global_type, /*isConstant=*/should_emit_initializer,
        llvm::GlobalValue::ExternalLinkage,
        /*Initializer=*/initializer,
        llvm_ir::ConstantBufferAllocationToGlobalName(allocation),
        /*TLMode=*/llvm::GlobalValue::NotThreadLocal,
        /*AddressSpace=*/global_address_space,
        /*isExternallyInitialized=*/false);
    global_for_const->setAlignment(llvm::Align(kConstantBufferAlignBytes));
    // The module takes ownership of the global.
    ir_emitter_context_->llvm_module()->getGlobalList().push_back(
        global_for_const);
  }

  return Status::OK();
}
// Emits code for slices based on the below structure. An if statement with
// a guarding condition is generated for each ROOT slice.
//

View File

@ -180,9 +180,6 @@ class IrEmitterUnnested : public IrEmitter,
const HloInstruction& hlo, const llvm_ir::ElementGenerator& body_emitter,
KernelThunk* thunk, int unroll_factor);
// Emits LLVM global variables corresponding to constant instructions.
Status EmitConstantGlobals();
Status Postprocess(HloInstruction* hlo) override;
private:

View File

@ -580,7 +580,7 @@ StatusOr<std::unique_ptr<Executable>> MlirCompilerImpl::RunBackend(
return {absl::make_unique<GpuExecutable>(
ptx, cubin, GetGpuVersion(stream_exec), std::move(thunk_schedule),
emission_context.releaseHloModule(), std::move(buffer_assignment),
nullptr, nullptr)};
nullptr, nullptr, std::vector<GpuExecutable::ConstantInfo>())};
}
StatusOr<std::vector<std::unique_ptr<Executable>>> MlirCompilerImpl::Compile(