[XLA/GPU] Simplify reduction implementation.

Specifically, before this change:
* output_instructions are created by callers,
* output_instructions are passed around,
* output_instructions are used to reverse-lookup ShapeIndex to find IrArrays.

After this change:
* there is no output_instructions parameter; it is derived by the callee.
* instr_index_group gets passed around instead.
* output_instructions are derived from unnested_hlo and index.
* ShapeIndex is derived from index.

This also removes some usages of the type "HloInstruction", making later transitions to MLIR easier.
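
To make the new shape of the code concrete, here is a toy, self-contained C++ sketch of the pattern this change adopts (illustrative only, not XLA code; Instr, GetOutput, and the other names below are made up): callers pass integer indices such as instr_index_group, and the callee resolves each output from the unnested instruction itself, much like the GetReduceFromUnnested helper added in this commit.

#include <cassert>
#include <cstdio>
#include <vector>

// Stand-in for an HLO instruction; the real code uses HloInstruction.
struct Instr {
  const char* name;
  std::vector<Instr*> outputs;  // one entry per output for a multi-output root
  bool is_multi_output = false;
};

// Resolves the index-th output from the unnested instruction itself, mirroring
// the role of GetReduceFromUnnested: no output_instructions list is passed in.
Instr* GetOutput(Instr* unnested, int index) {
  if (!unnested->is_multi_output) {
    assert(index == 0);
    return unnested;
  }
  return unnested->outputs[index];
}

int main() {
  Instr r0{"reduce.0", {}}, r1{"reduce.1", {}};
  Instr root{"tuple", {&r0, &r1}, /*is_multi_output=*/true};
  // Callers now hand around index groups instead of instruction lists.
  std::vector<int> instr_index_group = {0, 1};
  for (int index : instr_index_group) {
    std::printf("output %d -> %s\n", index, GetOutput(&root, index)->name);
  }
  return 0;
}

The diff below applies the same idea to EmitReductionFromOrToContiguousDimensions and its helpers, which now take only unnested_hlo plus spans of indices.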

PiperOrigin-RevId: 344114759
Change-Id: I2fe7642e44d1c639453faa32fe5f128167ee6291
Authored by Tim Shen on 2020-11-24 12:52:15 -08:00; committed by TensorFlower Gardener
parent 22e335d781
commit faa6548fba
5 changed files with 132 additions and 124 deletions


@@ -496,32 +496,23 @@ llvm::Value* IsBlock0Thread0(llvm::IRBuilder<>* b) {
return b->CreateAnd(is_thread0, is_block0);
}
bool AreFusedReductionOutputsConsistent(
absl::Span<const HloInstruction* const> output_instructions,
const HloInstruction* first_reduce) {
for (const HloInstruction* inst : output_instructions) {
if (IsReductionFromOrToContiguousDimensions(*inst)) {
// Shapes, layouts and dimensions must be the same for all reduces
// inside of this fusion.
// TODO(tjoerg): Relax the shape constraint. The datatype does not matter.
if (!(ShapeUtil::Equal(first_reduce->shape(), inst->shape()) &&
ShapeUtil::Equal(first_reduce->operand(0)->shape(),
inst->operand(0)->shape()) &&
ShapeUtil::Equal(first_reduce->operand(1)->shape(),
inst->operand(1)->shape()) &&
first_reduce->dimensions() == inst->dimensions())) {
return false;
}
} else {
if (!(ShapeUtil::CompatibleIgnoringElementType(
first_reduce->operand(0)->shape(), inst->shape()) &&
LayoutUtil::Equal(first_reduce->operand(0)->shape().layout(),
inst->shape().layout()))) {
return false;
}
}
bool IsFusedReductionOutputConsistent(const HloInstruction* inst,
const HloInstruction* first_reduce) {
if (IsReductionFromOrToContiguousDimensions(*inst)) {
// Shapes, layouts and dimensions must be the same for all reduces
// inside of this fusion.
// TODO(tjoerg): Relax the shape constraint. The datatype does not matter.
return ShapeUtil::Equal(first_reduce->shape(), inst->shape()) &&
ShapeUtil::Equal(first_reduce->operand(0)->shape(),
inst->operand(0)->shape()) &&
ShapeUtil::Equal(first_reduce->operand(1)->shape(),
inst->operand(1)->shape()) &&
first_reduce->dimensions() == inst->dimensions();
}
return true;
return ShapeUtil::CompatibleIgnoringElementType(
first_reduce->operand(0)->shape(), inst->shape()) &&
LayoutUtil::Equal(first_reduce->operand(0)->shape().layout(),
inst->shape().layout());
}
} // namespace gpu


@@ -217,10 +217,18 @@ llvm::Value* EmitFullWarpShuffleDown(llvm::Value* value, llvm::Value* offset,
// block 0 of the kernel.
llvm::Value* IsBlock0Thread0(llvm::IRBuilder<>* b);
// Returns whether the outputs of a fusion with reduction are consistent.
bool AreFusedReductionOutputsConsistent(
// Returns whether the output of a fusion with reduction is consistent with
// `first_reduce`.
bool IsFusedReductionOutputConsistent(const HloInstruction* inst,
const HloInstruction* first_reduce);
inline bool AreFusedReductionOutputsConsistent(
absl::Span<const HloInstruction* const> output_instructions,
const HloInstruction* first_reduce);
const HloInstruction* first_reduce) {
return absl::c_all_of(output_instructions, [=](const HloInstruction* inst) {
return IsFusedReductionOutputConsistent(inst, first_reduce);
});
}
} // namespace gpu
} // namespace xla


@@ -1247,8 +1247,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
}
CHECK_GE(root->operand_count(), 1);
return EmitReductionFromOrToContiguousDimensions(fusion,
root->operands());
return EmitReductionFromOrToContiguousDimensions(fusion);
}
case HloOpcode::kReduce: {
// HandleFusion specializes reduction from a multi-dimensional array to
@@ -1259,7 +1258,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
return Unimplemented(
"Vectorized variadic reduce is not supported on GPU");
}
return EmitReductionFromOrToContiguousDimensions(fusion, {root});
return EmitReductionFromOrToContiguousDimensions(fusion);
}
case HloOpcode::kSlice: {
return EmitInputFusibleNonStridedSlices(fusion);
@@ -1385,7 +1384,7 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
if (IsReductionFromOrToContiguousDimensions(*reduce) &&
reduce->shape().IsArray()) {
return EmitReductionFromOrToContiguousDimensions(reduce, {reduce});
return EmitReductionFromOrToContiguousDimensions(reduce);
}
return DefaultActionForMlir(input);
@@ -3673,7 +3672,7 @@ void IrEmitterUnnested::EmitEpilogueForReduction(
llvm::Type* index_ty, HloInstruction* unnested_hlo,
const ReductionCodegenInfo& reduction_info,
absl::Span<const HloInstruction* const> reduce_instructions,
absl::Span<const ShapeIndex> reduction_output_shape_indices,
absl::Span<const int> reduction_output_indices,
absl::Span<HloComputation* const> reducers,
const TilingKernelInfo& tiling_kernel_info) {
const KernelMappingScheme& mapping_scheme =
@@ -3728,8 +3727,13 @@ void IrEmitterUnnested::EmitEpilogueForReduction(
// At this point in the function we have a "partial sum" of input elements
// (stored in partial_result_addresses), and we need to accumulate it into
// the correct output element.
auto output_array = GetIrArray(*unnested_hlo, *unnested_hlo,
reduction_output_shape_indices[i]);
ShapeIndex index;
if (unnested_hlo->IsMultiOutputFusion()) {
index.push_back(reduction_output_indices[i]);
} else {
CHECK_EQ(0, reduction_output_indices[i]);
}
auto output_array = GetIrArray(*unnested_hlo, *unnested_hlo, index);
IrArray::Index element_index(
/*linear=*/untransposed_output_linear_address,
reduction_kept_element_shape, &b_);
@@ -3872,31 +3876,28 @@ void IrEmitterUnnested::EmitPrintfWithThreadId(
});
}
namespace {
// Obtains the corresponding index of the out_instr in the outputs of the
// `unnested_hlo`.
ShapeIndex CreateShapeIndexForOutputInstruction(
const HloInstruction& unnested_hlo, const HloInstruction& out_instr) {
if (!unnested_hlo.IsMultiOutputFusion()) {
return ShapeIndex({});
static HloInstruction* GetReduceFromUnnested(HloInstruction* unnested_hlo,
int index) {
if (unnested_hlo->opcode() == HloOpcode::kReduce) {
CHECK_EQ(0, index);
return unnested_hlo;
}
const auto& all_outputs = unnested_hlo.fused_expression_root()->operands();
for (size_t i = 0; i < all_outputs.size(); ++i) {
if (all_outputs[i] == &out_instr) {
return ShapeIndex({static_cast<int64>(i)});
if (unnested_hlo->opcode() == HloOpcode::kFusion) {
auto root = unnested_hlo->fused_expression_root();
if (root->opcode() == HloOpcode::kReduce) {
CHECK_EQ(0, index);
return root;
}
if (root->opcode() == HloOpcode::kTuple) {
return root->mutable_operand(index);
}
}
LOG(FATAL) << " Fusion root does not contain output instruction; "
<< " fusion: " << unnested_hlo.ToString()
<< ", output instruction: " << out_instr.ToString();
return nullptr;
}
} // namespace
void IrEmitterUnnested::EmitTileElementForReduction(
HloInstruction* unnested_hlo, const Shape& reduction_operand_shape,
absl::Span<HloInstruction* const> output_instructions,
absl::Span<const int> instr_index_group,
const llvm_ir::IrArray::Index& index,
const ReductionCodegenInfo& reduction_info,
absl::Span<HloComputation* const> reducers, int64 x_iter_num) {
@@ -3915,10 +3916,12 @@ void IrEmitterUnnested::EmitTileElementForReduction(
if (unnested_hlo->opcode() == HloOpcode::kFusion) {
BindFusionArguments(unnested_hlo, &fused_emitter);
for (int i = 0, e = output_instructions.size(); i != e; ++i) {
const HloInstruction* inst = output_instructions[i];
ShapeIndex idx =
CreateShapeIndexForOutputInstruction(*unnested_hlo, *inst);
for (int index : instr_index_group) {
const HloInstruction* inst = GetReduceFromUnnested(unnested_hlo, index);
ShapeIndex idx;
if (unnested_hlo->IsMultiOutputFusion()) {
idx.push_back(index);
}
if (IsReductionFromOrToContiguousDimensions(*inst)) {
input_gens.push_back(*fused_emitter.GetGenerator(inst->operand(0)));
} else {
@@ -4689,22 +4692,20 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo(
}
void IrEmitterUnnested::EmitIRForReduction(
HloInstruction* unnested_hlo,
absl::Span<HloInstruction* const> output_instructions,
HloInstruction* unnested_hlo, absl::Span<const int> instr_index_group,
ReductionCodegenInfo* reduction_info, const Shape& input_shape) {
std::vector<HloInstruction*> reduce_instructions;
InlinedVector<ShapeIndex, 1> reduction_output_shape_indices;
InlinedVector<int, 1> reduction_output_indices;
InlinedVector<HloComputation*, 1> reducers;
for (size_t i = 0; i < output_instructions.size(); ++i) {
if (!IsReductionFromOrToContiguousDimensions(*output_instructions[i])) {
for (int index : instr_index_group) {
HloInstruction* output_instruction =
GetReduceFromUnnested(unnested_hlo, index);
if (!IsReductionFromOrToContiguousDimensions(*output_instruction)) {
continue;
}
HloInstruction* output_instruction = output_instructions[i];
reduce_instructions.push_back(output_instruction);
reduction_output_shape_indices.push_back(
CreateShapeIndexForOutputInstruction(*unnested_hlo,
*output_instruction));
reduction_output_indices.push_back(index);
reducers.push_back(output_instruction->to_apply());
}
CHECK(reduce_instructions.size() != 0)
@@ -4722,7 +4723,7 @@ void IrEmitterUnnested::EmitIRForReduction(
[&](const llvm_ir::IrArray::Index& index, llvm::Value* y_loc,
llvm::Value* x_loc, int64 x_iter_num) {
EmitTileElementForReduction(unnested_hlo, input_shape,
output_instructions, index, *reduction_info,
instr_index_group, index, *reduction_info,
reducers, x_iter_num);
};
@@ -4736,7 +4737,7 @@ void IrEmitterUnnested::EmitIRForReduction(
emit_reduction_tile);
});
EmitEpilogueForReduction(index_ty, unnested_hlo, *reduction_info,
reduce_instructions, reduction_output_shape_indices,
reduce_instructions, reduction_output_indices,
reducers, tiling_kernel_info);
}
@@ -4751,33 +4752,33 @@ bool IsBroadcastedConstantOrScalar(const HloInstruction& instr) {
ShapeUtil::IsScalar(instr.operand(0)->shape())));
}
// Divides output_instructions into groups. Different groups will be executed
// Divides `num_reduces` reduces into groups. Different groups will be executed
// in parallel. Generally speaking, we'd like to run the reduce instructions
// in parallel without incurring too much recomputation overhead. The current
// heuristic is to place reduce instructions that share nothing or only
// (broadcasted) scalars/constants into different groups; otherwise, they are
// placed in the same group. Non-reduce instructions always go with the reduce
// instructions into the same group so long as they share any predecessors.
std::vector<std::vector<HloInstruction*>> DivideOutputInstructionsIntoGroups(
HloInstruction* unnested_hlo,
absl::Span<HloInstruction* const> output_instructions) {
CHECK(!output_instructions.empty());
if (output_instructions.size() == 1) {
return {{output_instructions[0]}};
std::vector<std::vector<int>> DivideOutputInstructionsIntoGroups(
HloInstruction* unnested_hlo, int num_reduces) {
CHECK_NE(0, num_reduces);
if (num_reduces == 1) {
return {{0}};
}
std::vector<tensorflow::UnionFind<HloInstruction*>> disjoint_sets(
output_instructions.size());
for (size_t i = 0; i < output_instructions.size(); ++i) {
disjoint_sets[i].Get() = output_instructions[i];
num_reduces);
for (size_t i = 0; i < num_reduces; ++i) {
disjoint_sets[i].Get() = GetReduceFromUnnested(unnested_hlo, i);
}
std::unique_ptr<HloReachabilityMap> reachability_map =
HloReachabilityMap::Build(unnested_hlo->fused_instructions_computation());
for (auto* instr : unnested_hlo->fused_instructions()) {
std::vector<int64> reached_output_ids;
for (size_t oid = 0; oid < output_instructions.size(); ++oid) {
if (HloOpcode::kReduce == output_instructions[oid]->opcode() &&
for (size_t oid = 0; oid < num_reduces; ++oid) {
auto reduce = GetReduceFromUnnested(unnested_hlo, oid);
if (HloOpcode::kReduce == reduce->opcode() &&
(IsBroadcastedConstantOrScalar(*instr))) {
// Do not group output reduce instructions through broadcasted
// constants or scalars, as the recomputation should be acceptable.
@@ -4785,9 +4786,9 @@ std::vector<std::vector<HloInstruction*>> DivideOutputInstructionsIntoGroups(
continue;
}
// Now group output instructions if they have common predecessors.
if (reachability_map->IsReachable(instr, output_instructions[oid])) {
VLOG(3) << "Reaching " << output_instructions[oid]->ToString()
<< " from " << instr->ToString();
if (reachability_map->IsReachable(instr, reduce)) {
VLOG(3) << "Reaching " << reduce->ToString() << " from "
<< instr->ToString();
reached_output_ids.push_back(oid);
}
}
@@ -4797,12 +4798,12 @@ std::vector<std::vector<HloInstruction*>> DivideOutputInstructionsIntoGroups(
}
}
// Place output instructions in the same set into the same group.
absl::flat_hash_map<HloInstruction*, std::vector<HloInstruction*>> groups;
for (size_t oid = 0; oid < output_instructions.size(); ++oid) {
groups[disjoint_sets[oid].Get()].push_back(output_instructions.at(oid));
absl::flat_hash_map<HloInstruction*, std::vector<int>> groups;
for (size_t oid = 0; oid < num_reduces; ++oid) {
groups[disjoint_sets[oid].Get()].push_back(oid);
}
std::vector<std::vector<HloInstruction*>> ret;
std::vector<std::vector<int>> ret;
absl::c_for_each(
groups, [&](auto& iter) { ret.emplace_back(std::move(iter.second)); });
return ret;
@@ -4811,15 +4812,20 @@ std::vector<std::vector<HloInstruction*>> DivideOutputInstructionsIntoGroups(
} // namespace
Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions(
HloInstruction* unnested_hlo,
absl::Span<HloInstruction* const> output_instructions) {
bool returns_tuple = output_instructions.size() > 1;
HloInstruction* unnested_hlo) {
int num_reduces = 1;
if (unnested_hlo->IsMultiOutputFusion()) {
num_reduces = unnested_hlo->fused_expression_root()->operand_count();
}
bool returns_tuple = num_reduces > 1;
VLOG(10) << "Emitting reduction to vector " << unnested_hlo->ToString();
// Build an initializer thunk to initialize each reduction output.
std::vector<std::unique_ptr<Thunk>> thunks;
for (int i = 0; i < output_instructions.size(); ++i) {
if (!IsReductionFromOrToContiguousDimensions(*output_instructions[i])) {
for (int i = 0; i < num_reduces; ++i) {
if (!IsReductionFromOrToContiguousDimensions(
*GetReduceFromUnnested(unnested_hlo, i))) {
continue;
}
@@ -4831,17 +4837,20 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions(
// Build a kernel thunk to compute all the outputs.
const HloInstruction* first_reduce = nullptr;
for (int i = 0; i < output_instructions.size(); ++i) {
if (IsReductionFromOrToContiguousDimensions(*output_instructions[i])) {
first_reduce = output_instructions[i];
for (int i = 0; i < num_reduces; ++i) {
if (IsReductionFromOrToContiguousDimensions(
*GetReduceFromUnnested(unnested_hlo, i))) {
first_reduce = GetReduceFromUnnested(unnested_hlo, i);
break;
}
}
CHECK(first_reduce);
if (output_instructions.size() > 1) {
if (!AreFusedReductionOutputsConsistent(output_instructions,
first_reduce)) {
return InternalError("Inconsistent reduction fusion outputs");
if (num_reduces > 1) {
for (int i = 1; i < num_reduces; i++) {
if (!IsFusedReductionOutputConsistent(
GetReduceFromUnnested(unnested_hlo, i), first_reduce)) {
return InternalError("Inconsistent reduction fusion outputs");
}
}
}
const Shape& input_shape = first_reduce->operand(0)->shape();
@@ -4852,24 +4861,24 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions(
<< first_reduce->ToString();
// Group output instructions. Each group will be executed in parallel.
std::vector<std::vector<HloInstruction*>> instr_groups =
DivideOutputInstructionsIntoGroups(unnested_hlo, output_instructions);
VLOG(2) << StrCat("Generate in ", instr_groups.size(), " groups for ",
std::vector<std::vector<int>> instr_index_groups =
DivideOutputInstructionsIntoGroups(unnested_hlo, num_reduces);
VLOG(2) << StrCat("Generate in ", instr_index_groups.size(), " groups for ",
unnested_hlo->ToString());
std::unique_ptr<KernelThunk> kernel_thunk =
BuildKernelThunk(unnested_hlo, /*implements_whole_instruction=*/false);
KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll);
for (size_t i = 0; i < instr_groups.size(); ++i) {
for (size_t i = 0; i < instr_index_groups.size(); ++i) {
// Create a new ReductionCodegenInfo instance as it contains states for
// code generation per reduction group. For now, let's always use the very
// first reduce as representative to construct ReductionCodegenInfo, since
// all the reductions are required to have the same shape and layout as
// verified by `AreFusedReductionOutputsConsistent()`. We can loosen the
// verified by `IsFusedReductionOutputConsistent()`. We can loosen the
// constraint later when the needs arise.
ReductionCodegenInfo reduction_info =
ComputeReductionCodegenInfo(unnested_hlo, first_reduce);
auto emit_reduction_func = [&] {
EmitIRForReduction(unnested_hlo, instr_groups[i], &reduction_info,
EmitIRForReduction(unnested_hlo, instr_index_groups[i], &reduction_info,
input_shape);
};
// Use raw block_id_y to select the i-th parallel reduction to run. Using
@@ -4878,7 +4887,7 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions(
// the indices used within the reductions.
llvm::CallInst* raw_block_id_y = gpu::EmitCallToTargetIntrinsic(
gpu::TargetIntrinsicID::kBlockIdy, {}, {}, &b_);
llvm_ir::AddRangeMetadata(0, instr_groups.size(),
llvm_ir::AddRangeMetadata(0, instr_index_groups.size(),
llvm::cast<llvm::Instruction>(raw_block_id_y));
llvm::Value* guarding_cond =
b_.CreateICmpEQ(raw_block_id_y, b_.getInt32(i));
@@ -4888,11 +4897,11 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions(
ComputeReductionCodegenInfo(unnested_hlo, first_reduce);
const KernelMappingScheme& mapping_scheme =
reduction_info.GetKernelMappingScheme();
// block_y_count is set to instr_groups.size(), so that each reduction group
// can be run in parallel by a different BlockIdy.
// block_y_count is set to instr_index_groups.size(), so that each reduction
// group can be run in parallel by a different BlockIdy.
LaunchDimensions launch_dimensions(
{/*x=*/mapping_scheme.GetNumberOfBlocks(),
/*y=*/static_cast<int64>(instr_groups.size()),
/*y=*/static_cast<int64>(instr_index_groups.size()),
/*z=*/1},
{/*x=*/mapping_scheme.GetThreadsPerBlock(), /*y=*/1, /*z=*/1});
VLOG(3) << "Launch dimensions of " << unnested_hlo->name()


@@ -405,13 +405,8 @@ class IrEmitterUnnested : public IrEmitter,
// complicating the index calculation in the code generation of the reduce
// instructions. In other words, a block_id_y is assigned to a group and so
// different groups can be run in parallel.
//
// output_instructions: Output instructions in the computation: instruction
// itself if it's not a fusion, fusion root if fusion is not multi-output, and
// elements of the fusion multi-output tuple otherwise.
Status EmitReductionFromOrToContiguousDimensions(
HloInstruction* unnested_hlo,
absl::Span<HloInstruction* const> output_instructions);
HloInstruction* unnested_hlo);
// Computes the KernelMappingScheme for the reduce HLO and indicates whether
// the reduction is a row reduction. For an un-fused reduce op, unnested_hlo
@@ -555,12 +550,17 @@ class IrEmitterUnnested : public IrEmitter,
//
// Calculates and stores the temporary reduction value in the corresponding
// alloca.
void EmitTileElementForReduction(
HloInstruction* unnested_hlo, const Shape& reduction_operand_shape,
absl::Span<HloInstruction* const> output_instructions,
const llvm_ir::IrArray::Index& index,
const ReductionCodegenInfo& reduction_info,
absl::Span<HloComputation* const> reducers, int64 x_iter_num);
//
// `instr_index_group` indicates the set of reductions this call needs to emit;
// each index i refers to the i-th output of unnested_hlo. Note that if
// unnested_hlo is not a multi-output fusion, instr_index_group is always {0}.
void EmitTileElementForReduction(HloInstruction* unnested_hlo,
const Shape& reduction_operand_shape,
absl::Span<const int> instr_index_group,
const llvm_ir::IrArray::Index& index,
const ReductionCodegenInfo& reduction_info,
absl::Span<HloComputation* const> reducers,
int64 x_iter_num);
// Prepares for the code generation for a tile block of a reduction kernel.
//
@@ -577,13 +577,13 @@ class IrEmitterUnnested : public IrEmitter,
llvm::Type* index_ty, HloInstruction* unnested_hlo,
const ReductionCodegenInfo& reduction_info,
absl::Span<const HloInstruction* const> reduce_instructions,
absl::Span<const ShapeIndex> reduction_output_shape_indices,
absl::Span<const int> reduction_output_indices,
absl::Span<HloComputation* const> reducers,
const TilingKernelInfo& tiling_kernel_info);
// Emits code for reductions in the output_instructions.
// Emits code for reductions in the instr_index_group.
void EmitIRForReduction(HloInstruction* unnested_hlo,
absl::Span<HloInstruction* const> output_instructions,
absl::Span<const int> instr_index_group,
ReductionCodegenInfo* reduction_info,
const Shape& input_shape);