Merge pull request #43964 from trentlo:horizontal_input_fusion_again

PiperOrigin-RevId: 337261311
Change-Id: I15498bba7ba9b77a2abf7001c3fe519408ee975c
commit bc8f385f4e
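Summary: this change adds GpuHorizontalInputFusion, a GPU pass that
horizontally fuses sibling kInput (reduction) fusions into a single
multi-output fusion, cutting kernel launch overhead and raising parallelism.
As an illustration (HLO adapted from the BasicTest case added later in this
diff), the pass takes two independent kInput fusions,

    fusion.1 = f16[] fusion(arg.1), kind=kInput, calls=fused_computation.1
    fusion.2 = f16[] fusion(arg.2), kind=kInput, calls=fused_computation.2
    ROOT tuple.1 = (f16[], f16[]) tuple(fusion.1, fusion.2)

and merges them into one multi-output kInput fusion whose root is a
tuple(reduce, reduce), with get-tuple-elements feeding the original users.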
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -1187,6 +1187,7 @@ cc_library(
         ":gpu_layout_assignment",
         ":gpu_sanitize_constant_names",
         ":gpu_scatter_expander",
+        ":horizontal_input_fusion",
         ":horizontal_loop_fusion",
         ":instruction_fusion",
         ":ir_emission_utils",
@@ -1770,6 +1771,7 @@ cc_library(
     srcs = ["horizontal_loop_fusion.cc"],
     hdrs = ["horizontal_loop_fusion.h"],
     deps = [
+        ":gpu_fusible",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_creation_utils",
@@ -1806,6 +1808,45 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "horizontal_input_fusion",
+    srcs = ["horizontal_input_fusion.cc"],
+    hdrs = ["horizontal_input_fusion.h"],
+    deps = [
+        ":gpu_fusible",
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_creation_utils",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "horizontal_input_fusion_test",
+    srcs = ["horizontal_input_fusion_test.cc"],
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":horizontal_input_fusion",
+        ":multi_output_fusion",
+        "//tensorflow/compiler/jit:xla_gpu_jit",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/service:hlo_pass_pipeline",
+        "//tensorflow/compiler/xla/service/gpu/tests:gpu_codegen_test",
+        "//tensorflow/compiler/xla/tests:filecheck",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
 cc_library(
     name = "reduction_degenerate_dim_remover",
     srcs = ["reduction_degenerate_dim_remover.cc"],
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -59,6 +59,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.h"
+#include "tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h"
 #include "tensorflow/compiler/xla/service/gpu/horizontal_loop_fusion.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
@@ -306,6 +307,7 @@ Status GpuCompiler::OptimizeHloModule(
 
   HloPassPipeline horizontal_fusion("horizontal_fusion");
   horizontal_fusion.AddPass<GpuHorizontalLoopFusion>();
+  horizontal_fusion.AddPass<GpuHorizontalInputFusion>();
   horizontal_fusion.AddPass<HloCSE>(/*is_layout_sensitive=*/true,
                                     /*only_fusion_computations=*/true);
   horizontal_fusion.AddPass<HloDCE>();
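The new pass runs immediately after GpuHorizontalLoopFusion, and the existing
HloCSE/HloDCE passes clean up behind both. A minimal sketch of driving the
pass in isolation, in the test-style harness used by the tests added later in
this diff (`hlo_text` is a placeholder for a module string):

    auto module = ParseAndReturnVerifiedModule(hlo_text).ValueOrDie();
    GpuHorizontalInputFusion pass;
    bool changed = pass.Run(module.get()).ValueOrDie();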
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
@@ -143,29 +143,27 @@ bool IsInputFusibleReduction(const HloInstruction& instr) {
          IsReductionFromOrToContiguousDimensions(instr);
 }
 
+const HloInstruction* GetRealHeroForMultiOutputFusion(
+    const HloInstruction& instr) {
+  if (instr.opcode() != HloOpcode::kFusion) {
+    return &instr;
+  }
+  auto fused_expression_root = instr.fused_expression_root();
+  if (!instr.IsMultiOutputFusion()) {
+    return fused_expression_root;
+  }
+  // If possible, we want to pick a reduction-from-or-to-contiguous-dims
+  // operand of the fusion root, because it has the most constraints.
+  for (const auto* inst : fused_expression_root->operands()) {
+    if (IsReductionFromOrToContiguousDimensions(*inst)) {
+      return inst;
+    }
+  }
+  return fused_expression_root->operands()[0];
+}
+
 bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1,
                                           const HloInstruction& instr2) {
-  // Returns the instructions that determines the emitter used for lowering,
-  // sometimes referred to as "the real hero".
-  auto get_real_hero =
-      [&](const HloInstruction* instr) -> const HloInstruction* {
-    if (instr->opcode() != HloOpcode::kFusion) {
-      return instr;
-    }
-    auto fused_expression_root = instr->fused_expression_root();
-    if (!instr->IsMultiOutputFusion()) {
-      return fused_expression_root;
-    }
-    // If possible, we want to pick a reduction-to-vector operand of the
-    // fusion root, because it has the most constraints.
-    for (const auto* inst : fused_expression_root->operands()) {
-      if (IsReductionFromOrToContiguousDimensions(*inst)) {
-        return inst;
-      }
-    }
-    return fused_expression_root->operands()[0];
-  };
-
   // Multi-output fusion kernels share a common parallel loop. The loop
   // dimensions are determined by instruction shapes.
   auto get_loop_shape = [&](const HloInstruction* element_instr) {
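To make "the real hero" concrete (HLO borrowed from the MultiOutputFusionTest
added later in this diff): given a multi-output fusion computation

    fused_computation.1 {
      arg.1 = f16[1024]{0} parameter(0)
      constant0 = f16[] constant(0)
      reduce.1 = f16[] reduce(arg.1, constant0), dimensions={0}, to_apply=%add_f16
      add.0 = f16[1024] add(arg.1, arg.1)
      ROOT tuple.1 = (f16[], f16[1024]) tuple(reduce.1, add.0)
    }

GetRealHeroForMultiOutputFusion returns reduce.1 rather than add.0, since the
reduction is what constrains the choice of emitter.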
@@ -181,8 +179,8 @@ bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1,
   // root ops should have equal output shapes. An exception are
   // reduction-to-vector ops. Here the input shapes of the reduction (first
   // operand shape) and the reduction dimensions need to match.
-  auto* instr_1 = get_real_hero(&instr1);
-  auto* instr_2 = get_real_hero(&instr2);
+  auto* instr_1 = GetRealHeroForMultiOutputFusion(instr1);
+  auto* instr_2 = GetRealHeroForMultiOutputFusion(instr2);
   if (IsReductionFromOrToContiguousDimensions(*instr_1) &&
       IsReductionFromOrToContiguousDimensions(*instr_2) &&
       !AreFusedReductionOutputsConsistent({instr_1, instr_2}, instr_1)) {
@@ -524,5 +522,24 @@ HloInstruction::FusionKind ChooseFusionKind(const HloInstruction& /*producer*/,
                                             : HloInstruction::FusionKind::kLoop;
 }
 
+bool IsConsumerTheOnlyNonRootUser(const HloInstruction& instr,
+                                  const HloInstruction& consumer) {
+  return absl::c_all_of(instr.users(), [&](const HloInstruction* user) {
+    if (user->opcode() == HloOpcode::kGetTupleElement) {
+      // Skip GTE.
+      return IsConsumerTheOnlyNonRootUser(*user, consumer);
+    }
+    if (user == &consumer) {
+      // `user` is `consumer`.
+      return true;
+    }
+    if (user == user->parent()->root_instruction()) {
+      // Consumed by ROOT.
+      return true;
+    }
+    return false;
+  });
+}
+
 }  // namespace gpu
 }  // namespace xla
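For intuition, again using the MultiOutputFusionTest pattern added later in
this diff: fusion.1 has two users, gte.1 (feeding fusion.3) and gte.3
(feeding the ROOT tuple). IsConsumerTheOnlyNonRootUser(fusion.1, fusion.3)
returns true because GTEs are looked through, fusion.3 is the consumer, and
uses by ROOT are always safe since no cycle can be created through ROOT.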
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
@@ -71,6 +71,11 @@ bool FusionWouldBeTooLarge(const HloInstruction& instr1,
 bool CreatesNestedLoop(const HloInstruction& producer,
                        const HloInstruction& consumer);
 
+// Returns the instruction that determines the emitter used for lowering,
+// sometimes referred to as "the real hero".
+const HloInstruction* GetRealHeroForMultiOutputFusion(
+    const HloInstruction& instr);
+
 // Whether instruction shapes are compatible for multi-output fusion, i.e.
 // whether the emitters support lowering the resulting fusion.
 // This function works for both, sibling and producer-consumer multi-output
@@ -100,6 +105,10 @@ bool IsFusibleAsMultiOutputFusionRoot(const HloInstruction& instr);
 HloInstruction::FusionKind ChooseFusionKind(const HloInstruction& producer,
                                             const HloInstruction& consumer);
 
+// Returns whether `consumer` is the only non-root user of `instr`.
+bool IsConsumerTheOnlyNonRootUser(const HloInstruction& instr,
+                                  const HloInstruction& consumer);
+
 }  // namespace gpu
 }  // namespace xla
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.cc
@@ -0,0 +1,167 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h"
+
+#include <algorithm>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_cat.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
+#include "tensorflow/core/platform/errors.h"
+
+namespace xla {
+namespace gpu {
+
+namespace {
+
+// Gets the representative input shape of the multi-output fusion.
+Shape GetInputShapeForMultiOutputFusion(const HloInstruction& instr) {
+  // Get the HLO that determines the emitter used for lowering.
+  const HloInstruction* real_hero = GetRealHeroForMultiOutputFusion(instr);
+  if (real_hero->operands().empty()) {
+    // Simply return an empty shape if the representative node has no input
+    // operands.
+    return Shape();
+  } else {
+    return real_hero->operand(0)->shape();
+  }
+}
+
+class HorizontalInputFusionImpl {
+ public:
+  explicit HorizontalInputFusionImpl(HloComputation* computation)
+      : computation_(computation) {}
+
+  ~HorizontalInputFusionImpl() {}
+
+  StatusOr<bool> Run();
+
+ private:
+  HloComputation* computation_;
+};  // HorizontalInputFusionImpl
+
+// Compares one-by-one the dimensions of `shape_a` and `shape_b` from left to
+// right.
+bool CompareShapeDimsFromLeftToRight(const Shape& shape_a,
+                                     const Shape& shape_b) {
+  if (shape_a.rank() != shape_b.rank()) {
+    return shape_a.rank() < shape_b.rank();
+  }
+  auto dims_a = shape_a.dimensions();
+  auto dims_b = shape_b.dimensions();
+  for (size_t i = 0; i < dims_a.size(); ++i) {
+    if (dims_a[i] != dims_b[i]) {
+      return dims_a[i] < dims_b[i];
+    }
+  }
+  return true;
+}
+
+std::vector<HloInstruction*> FindAndSortFusionCandidates(
+    HloInstruction* consumer) {
+  absl::flat_hash_set<HloInstruction*> fusion_instr_set;
+  for (auto opnd : consumer->operands()) {
+    HloInstruction* predecessor = opnd->LatestNonGteAncestor();
+    // Find out the input fusion instructions whose only consumer is
+    // `consumer`. This guarantees that fusing these candidates will never
+    // create cycles, as there is no back edge.
+    if (IsReduceInputFusion(*predecessor) &&
+        IsConsumerTheOnlyNonRootUser(*predecessor, *consumer)) {
+      fusion_instr_set.insert(predecessor);
+    }
+  }
+
+  std::vector<HloInstruction*> fusion_instrs;
+  fusion_instrs.insert(fusion_instrs.end(), fusion_instr_set.begin(),
+                       fusion_instr_set.end());
+
+  std::sort(fusion_instrs.begin(), fusion_instrs.end(),
+            [&](const HloInstruction* a, const HloInstruction* b) {
+              Shape shape_a = GetInputShapeForMultiOutputFusion(*a);
+              Shape shape_b = GetInputShapeForMultiOutputFusion(*b);
+              if (!ShapeUtil::EqualIgnoringElementType(shape_a, shape_b)) {
+                // Sort shapes according to dimensions, so that the same input
+                // shapes will be placed adjacent to each other.
+                return CompareShapeDimsFromLeftToRight(shape_a, shape_b);
+              }
+              // Sort `fusion_instrs` according to instruction counts, because
+              // we'd like to fuse together computations of similar sizes.
+              return a->fused_instruction_count() <
+                     b->fused_instruction_count();
+            });
+
+  return fusion_instrs;
+}
+
+StatusOr<bool> HorizontalInputFusionImpl::Run() {
+  bool changed = false;
+  XLA_VLOG_LINES(3, computation_->ToString());
+
+  // Using def-to-use order is sound since we do not modify users.
+  std::vector<HloInstruction*> def_to_use_order =
+      computation_->MakeInstructionPostOrder();
+  for (auto consumer : def_to_use_order) {
+    auto candidates = FindAndSortFusionCandidates(consumer);
+    if (candidates.empty()) {
+      continue;
+    }
+
+    size_t fusion_anchor_id = 0;
+    for (size_t j = 1; j < candidates.size(); ++j) {
+      HloInstruction* fusion_anchor = candidates[fusion_anchor_id];
+      HloInstruction* fused = candidates[j];
+      if (ShapesCompatibleForMultiOutputFusion(*fusion_anchor, *fused) &&
+          !FusionWouldBeTooLarge(*fusion_anchor, *fused)) {
+        VLOG(3) << "Fuse " << fused->ToString() << " into "
+                << fusion_anchor->ToString();
+        fusion_anchor->MergeFusionInstructionIntoMultiOutput(fused);
+        changed = true;
+      } else {
+        // Update the `fusion_anchor_id` since `fused` is either not
+        // compatible or not beneficial to fuse with the current fusion
+        // anchor.
+        VLOG(3) << j - fusion_anchor_id - 1 << " instructions are fused.";
+        fusion_anchor_id = j;
+      }
+    }
+  }
+
+  return changed;
+}
+
+}  // namespace
+
+StatusOr<bool> GpuHorizontalInputFusion::RunOnComputation(
+    HloComputation* computation) {
+  HorizontalInputFusionImpl horizontal_fusion_impl(computation);
+  return horizontal_fusion_impl.Run();
+}
+
+StatusOr<bool> GpuHorizontalInputFusion::Run(HloModule* module) {
+  bool changed = false;
+  VLOG(2) << "Run horizontal input fusion.";
+  for (auto* comp : module->MakeNonfusionComputations()) {
+    TF_ASSIGN_OR_RETURN(changed, RunOnComputation(comp));
+  }
+
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
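A note on the candidate ordering above (shapes here are hypothetical, for
illustration only): FindAndSortFusionCandidates sorts first by the hero's
input shape and then by fused instruction count, so candidates with hero
input shapes f32[32,1024], f32[64,1024], f32[32,1024] are reordered so that
the two f32[32,1024] fusions sit next to each other and are tried against
the same anchor first; CompareShapeDimsFromLeftToRight places f32[32,1024]
before f32[64,1024] because the first differing dimension satisfies 32 < 64.
When a candidate fails the compatibility or size check, it simply becomes
the new anchor, so each run of mutually compatible candidates collapses into
one multi-output fusion.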
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h
@@ -0,0 +1,57 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HORIZONTAL_INPUT_FUSION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HORIZONTAL_INPUT_FUSION_H_
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+namespace gpu {
+
+// This optimization pass horizontally fuses kInput fusions to both reduce
+// kernel launch overhead and increase the degree of parallelism. See
+// GpuHorizontalFusion for the general description of and motivation for
+// horizontal fusion. GpuHorizontalFusion deals with kLoop fusions, while
+// this pass deals with kInput fusions.
+//
+// Following GpuHorizontalFusion, a simple yet effective heuristic is used
+// to search for fusion candidates while avoiding the creation of cycles:
+// we simply look for instructions whose outputs are all consumed by the
+// same instruction. This catches the typical target cases; often, the
+// candidate instructions are just consumed by the ROOT tuple of the entry
+// computation.
+class GpuHorizontalInputFusion : public HloModulePass {
+ public:
+  GpuHorizontalInputFusion() {}
+
+  absl::string_view name() const override {
+    return "gpu_horizontal_input_fusion";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  StatusOr<bool> RunOnComputation(HloComputation*);
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HORIZONTAL_INPUT_FUSION_H_
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc
@@ -0,0 +1,216 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h"
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/filecheck.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class HorizontalInputFusionTest : public GpuCodegenTest {};
+
+TEST_F(HorizontalInputFusionTest, BasicTest) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+ HloModule BasicTest
+
+ %add_f16 {
+   %x = f16[] parameter(0)
+   %y = f16[] parameter(1)
+   ROOT %add = f16[] add(%x, %y)
+ }
+
+ fused_computation.1 {
+   arg.1 = f16[1024]{0} parameter(0)
+   constant0 = f16[] constant(0)
+   ROOT reduce1 = f16[] reduce(arg.1, constant0), dimensions={0}, to_apply=%add_f16
+ }
+
+ fused_computation.2 {
+   arg.1 = f16[1024]{0} parameter(0)
+   constant0 = f16[] constant(0)
+   ROOT reduce1 = f16[] reduce(arg.1, constant0), dimensions={0}, to_apply=%add_f16
+ }
+
+ ENTRY entry_computation {
+   arg.1 = f16[1024]{0} parameter(0)
+   arg.2 = f16[1024]{0} parameter(1)
+   fusion.1 = f16[] fusion(arg.1), kind=kInput, calls=fused_computation.1
+   fusion.2 = f16[] fusion(arg.2), kind=kInput, calls=fused_computation.2
+   ROOT tuple.1 = (f16[], f16[]) tuple(fusion.1, fusion.2)
+ }
+)")
+                    .ValueOrDie();
+
+  EXPECT_TRUE(GpuHorizontalInputFusion().Run(module.get()).ValueOrDie());
+
+  const HloInstruction* entry_root =
+      module->entry_computation()->root_instruction();
+  EXPECT_THAT(entry_root, op::Tuple((op::GetTupleElement(op::Fusion())),
+                                    (op::GetTupleElement(op::Fusion()))));
+
+  const HloInstruction* fusion = entry_root->operand(0)->operand(0);
+  ASSERT_TRUE(fusion->IsMultiOutputFusion());
+  EXPECT_THAT(fusion->fused_expression_root(),
+              op::Tuple(op::Reduce(), op::Reduce()));
+}
+
+TEST_F(HorizontalInputFusionTest, ManyInputFusions) {
+  auto module = CreateNewVerifiedModule();
+
+  HloComputation* reduce_computation;
+  {
+    auto embedded_builder = HloComputation::Builder("add");
+    auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {}), "lhs"));
+    auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        1, ShapeUtil::MakeShape(F32, {}), "rhs"));
+    embedded_builder.AddInstruction(
+        HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
+    reduce_computation =
+        module->AddEmbeddedComputation(embedded_builder.Build());
+  }
+
+  HloComputation::Builder builder(TestName());
+  std::vector<HloInstruction*> var_outs;
+  auto input_shape = ShapeUtil::MakeShape(F32, {1024, 1024});
+  auto output_shape = ShapeUtil::MakeShape(F32, {1024});
+  for (int64 i = 0; i < 130; ++i) {
+    // %fused_computation.3 (param_0: f32[1024,1024], param_1: f32[]) ->
+    //     f32[1024] {
+    //   %param_0 = f32[1024,1024]{1,0} parameter(0)
+    //   %param_1 = f32[] parameter(1)
+    //   %broadcast = f32[1024,1024]{1,0} broadcast(f32[] %param_1),
+    //       dimensions={}
+    //   %multiply = f32[1024,1024]{1,0}
+    //       multiply(f32[1024,1024]{1,0} %param_0, f32[1024,1024]{1,0}
+    //       %broadcast)
+    //   %constant0 = f32[] constant(0)
+    //   ROOT %reduce = f32[1024]{0}
+    //       reduce(f32[1024,1024]{1,0} %multiply, f32[] %constant0),
+    //       dimensions={1}, to_apply=%add
+    // }
+    HloInstruction* param_var_in = builder.AddInstruction(
+        HloInstruction::CreateParameter(i * 2 + 0, input_shape, "var.in"));
+    HloInstruction* param_alpha =
+        builder.AddInstruction(HloInstruction::CreateParameter(
+            i * 2 + 1, ShapeUtil::MakeShape(F32, {}), "alpha"));
+    auto alpha_broadcasted = builder.AddInstruction(
+        HloInstruction::CreateBroadcast(input_shape, param_alpha, {}));
+    auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
+        input_shape, HloOpcode::kMultiply, param_var_in, alpha_broadcasted));
+    HloInstruction* const0 = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0)));
+    auto reduce = builder.AddInstruction(HloInstruction::CreateReduce(
+        output_shape, mul, const0, {1}, reduce_computation));
+    var_outs.push_back(reduce);
+  }
+  builder.AddInstruction(HloInstruction::CreateTuple(var_outs));
+  module->AddEntryComputation(builder.Build());
+
+  // Verify that horizontal fusion has kicked in: check that multiple
+  // `reduce` instructions are fused into the same fusion. 6 is just a
+  // randomly picked number, as we don't know exactly how large a fusion
+  // will be created due to the `FusionWouldBeTooLarge` constraint.
+  CompileAndVerifyIr(module->Clone(), R"(CHECK: reduce-group-6)",
+                     /*match_optimized_ir=*/false);
+
+  // Testing with the entire gpu optimization pipeline.
+  EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{1e-5, 1e-5}));
+}
+
+TEST_F(HorizontalInputFusionTest, MultiOutputFusionTest) {
+  // This tests the pattern below. One known issue is that GTEs (to fusions)
+  // can be removed after their producer fusions are merged. In the case
+  // below, gte2 and gte6 will be gone if Fusion2 is fused into Fusion1.
+  //
+  //        Fusion1   Fusion2
+  //         |   |     |   |
+  //         |  gte1  gte2 |
+  //         |   |     |   |
+  //         |   Fusion3   |
+  //         |   |     |   |
+  //        gte3 gte4 gte5 gte6
+  //          \   |    |   /
+  //          =====ROOT=====
+  //
+  auto module = ParseAndReturnVerifiedModule(R"(
+ HloModule MultiOutputFusionTest
+
+ %add_f16 {
+   %x = f16[] parameter(0)
+   %y = f16[] parameter(1)
+   ROOT %add = f16[] add(%x, %y)
+ }
+
+ fused_computation.1 {
+   arg.1 = f16[1024]{0} parameter(0)
+   constant0 = f16[] constant(0)
+   reduce.1 = f16[] reduce(arg.1, constant0), dimensions={0}, to_apply=%add_f16
+   add.0 = f16[1024] add(arg.1, arg.1)
+   ROOT tuple.1 = (f16[], f16[1024]) tuple(reduce.1, add.0)
+ }
+
+ fused_computation.2 {
+   arg.1 = f16[1024]{0} parameter(0)
+   constant0 = f16[] constant(0)
+   reduce.1 = f16[] reduce(arg.1, constant0), dimensions={0}, to_apply=%add_f16
+   add.0 = f16[1024] add(arg.1, arg.1)
+   ROOT tuple.1 = (f16[], f16[1024]) tuple(reduce.1, add.0)
+ }
+
+ fused_computation.3 {
+   arg.0 = f16[1024]{0} parameter(0)
+   arg.1 = f16[1024]{0} parameter(1)
+   add.0 = f16[1024] add(arg.0, arg.1)
+   mul.0 = f16[1024] multiply(arg.0, arg.1)
+   ROOT tuple.1 = (f16[1024], f16[1024]) tuple(add.0, mul.0)
+ }
+
+ ENTRY entry_computation {
+   arg.1 = f16[1024]{0} parameter(0)
+   arg.2 = f16[1024]{0} parameter(1)
+   fusion.1 = (f16[],f16[1024]) fusion(arg.1), kind=kInput, calls=fused_computation.1
+   fusion.2 = (f16[],f16[1024]) fusion(arg.2), kind=kInput, calls=fused_computation.2
+   gte.3 = f16[] get-tuple-element(fusion.1), index=0
+   gte.1 = f16[1024]{0} get-tuple-element(fusion.1), index=1
+   gte.2 = f16[1024]{0} get-tuple-element(fusion.2), index=1
+   gte.6 = f16[] get-tuple-element(fusion.2), index=0
+   fusion.3 = (f16[1024],f16[1024]) fusion(gte.1, gte.2),
+       kind=kLoop, calls=fused_computation.3
+   gte.4 = f16[1024] get-tuple-element(fusion.3), index=0
+   gte.5 = f16[1024]{0} get-tuple-element(fusion.3), index=1
+   ROOT tuple.1 = (f16[], f16[1024]{0}, f16[], f16[1024]{0})
+       tuple(gte.3, gte.4, gte.5, gte.6)
+ }
+)")
+                    .ValueOrDie();
+
+  EXPECT_TRUE(GpuHorizontalInputFusion().Run(module.get()).ValueOrDie());
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
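To exercise the new test locally, a standard Bazel invocation along these
lines should work (assuming a CUDA-enabled build configuration; exact flags
depend on the local setup):

    bazel test --config=cuda //tensorflow/compiler/xla/service/gpu:horizontal_input_fusion_test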
--- a/tensorflow/compiler/xla/service/gpu/horizontal_loop_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/horizontal_loop_fusion.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h"
 #include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/util/env_var.h"
@@ -137,25 +138,6 @@ bool IsFusionSupported(const HloInstruction& instr) {
   return true;
 }
 
-bool IsConsumerTheOnlyNonRootUser(const HloInstruction& instr,
-                                  const HloInstruction& consumer) {
-  return absl::c_all_of(instr.users(), [&](const HloInstruction* user) {
-    if (user->opcode() == HloOpcode::kGetTupleElement) {
-      // Skip GTE.
-      return IsConsumerTheOnlyNonRootUser(*user, consumer);
-    } else if (user == &consumer) {
-      // `user` is `consumer`.
-      return true;
-    } else if (user == user->parent()->root_instruction()) {
-      // Consumed by ROOT is always fine, since it is impossible to create
-      // cycles through ROOT.
-      return true;
-    } else {
-      return false;
-    }
-  });
-}
-
 // Returns whether `instr` is a profitable candidate to be horizontally fused.
 // Since the primary benefit of horizontal fusion comes from reducing the
 // kernel launch overhead, we want to exclude the instructions with
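Note that IsConsumerTheOnlyNonRootUser is not deleted here: it moves from
horizontal_loop_fusion.cc into gpu_fusible.cc/.h (see the hunks above) so
that both horizontal fusion passes can share it. The moved copy only
collapses the else-if chain and shortens the "consumed by ROOT" comment.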