From bc054640623b621e56b7fb95857b0947ed048c14 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Fri, 8 Mar 2019 03:02:46 -0800 Subject: [PATCH] Don't fuse in CpuInstructionFusion if it causes inefficiencies in FusedIrEmitter. Also move the function which checks for inefficiencies to FusedIrEmitter. PiperOrigin-RevId: 237419419 --- tensorflow/compiler/tests/image_ops_test.py | 4 +- tensorflow/compiler/xla/service/cpu/BUILD | 1 + .../xla/service/cpu/cpu_instruction_fusion.cc | 9 ++ tensorflow/compiler/xla/service/gpu/BUILD | 4 +- .../compiler/xla/service/gpu/fusion_merger.cc | 5 +- .../compiler/xla/service/gpu/gpu_fusible.cc | 103 +---------------- .../compiler/xla/service/gpu/gpu_fusible.h | 8 -- .../xla/service/gpu/instruction_fusion.cc | 3 +- tensorflow/compiler/xla/service/llvm_ir/BUILD | 2 + .../xla/service/llvm_ir/fused_ir_emitter.cc | 105 ++++++++++++++++++ .../xla/service/llvm_ir/fused_ir_emitter.h | 8 ++ .../xla/tests/array_elementwise_ops_test.cc | 3 +- 12 files changed, 137 insertions(+), 118 deletions(-) diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py index 42e688174fc..c8b71f802c4 100644 --- a/tensorflow/compiler/tests/image_ops_test.py +++ b/tensorflow/compiler/tests/image_ops_test.py @@ -68,8 +68,8 @@ class RGBToHSVTest(xla_test.XLATestCase): {batch0: inp}) # Verify that processing batch elements together is the same as separate - self.assertAllClose(batch1, join1) - self.assertAllClose(batch2, join2) + self.assertAllCloseAccordingToType(batch1, join1, half_rtol=0.000002) + self.assertAllCloseAccordingToType(batch2, join2, half_rtol=0.000002) self.assertAllCloseAccordingToType( batch2, inp, bfloat16_atol=0.03, half_rtol=0.02) diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index c8fef147b85..3cd93b1e860 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -748,6 +748,7 @@ cc_library( 
":ir_emission_utils", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:instruction_fusion", + "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter", ], ) diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc index 2e22cdaca52..5793f0080b6 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h" namespace xla { namespace cpu { @@ -117,6 +118,14 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer, return false; } + // Don't fuse if fusing would cause too much code duplication because of + // inefficiencies in the fusion emitter. + // TODO(b/119692968): Remove this once the fusion emitter can handle + // arbitrary fusion nodes. + if (FusedIrEmitter::IsFusedIrEmitterInefficient(consumer, producer)) { + return false; + } + if (consumer->opcode() == HloOpcode::kDot) { // In the general case we call out to optimized "black box" GEMM routines // for Dot, which precludes fusion. 
However, in very specific cases, we try diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 04cd7a76985..2a65d54b1f6 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -582,6 +582,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:instruction_fusion", "//tensorflow/compiler/xla/service:pattern_matcher", + "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter", "@com_google_absl//absl/container:flat_hash_set", ], ) @@ -703,6 +704,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_cost_analysis", "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter", "//tensorflow/core:lib", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/strings", @@ -1102,8 +1104,6 @@ cc_library( ":ir_emission_utils", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/service:hlo", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc index 7ef9cc9ff58..c28b3b77ec3 100644 --- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc +++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h" #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" +#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" @@ -289,8 +290,8 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) { // TODO(b/119692968): Remove this once the fusion emitter can handle arbitrary // fusion nodes. if (absl::c_any_of(fusion->users(), [fusion](const HloInstruction* user) { - return IsFusionEmitterInefficient(/*consumer=*/user, - /*producer=*/fusion); + return FusedIrEmitter::IsFusedIrEmitterInefficient(/*consumer=*/user, + /*producer=*/fusion); })) { VLOG(3) << "Not merging " << fusion->name() << ": Contains one or more users where fusing would cause " diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc index 5abd01417dd..0cbd92a9553 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc @@ -15,12 +15,10 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h" -#include +#include +#include -#include "absl/container/flat_hash_map.h" -#include "absl/container/flat_hash_set.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape.h" @@ -186,102 +184,5 @@ bool IsFusible(const HloInstruction& instr) { return IsInputFusible(instr) || IsLoopFusible(instr); } -bool IsFusionEmitterInefficient(const HloInstruction* consumer, - const HloInstruction* producer) { - if (consumer->opcode() != HloOpcode::kFusion) { - return false; - } - // Collects for each instruction in the fusion node from which (indirect) - // users newly created index values are passed. Roughly speaking, we reuse - // index values if the shapes are equal when ignoring the element type (we may - // reuse also if the shape change is a bitcast, but we don't consider that - // here). By ignoring potential reuses our estimate whether the fusion emitter - // is inefficient is a bit more conservative than necessary. - absl::flat_hash_map> - indexing_users; - // Stores the number of different index accesses for each instruction in the - // fusion node. The fusion emitter caches access with the same index, so this - // value indicates how many times a specific instruction will be emitted. 
- absl::flat_hash_map<const HloInstruction*, int64> index_usage_count; - index_usage_count[consumer] = 1; - - auto evaluate_fusion_computation = [&indexing_users, &index_usage_count]( - const HloInstruction* fusion) { - auto postorder = - fusion->fused_instructions_computation()->MakeInstructionPostOrder(); - std::reverse(postorder.begin(), postorder.end()); - for (const auto* instruction : postorder) { - if (instruction->opcode() == HloOpcode::kParameter) { - continue; - } - int64& total = index_usage_count[instruction]; - if (indexing_users[instruction].empty()) { - total = index_usage_count[fusion]; - } else { - total = 0; - for (const auto* user : indexing_users[instruction]) { - int64 weight = 1; - // Concatenate is special: the index differs for each operand, so - // in the worst case we have to deal with as many index values as - // the number of operands of Concatenate. By considering the worst - // case, we are more conservative than necessary regarding - // refusing to fuse. - if (user->opcode() == HloOpcode::kConcatenate) { - weight = user->operand_count(); - } - total += index_usage_count[user] * weight; - } - } - for (const auto* operand : instruction->operands()) { - // For simplicity we assume that all shape and layout changing - // operations invalidate index reuse. - if (Shape::Equal().IgnoreElementType()(operand->shape(), - instruction->shape())) { - // If the index is reused, it means the operand gets index values - // from the same set of (indirect) users as 'instruction' itself. - indexing_users[operand].insert(indexing_users[instruction].begin(), - indexing_users[instruction].end()); - } else { - // If the index is not reused, it means 'instruction' computes a - // new index derived from the index it gets. - indexing_users[operand].insert(instruction); - } - } - } - }; - evaluate_fusion_computation(consumer); - - // Also account for the 'producer' if it would be fused. Find the operand it - // corresponds to. 
- for (int64 operand_num = 0; operand_num < consumer->operand_count(); - ++operand_num) { - if (consumer->operand(operand_num) == producer) { - auto instruction = consumer->fused_parameter(operand_num); - int64& total = index_usage_count[producer]; - total = 0; - for (const auto* user : indexing_users[instruction]) { - total += index_usage_count[user]; - } - break; - } - } - - // If 'producer' is a fusion node as well, also evaluate it. - if (producer->opcode() == HloOpcode::kFusion) { - evaluate_fusion_computation(producer); - } - - // Sum up the total number of emitted ops. - int64 total = 0; - for (const auto& entry : index_usage_count) { - total += entry.second; - } - - // Check that the code duplication has at most a factor of 8 (where 8 is an - // arbitrary constant that seems to work). - return total > 8 * index_usage_count.size(); -} - } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h index a6f06570ff3..3a59c74f64b 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h @@ -69,14 +69,6 @@ bool IsInputFusibleScatter(const HloInstruction& instr); bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1, const HloInstruction& instr2); -// Evaluates whether fusing 'producer' into 'consumer' might cause exponential -// behavior in the fusion emitter. We currently can have exponential time/memory -// requirements for emitting certain fusion kernels, in which case we don't want -// to fuse. -// TODO(b/119692968): Remove this once we have fixed our fusion emitter. 
-bool IsFusionEmitterInefficient(const HloInstruction* consumer, - const HloInstruction* producer); - } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index 8fcc0450bc6..62f625defc3 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h" #include "tensorflow/compiler/xla/service/pattern_matcher.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -267,7 +268,7 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, // have exponential time/memory requirements for emitting certain fusion // kernels, in which case we don't want to fuse. // TODO(b/119692968): Remove this once we have fixed our fusion emitter. 
- return !IsFusionEmitterInefficient(consumer, producer); + return !FusedIrEmitter::IsFusedIrEmitterInefficient(consumer, producer); } bool GpuInstructionFusion::ShouldFuseIntoMultiOutput(HloInstruction* consumer, diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index c5d59fb28e0..be6371ae01c 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -161,6 +161,7 @@ cc_library( ":llvm_util", ":loop_emitter", ":tuple_ops", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", @@ -169,6 +170,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/core:lib", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", "@llvm//:core", diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc index 9f094330bdc..a550abe9493 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc @@ -15,14 +15,22 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h" +#include #include +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h" +#include "tensorflow/compiler/xla/shape.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/util.h" @@ -195,4 +203,101 @@ FusedIrEmitter::IndexedGenerator FusedIrEmitter::GetGenerator( return indexed_generators_.at(instruction); } +bool FusedIrEmitter::IsFusedIrEmitterInefficient( + const HloInstruction* consumer, const HloInstruction* producer) { + if (consumer->opcode() != HloOpcode::kFusion) { + return false; + } + // Collects for each instruction in the fusion node from which (indirect) + // users newly created index values are passed. Roughly speaking, we reuse + // index values if the shapes are equal when ignoring the element type (we may + // reuse also if the shape change is a bitcast, but we don't consider that + // here). By ignoring potential reuses our estimate whether the fusion emitter + // is inefficient is a bit more conservative than necessary. + absl::flat_hash_map> + indexing_users; + // Stores the number of different index accesses for each instruction in the + // fusion node. The fusion emitter caches access with the same index, so this + // value indicates how many times a specific instruction will be emitted. 
+ absl::flat_hash_map<const HloInstruction*, int64> index_usage_count; + index_usage_count[consumer] = 1; + + auto evaluate_fusion_computation = [&indexing_users, &index_usage_count]( + const HloInstruction* fusion) { + auto postorder = + fusion->fused_instructions_computation()->MakeInstructionPostOrder(); + std::reverse(postorder.begin(), postorder.end()); + for (const auto* instruction : postorder) { + if (instruction->opcode() == HloOpcode::kParameter) { + continue; + } + int64& total = index_usage_count[instruction]; + if (indexing_users[instruction].empty()) { + total = index_usage_count[fusion]; + } else { + total = 0; + for (const auto* user : indexing_users[instruction]) { + int64 weight = 1; + // Concatenate is special: the index differs for each operand, so + // in the worst case we have to deal with as many index values as + // the number of operands of Concatenate. By considering the worst + // case, we are more conservative than necessary regarding + // refusing to fuse. + if (user->opcode() == HloOpcode::kConcatenate) { + weight = user->operand_count(); + } + total += index_usage_count[user] * weight; + } + } + for (const auto* operand : instruction->operands()) { + // For simplicity we assume that all shape and layout changing + // operations invalidate index reuse. + if (Shape::Equal().IgnoreElementType()(operand->shape(), + instruction->shape())) { + // If the index is reused, it means the operand gets index values + // from the same set of (indirect) users as 'instruction' itself. + indexing_users[operand].insert(indexing_users[instruction].begin(), + indexing_users[instruction].end()); + } else { + // If the index is not reused, it means 'instruction' computes a + // new index derived from the index it gets. + indexing_users[operand].insert(instruction); + } + } + } + }; + evaluate_fusion_computation(consumer); + + // Also account for the 'producer' if it would be fused. Find the operand it + // corresponds to. 
+ for (int64 operand_num = 0; operand_num < consumer->operand_count(); + ++operand_num) { + if (consumer->operand(operand_num) == producer) { + auto instruction = consumer->fused_parameter(operand_num); + int64& total = index_usage_count[producer]; + total = 0; + for (const auto* user : indexing_users[instruction]) { + total += index_usage_count[user]; + } + break; + } + } + + // If 'producer' is a fusion node as well, also evaluate it. + if (producer->opcode() == HloOpcode::kFusion) { + evaluate_fusion_computation(producer); + } + + // Sum up the total number of emitted ops. + int64 total = 0; + for (const auto& entry : index_usage_count) { + total += entry.second; + } + + // Check that the code duplication has at most a factor of 8 (where 8 is an + // arbitrary constant that seems to work). + return total > 8 * index_usage_count.size(); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h index e6d52a580c0..b1aa6d59634 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h @@ -91,6 +91,14 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault { tiled_parameter_info_ = info; } + // Evaluates whether fusing 'producer' into 'consumer' might cause exponential + // behavior in FusedIrEmitter. We currently can have exponential time/memory + // requirements for emitting certain fusion kernels, in which case we don't + // want to fuse. + // TODO(b/119692968): Remove this once we have fixed our fusion emitter. + static bool IsFusedIrEmitterInefficient(const HloInstruction* consumer, + const HloInstruction* producer); + protected: // Returns the IrArrays for the fusion instruction operands. 
llvm_ir::IrArray& GetIrArrayForFusedParameter(int64 parameter_number) { diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc index c353e8d46df..21458b40b10 100644 --- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc +++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc @@ -349,8 +349,7 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) { error_spec_); } -// TODO(b/119692968): This test runs OOM on the CPU backend. -XLA_TEST_F(ArrayElementwiseOpTest, DISABLED_ON_CPU(DeeplyNestedAddWithSlices)) { +XLA_TEST_F(ArrayElementwiseOpTest, DeeplyNestedAddWithSlices) { XlaBuilder builder(TestName()); std::vector<float> values(30, 0.0); auto a_literal = LiteralUtil::CreateR1<float>(values);