From ee4657facfb5805c85a7fa0d68493e642a22e51b Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Sun, 19 May 2019 11:44:16 -0700
Subject: [PATCH] [XLA:CPU] Run ScatterExpander much earlier in CPU pipeline.

Previously, ScatterExpander ran after CpuInstructionFusion (!), so none of the
instructions it emitted could ever be fused.

On my machine, this gives a 3.2/2.6 = ~1.2x speedup on the test case from
https://github.com/google/jax/issues/695.

PiperOrigin-RevId: 248950865
---
 tensorflow/compiler/xla/service/cpu/cpu_compiler.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index d852d0aae5d..a3e224824ba 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -297,6 +297,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
     pass.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
                                           /*allow_mixed_precision=*/false);
 
+    pass.AddPass<ScatterExpander>();
     pass.AddPass<BatchNormExpander>(
         /*rewrite_training_op=*/true,
         /*rewrite_inference_op=*/true,
@@ -340,8 +341,6 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
 
   pipeline.AddPass<CpuInstructionFusion>();
 
-  pipeline.AddPass<ScatterExpander>();
-
   ReducePrecisionInsertion::AddPasses(
       &pipeline, module->config().debug_options(),
       ReducePrecisionInsertion::PassTiming::AFTER_FUSION);