[XLA:GPU] Re-enable horizontal input fusion. Use a minimum of 64 threads per block for multi-output fusions.

This follows the general advice on block sizes from the CUDA C++ Best Practices Guide. Moreover, smaller block sizes led to miscompiles in XLA:GPU emitters. "A minimum of 64 threads per block should be used, and only if there are multiple concurrent blocks per multiprocessor." https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html PiperOrigin-RevId: 342828730 Change-Id: Ic5c3d5ae45c3e4c78aec9d7c82b62960d5105e6b
2020-11-17 03:22:04 -08:00 · 2020-11-17 03:22:04 -08:00 · 67adc58cd6
commit 67adc58cd6
parent 2afdf93bf7
3 changed files with 3 additions and 7 deletions
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@ -308,10 +308,7 @@ Status GpuCompiler::OptimizeHloModule(
    HloPassPipeline horizontal_fusion("horizontal_fusion");
    horizontal_fusion.AddPass<GpuHorizontalLoopFusion>();
-    // The code generated for fusions created by GpuHorizontalInputFusion has
+    horizontal_fusion.AddPass<GpuHorizontalInputFusion>();
-    // been observed to fail with CUDA_ERROR_ILLEGAL_ADDRESS errors.
-    // TODO(b/171227713): Re-enable once the emitters are fixed.
-    // horizontal_fusion.AddPass<GpuHorizontalInputFusion>();
    horizontal_fusion.AddPass<HloCSE>(/*is_layout_sensitive=*/true,
                                      /*only_fusion_computations=*/true);
    horizontal_fusion.AddPass<HloDCE>();
--- a/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc
@ -77,8 +77,7 @@ TEST_F(HorizontalInputFusionTest, BasicTest) {
              op::Tuple(op::Reduce(), op::Reduce()));
 }
-// TODO(b/171227713): Re-enable once fixed.
+TEST_F(HorizontalInputFusionTest, ManyInputFusions) {
-TEST_F(HorizontalInputFusionTest, DISABLED_ManyInputFusions) {
  auto module = CreateNewVerifiedModule();
  HloComputation* reduce_computation;
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@ -4381,7 +4381,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo(
          unnested_hlo->IsMultiOutputFusion()
              ? unnested_hlo->fused_expression_root()->operand_count()
              : 1;
-      int64 max_block_size = std::max(16LL, 512LL / NearestPowerOfTwo(fan_out));
+      int64 max_block_size = std::max(64LL, 512LL / NearestPowerOfTwo(fan_out));
      return std::min(
          max_block_size,
          RoundUpToNearest(CeilOfRatio(reduction_dimensions.dimensions[2],