diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 6e81dc0d5e2..1c1a028e2f9 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -308,10 +308,7 @@ Status GpuCompiler::OptimizeHloModule( HloPassPipeline horizontal_fusion("horizontal_fusion"); horizontal_fusion.AddPass(); - // The code generated for fusions created by GpuHorizontalInputFusion has - // been observed to fail with CUDA_ERROR_ILLEGAL_ADDRESS errors. - // TODO(b/171227713): Re-enable once the emitters are fixed. - // horizontal_fusion.AddPass(); + horizontal_fusion.AddPass(); horizontal_fusion.AddPass(/*is_layout_sensitive=*/true, /*only_fusion_computations=*/true); horizontal_fusion.AddPass(); diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc index 96e46fe723c..8ecfbb5a8d2 100644 --- a/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc @@ -77,8 +77,7 @@ TEST_F(HorizontalInputFusionTest, BasicTest) { op::Tuple(op::Reduce(), op::Reduce())); } -// TODO(b/171227713): Re-enable once fixed. -TEST_F(HorizontalInputFusionTest, DISABLED_ManyInputFusions) { +TEST_F(HorizontalInputFusionTest, ManyInputFusions) { auto module = CreateNewVerifiedModule(); HloComputation* reduce_computation; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index cf31f1ca1f2..b22d6a0c810 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -4381,7 +4381,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( unnested_hlo->IsMultiOutputFusion() ? unnested_hlo->fused_expression_root()->operand_count() : 1; - int64 max_block_size = std::max(16LL, 512LL / NearestPowerOfTwo(fan_out)); + int64 max_block_size = std::max(64LL, 512LL / NearestPowerOfTwo(fan_out)); return std::min( max_block_size, RoundUpToNearest(CeilOfRatio(reduction_dimensions.dimensions[2],