diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 6e81dc0d5e2..1c1a028e2f9 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -308,10 +308,7 @@ Status GpuCompiler::OptimizeHloModule(
 
     HloPassPipeline horizontal_fusion("horizontal_fusion");
     horizontal_fusion.AddPass<GpuHorizontalLoopFusion>();
-    // The code generated for fusions created by GpuHorizontalInputFusion has
-    // been observed to fail with CUDA_ERROR_ILLEGAL_ADDRESS errors.
-    // TODO(b/171227713): Re-enable once the emitters are fixed.
-    // horizontal_fusion.AddPass<GpuHorizontalInputFusion>();
+    horizontal_fusion.AddPass<GpuHorizontalInputFusion>();
     horizontal_fusion.AddPass<HloCSE>(/*is_layout_sensitive=*/true,
                                       /*only_fusion_computations=*/true);
     horizontal_fusion.AddPass<HloDCE>();
diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc
index 96e46fe723c..8ecfbb5a8d2 100644
--- a/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc
@@ -77,8 +77,7 @@ TEST_F(HorizontalInputFusionTest, BasicTest) {
               op::Tuple(op::Reduce(), op::Reduce()));
 }
 
-// TODO(b/171227713): Re-enable once fixed.
-TEST_F(HorizontalInputFusionTest, DISABLED_ManyInputFusions) {
+TEST_F(HorizontalInputFusionTest, ManyInputFusions) {
   auto module = CreateNewVerifiedModule();
 
   HloComputation* reduce_computation;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index cf31f1ca1f2..b22d6a0c810 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -4381,7 +4381,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo(
           unnested_hlo->IsMultiOutputFusion()
               ? unnested_hlo->fused_expression_root()->operand_count()
               : 1;
-      int64 max_block_size = std::max(16LL, 512LL / NearestPowerOfTwo(fan_out));
+      int64 max_block_size = std::max(64LL, 512LL / NearestPowerOfTwo(fan_out));
       return std::min(
           max_block_size,
           RoundUpToNearest(CeilOfRatio(reduction_dimensions.dimensions[2],