[XLA:GPU] Re-enable horizontal input fusion. Use a minimum of 64 threads per block for multi-output fusions.

Using at least 64 threads per block is the general advice from NVIDIA (quoted below). Moreover, smaller block sizes led to miscompiles in XLA:GPU emitters.

"A minimum of 64 threads per block should be used, and only if there are multiple concurrent blocks per multiprocessor."
https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html

PiperOrigin-RevId: 342828730
Change-Id: Ic5c3d5ae45c3e4c78aec9d7c82b62960d5105e6b
This commit is contained in:
Thomas Joerg 2020-11-17 03:22:04 -08:00 committed by TensorFlower Gardener
parent 2afdf93bf7
commit 67adc58cd6
3 changed files with 3 additions and 7 deletions

View File

@ -308,10 +308,7 @@ Status GpuCompiler::OptimizeHloModule(
HloPassPipeline horizontal_fusion("horizontal_fusion"); HloPassPipeline horizontal_fusion("horizontal_fusion");
horizontal_fusion.AddPass<GpuHorizontalLoopFusion>(); horizontal_fusion.AddPass<GpuHorizontalLoopFusion>();
// The code generated for fusions created by GpuHorizontalInputFusion has horizontal_fusion.AddPass<GpuHorizontalInputFusion>();
// been observed to fail with CUDA_ERROR_ILLEGAL_ADDRESS errors.
// TODO(b/171227713): Re-enable once the emitters are fixed.
// horizontal_fusion.AddPass<GpuHorizontalInputFusion>();
horizontal_fusion.AddPass<HloCSE>(/*is_layout_sensitive=*/true, horizontal_fusion.AddPass<HloCSE>(/*is_layout_sensitive=*/true,
/*only_fusion_computations=*/true); /*only_fusion_computations=*/true);
horizontal_fusion.AddPass<HloDCE>(); horizontal_fusion.AddPass<HloDCE>();

View File

@ -77,8 +77,7 @@ TEST_F(HorizontalInputFusionTest, BasicTest) {
op::Tuple(op::Reduce(), op::Reduce())); op::Tuple(op::Reduce(), op::Reduce()));
} }
// TODO(b/171227713): Re-enable once fixed. TEST_F(HorizontalInputFusionTest, ManyInputFusions) {
TEST_F(HorizontalInputFusionTest, DISABLED_ManyInputFusions) {
auto module = CreateNewVerifiedModule(); auto module = CreateNewVerifiedModule();
HloComputation* reduce_computation; HloComputation* reduce_computation;

View File

@ -4381,7 +4381,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo(
unnested_hlo->IsMultiOutputFusion() unnested_hlo->IsMultiOutputFusion()
? unnested_hlo->fused_expression_root()->operand_count() ? unnested_hlo->fused_expression_root()->operand_count()
: 1; : 1;
int64 max_block_size = std::max(16LL, 512LL / NearestPowerOfTwo(fan_out)); int64 max_block_size = std::max(64LL, 512LL / NearestPowerOfTwo(fan_out));
return std::min( return std::min(
max_block_size, max_block_size,
RoundUpToNearest(CeilOfRatio(reduction_dimensions.dimensions[2], RoundUpToNearest(CeilOfRatio(reduction_dimensions.dimensions[2],