[XLA:GPU] Re-enable horizontal input fusion. Use a minimum of 64 threads per block for multi-output fusions.
This is the general advice for choosing block sizes. Moreover, smaller block sizes led to miscompiles in XLA:GPU emitters. "A minimum of 64 threads per block should be used, and only if there are multiple concurrent blocks per multiprocessor." https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html PiperOrigin-RevId: 342828730 Change-Id: Ic5c3d5ae45c3e4c78aec9d7c82b62960d5105e6b
This commit is contained in:
parent
2afdf93bf7
commit
67adc58cd6
@ -308,10 +308,7 @@ Status GpuCompiler::OptimizeHloModule(
|
|||||||
|
|
||||||
HloPassPipeline horizontal_fusion("horizontal_fusion");
|
HloPassPipeline horizontal_fusion("horizontal_fusion");
|
||||||
horizontal_fusion.AddPass<GpuHorizontalLoopFusion>();
|
horizontal_fusion.AddPass<GpuHorizontalLoopFusion>();
|
||||||
// The code generated for fusions created by GpuHorizontalInputFusion has
|
horizontal_fusion.AddPass<GpuHorizontalInputFusion>();
|
||||||
// been observed to fail with CUDA_ERROR_ILLEGAL_ADDRESS errors.
|
|
||||||
// TODO(b/171227713): Re-enable once the emitters are fixed.
|
|
||||||
// horizontal_fusion.AddPass<GpuHorizontalInputFusion>();
|
|
||||||
horizontal_fusion.AddPass<HloCSE>(/*is_layout_sensitive=*/true,
|
horizontal_fusion.AddPass<HloCSE>(/*is_layout_sensitive=*/true,
|
||||||
/*only_fusion_computations=*/true);
|
/*only_fusion_computations=*/true);
|
||||||
horizontal_fusion.AddPass<HloDCE>();
|
horizontal_fusion.AddPass<HloDCE>();
|
||||||
|
@ -77,8 +77,7 @@ TEST_F(HorizontalInputFusionTest, BasicTest) {
|
|||||||
op::Tuple(op::Reduce(), op::Reduce()));
|
op::Tuple(op::Reduce(), op::Reduce()));
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO(b/171227713): Re-enable once fixed.
|
TEST_F(HorizontalInputFusionTest, ManyInputFusions) {
|
||||||
TEST_F(HorizontalInputFusionTest, DISABLED_ManyInputFusions) {
|
|
||||||
auto module = CreateNewVerifiedModule();
|
auto module = CreateNewVerifiedModule();
|
||||||
|
|
||||||
HloComputation* reduce_computation;
|
HloComputation* reduce_computation;
|
||||||
|
@ -4381,7 +4381,7 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo(
|
|||||||
unnested_hlo->IsMultiOutputFusion()
|
unnested_hlo->IsMultiOutputFusion()
|
||||||
? unnested_hlo->fused_expression_root()->operand_count()
|
? unnested_hlo->fused_expression_root()->operand_count()
|
||||||
: 1;
|
: 1;
|
||||||
int64 max_block_size = std::max(16LL, 512LL / NearestPowerOfTwo(fan_out));
|
int64 max_block_size = std::max(64LL, 512LL / NearestPowerOfTwo(fan_out));
|
||||||
return std::min(
|
return std::min(
|
||||||
max_block_size,
|
max_block_size,
|
||||||
RoundUpToNearest(CeilOfRatio(reduction_dimensions.dimensions[2],
|
RoundUpToNearest(CeilOfRatio(reduction_dimensions.dimensions[2],
|
||||||
|
Loading…
x
Reference in New Issue
Block a user