[XLA/GPU] Refactor sort.hlo CHECK lines.

PiperOrigin-RevId: 356890843
Change-Id: If17e35dacbc68f9376bfe719b492597c0990f8ce
This commit is contained in:
Tim Shen 2021-02-10 20:41:44 -08:00 committed by TensorFlower Gardener
parent 6f52c65fbf
commit 69f5aecd05

View File

@ -8,163 +8,174 @@ compare {
ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
} }
// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]]) // CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]]) {
// CHECK-NEXT: entry: // CHECK: entry:
// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 // CHECK: %[[VAL_1:.*]] = alloca i8, align 1
// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 // CHECK: %[[VAL_2:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x float]]* // CHECK: %[[VAL_3:.*]] = bitcast i8* %[[VAL_2]] to [2 x [3 x float]]*
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 // CHECK: %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* // CHECK: %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x float]]*
// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 // CHECK: %[[VAL_6:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP4]] to i64 // CHECK: %[[VAL_7:.*]] = zext i32 %[[VAL_6]] to i64
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 // CHECK: %[[VAL_8:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP5]] to i64 // CHECK: %[[VAL_9:.*]] = zext i32 %[[VAL_8]] to i64
// CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 // CHECK: %[[VAL_10:.*]] = mul nuw nsw i64 %[[VAL_7]], 4
// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP6]], [[THREAD_ID]] // CHECK: %[[VAL_11:.*]] = add nuw nsw i64 %[[VAL_10]], %[[VAL_9]]
// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 // CHECK: %[[VAL_12:.*]] = icmp ult i64 %[[VAL_11]], 4
// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) // CHECK: call void @llvm.assume(i1 %[[VAL_12]])
// CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 // CHECK: %[[VAL_13:.*]] = udiv i64 %[[VAL_11]], 1
// CHECK-NEXT: [[TMP8:%.*]] = urem i64 [[TMP7]], 2 // CHECK: %[[VAL_14:.*]] = urem i64 %[[VAL_13]], 2
// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 // CHECK: %[[VAL_15:.*]] = udiv i64 %[[VAL_11]], 2
// CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 // CHECK: %[[VAL_16:.*]] = icmp ult i64 %[[VAL_11]], 4
// CHECK-NEXT: br i1 [[TMP10]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] // CHECK: br i1 %[[VAL_16]], label %[[VAL_17:.*]], label %[[VAL_18:.*]]
// CHECK: sort.in_bounds-after: // CHECK: sort.in_bounds-after: ; preds = %[[VAL_19:.*]], %[[VAL_20:.*]]
// CHECK-NEXT: ret void // CHECK: ret void
// CHECK: sort.in_bounds-true: // CHECK: sort.in_bounds-true: ; preds = %[[VAL_20]]
// CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP8]], 2 // CHECK: %[[VAL_21:.*]] = mul i64 %[[VAL_14]], 2
// CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 1 // CHECK: %[[VAL_22:.*]] = xor i64 %[[VAL_21]], 1
// CHECK-NEXT: [[TMP13:%.*]] = icmp slt i64 [[TMP11]], [[TMP12]] // CHECK: %[[VAL_23:.*]] = icmp slt i64 %[[VAL_21]], %[[VAL_22]]
// CHECK-NEXT: [[TMP14:%.*]] = icmp slt i64 [[TMP12]], 3 // CHECK: %[[VAL_24:.*]] = icmp slt i64 %[[VAL_22]], 3
// CHECK-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]] // CHECK: %[[VAL_25:.*]] = and i1 %[[VAL_23]], %[[VAL_24]]
// CHECK-NEXT: br i1 [[TMP15]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] // CHECK: br i1 %[[VAL_25]], label %[[VAL_26:.*]], label %[[VAL_19]]
// CHECK: smaller_comparison_index-after: // CHECK: smaller_comparison_index-after: ; preds = %[[VAL_27:.*]], %[[VAL_17]]
// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] // CHECK: br label %[[VAL_18]]
// CHECK: smaller_comparison_index-true: // CHECK: smaller_comparison_index-true: ; preds = %[[VAL_17]]
// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]] // CHECK: %[[VAL_28:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_22]]
// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] // CHECK: %[[VAL_29:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]]
// CHECK-NEXT: call void @region_0_4(float* [[TMP16]], float* [[TMP17]], i8* [[COMPARE_RETURN_BUFFER]]) // CHECK: call void @region_0_4(float* %[[VAL_28]], float* %[[VAL_29]], i8* %[[VAL_1]])
// CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 // CHECK: %[[VAL_30:.*]] = load i8, i8* %[[VAL_1]], align 1
// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP18]], 0 // CHECK: %[[VAL_31:.*]] = icmp ne i8 %[[VAL_30]], 0
// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] // CHECK: br i1 %[[VAL_31]], label %[[VAL_32:.*]], label %[[VAL_27]]
// CHECK: is_smaller_than-after: // CHECK: is_smaller_than-after: ; preds = %[[VAL_32]], %[[VAL_26]]
// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] // CHECK: br label %[[VAL_19]]
// CHECK: is_smaller_than-true: // CHECK: is_smaller_than-true: ; preds = %[[VAL_26]]
// CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP16]], align 4 // CHECK: %[[VAL_33:.*]] = load float, float* %[[VAL_28]], align 4
// CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[TMP17]], align 4 // CHECK: %[[VAL_34:.*]] = load float, float* %[[VAL_29]], align 4
// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] // CHECK: %[[VAL_35:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]]
// CHECK-NEXT: store float [[TMP19]], float* [[TMP21]], align 4 // CHECK: store float %[[VAL_33]], float* %[[VAL_35]], align 4
// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]] // CHECK: %[[VAL_36:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_22]]
// CHECK-NEXT: store float [[TMP20]], float* [[TMP22]], align 4 // CHECK: store float %[[VAL_34]], float* %[[VAL_36]], align 4
// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] // CHECK: br label %[[VAL_27]]
// CHECK: }
// CHECK: ; Function Attrs: nounwind readnone
// CHECK: declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
// CHECK: ; Function Attrs: nounwind readnone
// CHECK: declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
// CHECK: ; Function Attrs: nofree nosync nounwind willreturn
// CHECK: declare void @llvm.assume(i1 noundef) #1
// CHECK: define internal void @region_0_4(float* dereferenceable(4) [[P_0_LHS_TYPED:%.*]], float* dereferenceable(4) [[P_0_RHS_TYPED:%.*]], i8* dereferenceable(1) [[OUTPUT_ARG:%.*]]) // CHECK: define internal void @region_0_4(float* dereferenceable(4) %[[VAL_0:.*]], float* dereferenceable(4) %[[VAL_1:.*]], i8* dereferenceable(1) %[[VAL_2:.*]]) {
// CHECK-NEXT: entry: // CHECK: entry:
// CHECK-NEXT: [[COMPARE_3_TYPED:%.*]] = alloca i8, align 1 // CHECK: %[[VAL_3:.*]] = alloca i8, align 1
// CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARG_0_1_TYPED:%.*]], align 4 // CHECK: %[[VAL_4:.*]] = load float, float* %[[VAL_0]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARG_1_2_TYPED:%.*]], align 4 // CHECK: %[[VAL_5:.*]] = load float, float* %[[VAL_1]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = fcmp olt float [[TMP0]], [[TMP1]] // CHECK: %[[VAL_6:.*]] = fcmp olt float %[[VAL_4]], %[[VAL_5]]
// CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i8 // CHECK: %[[VAL_7:.*]] = zext i1 %[[VAL_6]] to i8
// CHECK-NEXT: store i8 [[TMP3]], i8* [[COMPARE_3_TYPED]], align 1 // CHECK: store i8 %[[VAL_7]], i8* %[[VAL_3]], align 1
// CHECK-NEXT: [[LOAD_RET_VALUE:%.*]] = load i8, i8* [[COMPARE_3_TYPED]], align 1 // CHECK: %[[VAL_8:.*]] = load i8, i8* %[[VAL_3]], align 1
// CHECK-NEXT: store i8 [[LOAD_RET_VALUE]], i8* [[OUTPUT_ARG:%.*]], align 1 // CHECK: store i8 %[[VAL_8]], i8* %[[VAL_2]], align 1
// CHECK-NEXT: ret void // CHECK: ret void
// CHECK: }
// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]]) { // CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]]) {
// CHECK-NEXT: entry: // CHECK: entry:
// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 // CHECK: %[[VAL_1:.*]] = alloca i8, align 1
// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 // CHECK: %[[VAL_2:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x float]]* // CHECK: %[[VAL_3:.*]] = bitcast i8* %[[VAL_2]] to [2 x [3 x float]]*
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 // CHECK: %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* // CHECK: %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x float]]*
// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 // CHECK: %[[VAL_6:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP4]] to i64 // CHECK: %[[VAL_7:.*]] = zext i32 %[[VAL_6]] to i64
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 // CHECK: %[[VAL_8:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP5]] to i64 // CHECK: %[[VAL_9:.*]] = zext i32 %[[VAL_8]] to i64
// CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 // CHECK: %[[VAL_10:.*]] = mul nuw nsw i64 %[[VAL_7]], 4
// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP6]], [[THREAD_ID]] // CHECK: %[[VAL_11:.*]] = add nuw nsw i64 %[[VAL_10]], %[[VAL_9]]
// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 // CHECK: %[[VAL_12:.*]] = icmp ult i64 %[[VAL_11]], 4
// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) // CHECK: call void @llvm.assume(i1 %[[VAL_12]])
// CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 // CHECK: %[[VAL_13:.*]] = udiv i64 %[[VAL_11]], 1
// CHECK-NEXT: [[TMP8:%.*]] = urem i64 [[TMP7]], 2 // CHECK: %[[VAL_14:.*]] = urem i64 %[[VAL_13]], 2
// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 // CHECK: %[[VAL_15:.*]] = udiv i64 %[[VAL_11]], 2
// CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 // CHECK: %[[VAL_16:.*]] = icmp ult i64 %[[VAL_11]], 4
// CHECK-NEXT: br i1 [[TMP10]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] // CHECK: br i1 %[[VAL_16]], label %[[VAL_17:.*]], label %[[VAL_18:.*]]
// CHECK: sort.in_bounds-after: // CHECK: sort.in_bounds-after: ; preds = %[[VAL_19:.*]], %[[VAL_20:.*]]
// CHECK-NEXT: ret void // CHECK: ret void
// CHECK: sort.in_bounds-true: // CHECK: sort.in_bounds-true: ; preds = %[[VAL_20]]
// CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP8]], 3 // CHECK: %[[VAL_21:.*]] = xor i64 %[[VAL_14]], 3
// CHECK-NEXT: [[TMP12:%.*]] = icmp slt i64 [[TMP8]], [[TMP11]] // CHECK: %[[VAL_22:.*]] = icmp slt i64 %[[VAL_14]], %[[VAL_21]]
// CHECK-NEXT: [[TMP13:%.*]] = icmp slt i64 [[TMP11]], 3 // CHECK: %[[VAL_23:.*]] = icmp slt i64 %[[VAL_21]], 3
// CHECK-NEXT: [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]] // CHECK: %[[VAL_24:.*]] = and i1 %[[VAL_22]], %[[VAL_23]]
// CHECK-NEXT: br i1 [[TMP14]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] // CHECK: br i1 %[[VAL_24]], label %[[VAL_25:.*]], label %[[VAL_19]]
// CHECK: smaller_comparison_index-after: // CHECK: smaller_comparison_index-after: ; preds = %[[VAL_26:.*]], %[[VAL_17]]
// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] // CHECK: br label %[[VAL_18]]
// CHECK: smaller_comparison_index-true: // CHECK: smaller_comparison_index-true: ; preds = %[[VAL_17]]
// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] // CHECK: %[[VAL_27:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]]
// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP8]] // CHECK: %[[VAL_28:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_14]]
// CHECK-NEXT: call void @region_0_4(float* [[TMP15]], float* [[TMP16]], i8* [[COMPARE_RETURN_BUFFER]]) // CHECK: call void @region_0_4(float* %[[VAL_27]], float* %[[VAL_28]], i8* %[[VAL_1]])
// CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 // CHECK: %[[VAL_29:.*]] = load i8, i8* %[[VAL_1]], align 1
// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP17]], 0 // CHECK: %[[VAL_30:.*]] = icmp ne i8 %[[VAL_29]], 0
// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] // CHECK: br i1 %[[VAL_30]], label %[[VAL_31:.*]], label %[[VAL_26]]
// CHECK: is_smaller_than-after: // CHECK: is_smaller_than-after: ; preds = %[[VAL_31]], %[[VAL_25]]
// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] // CHECK: br label %[[VAL_19]]
// CHECK: is_smaller_than-true: // CHECK: is_smaller_than-true: ; preds = %[[VAL_25]]
// CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[TMP15]], align 4 // CHECK: %[[VAL_32:.*]] = load float, float* %[[VAL_27]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP16]], align 4 // CHECK: %[[VAL_33:.*]] = load float, float* %[[VAL_28]], align 4
// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP8]] // CHECK: %[[VAL_34:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_14]]
// CHECK-NEXT: store float [[TMP18]], float* [[TMP20]], align 4 // CHECK: store float %[[VAL_32]], float* %[[VAL_34]], align 4
// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] // CHECK: %[[VAL_35:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]]
// CHECK-NEXT: store float [[TMP19]], float* [[TMP21]], align 4 // CHECK: store float %[[VAL_33]], float* %[[VAL_35]], align 4
// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] // CHECK: br label %[[VAL_26]]
// CHECK: }
// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]]) {
// CHECK: entry:
// CHECK: %[[VAL_1:.*]] = alloca i8, align 1
// CHECK: %[[VAL_2:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
// CHECK: %[[VAL_3:.*]] = bitcast i8* %[[VAL_2]] to [2 x [3 x float]]*
// CHECK: %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
// CHECK: %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x float]]*
// CHECK: %[[VAL_6:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
// CHECK: %[[VAL_7:.*]] = zext i32 %[[VAL_6]] to i64
// CHECK: %[[VAL_8:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
// CHECK: %[[VAL_9:.*]] = zext i32 %[[VAL_8]] to i64
// CHECK: %[[VAL_10:.*]] = mul nuw nsw i64 %[[VAL_7]], 4
// CHECK: %[[VAL_11:.*]] = add nuw nsw i64 %[[VAL_10]], %[[VAL_9]]
// CHECK: %[[VAL_12:.*]] = icmp ult i64 %[[VAL_11]], 4
// CHECK: call void @llvm.assume(i1 %[[VAL_12]])
// CHECK: %[[VAL_13:.*]] = udiv i64 %[[VAL_11]], 1
// CHECK: %[[VAL_14:.*]] = urem i64 %[[VAL_13]], 2
// CHECK: %[[VAL_15:.*]] = udiv i64 %[[VAL_11]], 2
// CHECK: %[[VAL_16:.*]] = icmp ult i64 %[[VAL_11]], 4
// CHECK: br i1 %[[VAL_16]], label %[[VAL_17:.*]], label %[[VAL_18:.*]]
// CHECK: sort.in_bounds-after: ; preds = %[[VAL_19:.*]], %[[VAL_20:.*]]
// CHECK: ret void
// CHECK: sort.in_bounds-true: ; preds = %[[VAL_20]]
// CHECK: %[[VAL_21:.*]] = mul i64 %[[VAL_14]], 2
// CHECK: %[[VAL_22:.*]] = xor i64 %[[VAL_21]], 1
// CHECK: %[[VAL_23:.*]] = icmp slt i64 %[[VAL_21]], %[[VAL_22]]
// CHECK: %[[VAL_24:.*]] = icmp slt i64 %[[VAL_22]], 3
// CHECK: %[[VAL_25:.*]] = and i1 %[[VAL_23]], %[[VAL_24]]
// CHECK: br i1 %[[VAL_25]], label %[[VAL_26:.*]], label %[[VAL_19]]
// CHECK: smaller_comparison_index-after: ; preds = %[[VAL_27:.*]], %[[VAL_17]]
// CHECK: br label %[[VAL_18]]
// CHECK: smaller_comparison_index-true: ; preds = %[[VAL_17]]
// CHECK: %[[VAL_28:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_22]]
// CHECK: %[[VAL_29:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]]
// CHECK: call void @region_0_4(float* %[[VAL_28]], float* %[[VAL_29]], i8* %[[VAL_1]])
// CHECK: %[[VAL_30:.*]] = load i8, i8* %[[VAL_1]], align 1
// CHECK: %[[VAL_31:.*]] = icmp ne i8 %[[VAL_30]], 0
// CHECK: br i1 %[[VAL_31]], label %[[VAL_32:.*]], label %[[VAL_27]]
// CHECK: is_smaller_than-after: ; preds = %[[VAL_32]], %[[VAL_26]]
// CHECK: br label %[[VAL_19]]
// CHECK: is_smaller_than-true: ; preds = %[[VAL_26]]
// CHECK: %[[VAL_33:.*]] = load float, float* %[[VAL_28]], align 4
// CHECK: %[[VAL_34:.*]] = load float, float* %[[VAL_29]], align 4
// CHECK: %[[VAL_35:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]]
// CHECK: store float %[[VAL_33]], float* %[[VAL_35]], align 4
// CHECK: %[[VAL_36:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_22]]
// CHECK: store float %[[VAL_34]], float* %[[VAL_36]], align 4
// CHECK: br label %[[VAL_27]]
// CHECK: }
// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]]) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1
// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x float]]*
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0
// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]*
// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP4]] to i64
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP5]] to i64
// CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4
// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP6]], [[THREAD_ID]]
// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]])
// CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[LINEAR_INDEX]], 1
// CHECK-NEXT: [[TMP8:%.*]] = urem i64 [[TMP7]], 2
// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 2
// CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
// CHECK-NEXT: br i1 [[TMP10]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]]
// CHECK: sort.in_bounds-after:
// CHECK-NEXT: ret void
// CHECK: sort.in_bounds-true:
// CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP8]], 2
// CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 1
// CHECK-NEXT: [[TMP13:%.*]] = icmp slt i64 [[TMP11]], [[TMP12]]
// CHECK-NEXT: [[TMP14:%.*]] = icmp slt i64 [[TMP12]], 3
// CHECK-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
// CHECK-NEXT: br i1 [[TMP15]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]]
// CHECK: smaller_comparison_index-after:
// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]]
// CHECK: smaller_comparison_index-true:
// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]]
// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]]
// CHECK-NEXT: call void @region_0_4(float* [[TMP16]], float* [[TMP17]], i8* [[COMPARE_RETURN_BUFFER]])
// CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1
// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP18]], 0
// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]]
// CHECK: is_smaller_than-after:
// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]]
// CHECK: is_smaller_than-true:
// CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP16]], align 4
// CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[TMP17]], align 4
// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]]
// CHECK-NEXT: store float [[TMP19]], float* [[TMP21]], align 4
// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]]
// CHECK-NEXT: store float [[TMP20]], float* [[TMP22]], align 4
// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]]
ENTRY main { ENTRY main {
x = f32[2, 3] parameter(0) x = f32[2, 3] parameter(0)
ROOT sort = f32[2, 3] sort(x), dimensions={1}, to_apply=compare ROOT sort = f32[2, 3] sort(x), dimensions={1}, to_apply=compare
@ -182,193 +193,204 @@ compare {
ROOT lt = pred[] compare(p.1.lhs, p.1.rhs), direction=LT ROOT lt = pred[] compare(p.1.lhs, p.1.rhs), direction=LT
} }
// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) // CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]], i8* noalias align 64 dereferenceable(24) %[[VAL_1:.*]], i8* noalias align 64 dereferenceable(16) %[[VAL_2:.*]]) {
// CHECK-NEXT: entry: // CHECK: entry:
// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 // CHECK: %[[VAL_3:.*]] = alloca i8, align 1
// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 // CHECK: %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x i32]]* // CHECK: %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x i32]]*
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 // CHECK: %[[VAL_6:.*]] = getelementptr inbounds i8, i8* %[[VAL_1]], i64 0
// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* // CHECK: %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to [2 x [3 x float]]*
// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0 // CHECK: %[[VAL_8:.*]] = getelementptr inbounds i8, i8* %[[VAL_2]], i64 0
// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* // CHECK: %[[VAL_9:.*]] = bitcast i8* %[[VAL_8]] to [2 x i8*]*
// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 // CHECK: %[[VAL_10:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP6]] to i64 // CHECK: %[[VAL_11:.*]] = zext i32 %[[VAL_10]] to i64
// CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 // CHECK: %[[VAL_12:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP7]] to i64 // CHECK: %[[VAL_13:.*]] = zext i32 %[[VAL_12]] to i64
// CHECK-NEXT: [[TMP8:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 // CHECK: %[[VAL_14:.*]] = mul nuw nsw i64 %[[VAL_11]], 4
// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP8]], [[THREAD_ID]] // CHECK: %[[VAL_15:.*]] = add nuw nsw i64 %[[VAL_14]], %[[VAL_13]]
// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 // CHECK: %[[VAL_16:.*]] = icmp ult i64 %[[VAL_15]], 4
// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) // CHECK: call void @llvm.assume(i1 %[[VAL_16]])
// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 // CHECK: %[[VAL_17:.*]] = udiv i64 %[[VAL_15]], 1
// CHECK-NEXT: [[TMP10:%.*]] = urem i64 [[TMP9]], 2 // CHECK: %[[VAL_18:.*]] = urem i64 %[[VAL_17]], 2
// CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 // CHECK: %[[VAL_19:.*]] = udiv i64 %[[VAL_15]], 2
// CHECK-NEXT: [[TMP12:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 // CHECK: %[[VAL_20:.*]] = icmp ult i64 %[[VAL_15]], 4
// CHECK-NEXT: br i1 [[TMP12]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] // CHECK: br i1 %[[VAL_20]], label %[[VAL_21:.*]], label %[[VAL_22:.*]]
// CHECK: sort.in_bounds-after: // CHECK: sort.in_bounds-after: ; preds = %[[VAL_23:.*]], %[[VAL_24:.*]]
// CHECK-NEXT: ret void // CHECK: ret void
// CHECK: sort.in_bounds-true: // CHECK: sort.in_bounds-true: ; preds = %[[VAL_24]]
// CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP10]], 2 // CHECK: %[[VAL_25:.*]] = mul i64 %[[VAL_18]], 2
// CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 1 // CHECK: %[[VAL_26:.*]] = xor i64 %[[VAL_25]], 1
// CHECK-NEXT: [[TMP15:%.*]] = icmp slt i64 [[TMP13]], [[TMP14]] // CHECK: %[[VAL_27:.*]] = icmp slt i64 %[[VAL_25]], %[[VAL_26]]
// CHECK-NEXT: [[TMP16:%.*]] = icmp slt i64 [[TMP14]], 3 // CHECK: %[[VAL_28:.*]] = icmp slt i64 %[[VAL_26]], 3
// CHECK-NEXT: [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]] // CHECK: %[[VAL_29:.*]] = and i1 %[[VAL_27]], %[[VAL_28]]
// CHECK-NEXT: br i1 [[TMP17]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] // CHECK: br i1 %[[VAL_29]], label %[[VAL_30:.*]], label %[[VAL_23]]
// CHECK: smaller_comparison_index-after: // CHECK: smaller_comparison_index-after: ; preds = %[[VAL_31:.*]], %[[VAL_21]]
// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] // CHECK: br label %[[VAL_22]]
// CHECK: smaller_comparison_index-true: // CHECK: smaller_comparison_index-true: ; preds = %[[VAL_21]]
// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP14]] // CHECK: %[[VAL_32:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]] // CHECK: %[[VAL_33:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP14]] // CHECK: %[[VAL_34:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]] // CHECK: %[[VAL_35:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
// CHECK-NEXT: call void @region_0_6(i32* [[TMP18]], i32* [[TMP19]], float* [[TMP20]], float* [[TMP21]], i8* [[COMPARE_RETURN_BUFFER]]) // CHECK: call void @region_0_6(i32* %[[VAL_32]], i32* %[[VAL_33]], float* %[[VAL_34]], float* %[[VAL_35]], i8* %[[VAL_3]])
// CHECK-NEXT: [[TMP22:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 // CHECK: %[[VAL_36:.*]] = load i8, i8* %[[VAL_3]], align 1
// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP22]], 0 // CHECK: %[[VAL_37:.*]] = icmp ne i8 %[[VAL_36]], 0
// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] // CHECK: br i1 %[[VAL_37]], label %[[VAL_38:.*]], label %[[VAL_31]]
// CHECK: is_smaller_than-after: // CHECK: is_smaller_than-after: ; preds = %[[VAL_38]], %[[VAL_30]]
// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] // CHECK: br label %[[VAL_23]]
// CHECK: is_smaller_than-true: // CHECK: is_smaller_than-true: ; preds = %[[VAL_30]]
// CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP18]], align 4 // CHECK: %[[VAL_39:.*]] = load i32, i32* %[[VAL_32]], align 4
// CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP19]], align 4 // CHECK: %[[VAL_40:.*]] = load i32, i32* %[[VAL_33]], align 4
// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]] // CHECK: %[[VAL_41:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
// CHECK-NEXT: store i32 [[TMP23]], i32* [[TMP25]], align 4 // CHECK: store i32 %[[VAL_39]], i32* %[[VAL_41]], align 4
// CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP14]] // CHECK: %[[VAL_42:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
// CHECK-NEXT: store i32 [[TMP24]], i32* [[TMP26]], align 4 // CHECK: store i32 %[[VAL_40]], i32* %[[VAL_42]], align 4
// CHECK-NEXT: [[TMP27:%.*]] = load float, float* [[TMP20]], align 4 // CHECK: %[[VAL_43:.*]] = load float, float* %[[VAL_34]], align 4
// CHECK-NEXT: [[TMP28:%.*]] = load float, float* [[TMP21]], align 4 // CHECK: %[[VAL_44:.*]] = load float, float* %[[VAL_35]], align 4
// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]] // CHECK: %[[VAL_45:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
// CHECK-NEXT: store float [[TMP27]], float* [[TMP29]], align 4 // CHECK: store float %[[VAL_43]], float* %[[VAL_45]], align 4
// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP14]] // CHECK: %[[VAL_46:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
// CHECK-NEXT: store float [[TMP28]], float* [[TMP30]], align 4 // CHECK: store float %[[VAL_44]], float* %[[VAL_46]], align 4
// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] // CHECK: br label %[[VAL_31]]
// CHECK: }
// CHECK: ; Function Attrs: nounwind readnone
// CHECK: declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
// CHECK: ; Function Attrs: nounwind readnone
// CHECK: declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
// CHECK: ; Function Attrs: nofree nosync nounwind willreturn
// CHECK: declare void @llvm.assume(i1 noundef) #1
// CHECK: define internal void @region_0_6(i32* dereferenceable(4) %[[VAL_0:.*]], i32* dereferenceable(4) %[[VAL_1:.*]], float* dereferenceable(4) %[[VAL_2:.*]], float* dereferenceable(4) %[[VAL_3:.*]], i8* dereferenceable(1) %[[VAL_4:.*]]) {
// CHECK: entry:
// CHECK: %[[VAL_5:.*]] = alloca i8, align 1
// CHECK: %[[VAL_6:.*]] = load float, float* %[[VAL_2]], align 4
// CHECK: %[[VAL_7:.*]] = load float, float* %[[VAL_3]], align 4
// CHECK: %[[VAL_8:.*]] = fcmp olt float %[[VAL_6]], %[[VAL_7]]
// CHECK: %[[VAL_9:.*]] = zext i1 %[[VAL_8]] to i8
// CHECK: store i8 %[[VAL_9]], i8* %[[VAL_5]], align 1
// CHECK: %[[VAL_10:.*]] = load i8, i8* %[[VAL_5]], align 1
// CHECK: store i8 %[[VAL_10]], i8* %[[VAL_4]], align 1
// CHECK: ret void
// CHECK: }
// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]], i8* noalias align 64 dereferenceable(24) %[[VAL_1:.*]], i8* noalias align 64 dereferenceable(16) %[[VAL_2:.*]]) {
// CHECK: entry:
// CHECK: %[[VAL_3:.*]] = alloca i8, align 1
// CHECK: %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
// CHECK: %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x i32]]*
// CHECK: %[[VAL_6:.*]] = getelementptr inbounds i8, i8* %[[VAL_1]], i64 0
// CHECK: %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to [2 x [3 x float]]*
// CHECK: %[[VAL_8:.*]] = getelementptr inbounds i8, i8* %[[VAL_2]], i64 0
// CHECK: %[[VAL_9:.*]] = bitcast i8* %[[VAL_8]] to [2 x i8*]*
// CHECK: %[[VAL_10:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
// CHECK: %[[VAL_11:.*]] = zext i32 %[[VAL_10]] to i64
// CHECK: %[[VAL_12:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
// CHECK: %[[VAL_13:.*]] = zext i32 %[[VAL_12]] to i64
// CHECK: %[[VAL_14:.*]] = mul nuw nsw i64 %[[VAL_11]], 4
// CHECK: %[[VAL_15:.*]] = add nuw nsw i64 %[[VAL_14]], %[[VAL_13]]
// CHECK: %[[VAL_16:.*]] = icmp ult i64 %[[VAL_15]], 4
// CHECK: call void @llvm.assume(i1 %[[VAL_16]])
// CHECK: %[[VAL_17:.*]] = udiv i64 %[[VAL_15]], 1
// CHECK: %[[VAL_18:.*]] = urem i64 %[[VAL_17]], 2
// CHECK: %[[VAL_19:.*]] = udiv i64 %[[VAL_15]], 2
// CHECK: %[[VAL_20:.*]] = icmp ult i64 %[[VAL_15]], 4
// CHECK: br i1 %[[VAL_20]], label %[[VAL_21:.*]], label %[[VAL_22:.*]]
// CHECK: sort.in_bounds-after: ; preds = %[[VAL_23:.*]], %[[VAL_24:.*]]
// CHECK: ret void
// CHECK: sort.in_bounds-true: ; preds = %[[VAL_24]]
// CHECK: %[[VAL_25:.*]] = xor i64 %[[VAL_18]], 3
// CHECK: %[[VAL_26:.*]] = icmp slt i64 %[[VAL_18]], %[[VAL_25]]
// CHECK: %[[VAL_27:.*]] = icmp slt i64 %[[VAL_25]], 3
// CHECK: %[[VAL_28:.*]] = and i1 %[[VAL_26]], %[[VAL_27]]
// CHECK: br i1 %[[VAL_28]], label %[[VAL_29:.*]], label %[[VAL_23]]
// CHECK: smaller_comparison_index-after: ; preds = %[[VAL_30:.*]], %[[VAL_21]]
// CHECK: br label %[[VAL_22]]
// CHECK: smaller_comparison_index-true: ; preds = %[[VAL_21]]
// CHECK: %[[VAL_31:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
// CHECK: %[[VAL_32:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_18]]
// CHECK: %[[VAL_33:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
// CHECK: %[[VAL_34:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_18]]
// CHECK: call void @region_0_6(i32* %[[VAL_31]], i32* %[[VAL_32]], float* %[[VAL_33]], float* %[[VAL_34]], i8* %[[VAL_3]])
// CHECK: %[[VAL_35:.*]] = load i8, i8* %[[VAL_3]], align 1
// CHECK: %[[VAL_36:.*]] = icmp ne i8 %[[VAL_35]], 0
// CHECK: br i1 %[[VAL_36]], label %[[VAL_37:.*]], label %[[VAL_30]]
// CHECK: is_smaller_than-after: ; preds = %[[VAL_37]], %[[VAL_29]]
// CHECK: br label %[[VAL_23]]
// CHECK: is_smaller_than-true: ; preds = %[[VAL_29]]
// CHECK: %[[VAL_38:.*]] = load i32, i32* %[[VAL_31]], align 4
// CHECK: %[[VAL_39:.*]] = load i32, i32* %[[VAL_32]], align 4
// CHECK: %[[VAL_40:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_18]]
// CHECK: store i32 %[[VAL_38]], i32* %[[VAL_40]], align 4
// CHECK: %[[VAL_41:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
// CHECK: store i32 %[[VAL_39]], i32* %[[VAL_41]], align 4
// CHECK: %[[VAL_42:.*]] = load float, float* %[[VAL_33]], align 4
// CHECK: %[[VAL_43:.*]] = load float, float* %[[VAL_34]], align 4
// CHECK: %[[VAL_44:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_18]]
// CHECK: store float %[[VAL_42]], float* %[[VAL_44]], align 4
// CHECK: %[[VAL_45:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
// CHECK: store float %[[VAL_43]], float* %[[VAL_45]], align 4
// CHECK: br label %[[VAL_30]]
// CHECK: }
// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]], i8* noalias align 64 dereferenceable(24) %[[VAL_1:.*]], i8* noalias align 64 dereferenceable(16) %[[VAL_2:.*]]) {
// CHECK: entry:
// CHECK: %[[VAL_3:.*]] = alloca i8, align 1
// CHECK: %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
// CHECK: %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x i32]]*
// CHECK: %[[VAL_6:.*]] = getelementptr inbounds i8, i8* %[[VAL_1]], i64 0
// CHECK: %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to [2 x [3 x float]]*
// CHECK: %[[VAL_8:.*]] = getelementptr inbounds i8, i8* %[[VAL_2]], i64 0
// CHECK: %[[VAL_9:.*]] = bitcast i8* %[[VAL_8]] to [2 x i8*]*
// CHECK: %[[VAL_10:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
// CHECK: %[[VAL_11:.*]] = zext i32 %[[VAL_10]] to i64
// CHECK: %[[VAL_12:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
// CHECK: %[[VAL_13:.*]] = zext i32 %[[VAL_12]] to i64
// CHECK: %[[VAL_14:.*]] = mul nuw nsw i64 %[[VAL_11]], 4
// CHECK: %[[VAL_15:.*]] = add nuw nsw i64 %[[VAL_14]], %[[VAL_13]]
// CHECK: %[[VAL_16:.*]] = icmp ult i64 %[[VAL_15]], 4
// CHECK: call void @llvm.assume(i1 %[[VAL_16]])
// CHECK: %[[VAL_17:.*]] = udiv i64 %[[VAL_15]], 1
// CHECK: %[[VAL_18:.*]] = urem i64 %[[VAL_17]], 2
// CHECK: %[[VAL_19:.*]] = udiv i64 %[[VAL_15]], 2
// CHECK: %[[VAL_20:.*]] = icmp ult i64 %[[VAL_15]], 4
// CHECK: br i1 %[[VAL_20]], label %[[VAL_21:.*]], label %[[VAL_22:.*]]
// CHECK: sort.in_bounds-after: ; preds = %[[VAL_23:.*]], %[[VAL_24:.*]]
// CHECK: ret void
// CHECK: sort.in_bounds-true: ; preds = %[[VAL_24]]
// CHECK: %[[VAL_25:.*]] = mul i64 %[[VAL_18]], 2
// CHECK: %[[VAL_26:.*]] = xor i64 %[[VAL_25]], 1
// CHECK: %[[VAL_27:.*]] = icmp slt i64 %[[VAL_25]], %[[VAL_26]]
// CHECK: %[[VAL_28:.*]] = icmp slt i64 %[[VAL_26]], 3
// CHECK: %[[VAL_29:.*]] = and i1 %[[VAL_27]], %[[VAL_28]]
// CHECK: br i1 %[[VAL_29]], label %[[VAL_30:.*]], label %[[VAL_23]]
// CHECK: smaller_comparison_index-after: ; preds = %[[VAL_31:.*]], %[[VAL_21]]
// CHECK: br label %[[VAL_22]]
// CHECK: smaller_comparison_index-true: ; preds = %[[VAL_21]]
// CHECK: %[[VAL_32:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
// CHECK: %[[VAL_33:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
// CHECK: %[[VAL_34:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
// CHECK: %[[VAL_35:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
// CHECK: call void @region_0_6(i32* %[[VAL_32]], i32* %[[VAL_33]], float* %[[VAL_34]], float* %[[VAL_35]], i8* %[[VAL_3]])
// CHECK: %[[VAL_36:.*]] = load i8, i8* %[[VAL_3]], align 1
// CHECK: %[[VAL_37:.*]] = icmp ne i8 %[[VAL_36]], 0
// CHECK: br i1 %[[VAL_37]], label %[[VAL_38:.*]], label %[[VAL_31]]
// CHECK: is_smaller_than-after: ; preds = %[[VAL_38]], %[[VAL_30]]
// CHECK: br label %[[VAL_23]]
// CHECK: is_smaller_than-true: ; preds = %[[VAL_30]]
// CHECK: %[[VAL_39:.*]] = load i32, i32* %[[VAL_32]], align 4
// CHECK: %[[VAL_40:.*]] = load i32, i32* %[[VAL_33]], align 4
// CHECK: %[[VAL_41:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
// CHECK: store i32 %[[VAL_39]], i32* %[[VAL_41]], align 4
// CHECK: %[[VAL_42:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
// CHECK: store i32 %[[VAL_40]], i32* %[[VAL_42]], align 4
// CHECK: %[[VAL_43:.*]] = load float, float* %[[VAL_34]], align 4
// CHECK: %[[VAL_44:.*]] = load float, float* %[[VAL_35]], align 4
// CHECK: %[[VAL_45:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
// CHECK: store float %[[VAL_43]], float* %[[VAL_45]], align 4
// CHECK: %[[VAL_46:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
// CHECK: store float %[[VAL_44]], float* %[[VAL_46]], align 4
// CHECK: br label %[[VAL_31]]
// CHECK: }
// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]])
// CHECK-NEXT: entry:
// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1
// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x i32]]*
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0
// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]*
// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0
// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP6]] to i64
// CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP7]] to i64
// CHECK-NEXT: [[TMP8:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4
// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP8]], [[THREAD_ID]]
// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]])
// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 1
// CHECK-NEXT: [[TMP10:%.*]] = urem i64 [[TMP9]], 2
// CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[LINEAR_INDEX]], 2
// CHECK-NEXT: [[TMP12:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
// CHECK-NEXT: br i1 [[TMP12]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]]
// CHECK: sort.in_bounds-after:
// CHECK-NEXT: ret void
// CHECK: sort.in_bounds-true:
// CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP10]], 2
// CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], 1
// CHECK-NEXT: [[TMP19:%.*]] = icmp slt i64 [[TMP17]], [[TMP18]]
// CHECK-NEXT: [[TMP20:%.*]] = icmp slt i64 [[TMP18]], 3
// CHECK-NEXT: [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]]
// CHECK-NEXT: br i1 [[TMP21]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]]
// CHECK: smaller_comparison_index-after:
// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]]
// CHECK: smaller_comparison_index-true:
// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP18]]
// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP17]]
// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP18]]
// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP17]]
// CHECK-NEXT: call void @region_0_6(i32* [[TMP22]], i32* [[TMP23]], float* [[TMP24]], float* [[TMP25]], i8* [[COMPARE_RETURN_BUFFER]])
// CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1
// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP26]], 0
// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]]
// CHECK: is_smaller_than-after:
// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]]
// CHECK: is_smaller_than-true:
// CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP22]], align 4
// CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP23]], align 4
// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP17]]
// CHECK-NEXT: store i32 [[TMP27]], i32* [[TMP29]], align 4
// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP18]]
// CHECK-NEXT: store i32 [[TMP28]], i32* [[TMP30]], align 4
// CHECK-NEXT: [[TMP31:%.*]] = load float, float* [[TMP24]], align 4
// CHECK-NEXT: [[TMP32:%.*]] = load float, float* [[TMP25]], align 4
// CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP17]]
// CHECK-NEXT: store float [[TMP31]], float* [[TMP33]], align 4
// CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP18]]
// CHECK-NEXT: store float [[TMP32]], float* [[TMP34]], align 4
// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]]
ENTRY main { ENTRY main {
x = s32[2, 3] parameter(0) x = s32[2, 3] parameter(0)
y = f32[2, 3] parameter(1) y = f32[2, 3] parameter(1)