diff --git a/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo b/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo index 2b22144130a..8e4e8bfff81 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo +++ b/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo @@ -8,163 +8,174 @@ compare { ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT } -// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]]) -// CHECK-NEXT: entry: -// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x float]]* -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 -// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* -// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 -// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP4]] to i64 -// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 -// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP5]] to i64 -// CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 -// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP6]], [[THREAD_ID]] -// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) -// CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 -// CHECK-NEXT: [[TMP8:%.*]] = urem i64 [[TMP7]], 2 -// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 -// CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: br i1 [[TMP10]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] -// CHECK: sort.in_bounds-after: -// CHECK-NEXT: ret void -// CHECK: sort.in_bounds-true: -// CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP8]], 2 -// CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 1 -// CHECK-NEXT: [[TMP13:%.*]] = icmp slt i64 [[TMP11]], [[TMP12]] -// CHECK-NEXT: [[TMP14:%.*]] = icmp slt i64 [[TMP12]], 3 -// CHECK-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]] -// CHECK-NEXT: br i1 [[TMP15]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] -// CHECK: smaller_comparison_index-after: -// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] -// CHECK: smaller_comparison_index-true: -// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]] -// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] -// CHECK-NEXT: call void @region_0_4(float* [[TMP16]], float* [[TMP17]], i8* [[COMPARE_RETURN_BUFFER]]) -// CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 -// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP18]], 0 -// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] -// CHECK: is_smaller_than-after: -// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] -// CHECK: is_smaller_than-true: -// CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP16]], align 4 -// CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[TMP17]], align 4 -// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] -// CHECK-NEXT: store float [[TMP19]], float* [[TMP21]], align 4 -// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]] -// CHECK-NEXT: store float [[TMP20]], float* [[TMP22]], align 4 -// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] +// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]]) { +// CHECK: entry: +// CHECK: %[[VAL_1:.*]] = alloca i8, align 1 +// CHECK: %[[VAL_2:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0 +// CHECK: %[[VAL_3:.*]] = bitcast i8* %[[VAL_2]] to [2 x [3 x float]]* +// CHECK: %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0 +// CHECK: %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x float]]* +// CHECK: %[[VAL_6:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK: %[[VAL_7:.*]] = zext i32 %[[VAL_6]] to i64 +// CHECK: %[[VAL_8:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK: %[[VAL_9:.*]] = zext i32 %[[VAL_8]] to i64 +// CHECK: %[[VAL_10:.*]] = mul nuw nsw i64 %[[VAL_7]], 4 +// CHECK: %[[VAL_11:.*]] = add nuw nsw i64 %[[VAL_10]], %[[VAL_9]] +// CHECK: %[[VAL_12:.*]] = icmp ult i64 %[[VAL_11]], 4 +// CHECK: call void @llvm.assume(i1 %[[VAL_12]]) +// CHECK: %[[VAL_13:.*]] = udiv i64 %[[VAL_11]], 1 +// CHECK: %[[VAL_14:.*]] = urem i64 %[[VAL_13]], 2 +// CHECK: %[[VAL_15:.*]] = udiv i64 %[[VAL_11]], 2 +// CHECK: %[[VAL_16:.*]] = icmp ult i64 %[[VAL_11]], 4 +// CHECK: br i1 %[[VAL_16]], label %[[VAL_17:.*]], label %[[VAL_18:.*]] +// CHECK: sort.in_bounds-after: ; preds = %[[VAL_19:.*]], %[[VAL_20:.*]] +// CHECK: ret void +// CHECK: sort.in_bounds-true: ; preds = %[[VAL_20]] +// CHECK: %[[VAL_21:.*]] = mul i64 %[[VAL_14]], 2 +// CHECK: %[[VAL_22:.*]] = xor i64 %[[VAL_21]], 1 +// CHECK: %[[VAL_23:.*]] = icmp slt i64 %[[VAL_21]], %[[VAL_22]] +// CHECK: %[[VAL_24:.*]] = icmp slt i64 %[[VAL_22]], 3 +// CHECK: %[[VAL_25:.*]] = and i1 %[[VAL_23]], %[[VAL_24]] +// CHECK: br i1 %[[VAL_25]], label %[[VAL_26:.*]], label %[[VAL_19]] +// CHECK: smaller_comparison_index-after: ; preds = %[[VAL_27:.*]], %[[VAL_17]] +// CHECK: br label %[[VAL_18]] +// CHECK: smaller_comparison_index-true: ; preds = %[[VAL_17]] +// CHECK: %[[VAL_28:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_22]] +// CHECK: %[[VAL_29:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]] +// CHECK: call void @region_0_4(float* %[[VAL_28]], float* %[[VAL_29]], i8* %[[VAL_1]]) +// CHECK: %[[VAL_30:.*]] = load i8, i8* %[[VAL_1]], align 1 +// CHECK: %[[VAL_31:.*]] = icmp ne i8 %[[VAL_30]], 0 +// CHECK: br i1 %[[VAL_31]], label %[[VAL_32:.*]], label %[[VAL_27]] +// CHECK: is_smaller_than-after: ; preds = %[[VAL_32]], %[[VAL_26]] +// CHECK: br label %[[VAL_19]] +// CHECK: is_smaller_than-true: ; preds = %[[VAL_26]] +// CHECK: %[[VAL_33:.*]] = load float, float* %[[VAL_28]], align 4 +// CHECK: %[[VAL_34:.*]] = load float, float* %[[VAL_29]], align 4 +// CHECK: %[[VAL_35:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]] +// CHECK: store float %[[VAL_33]], float* %[[VAL_35]], align 4 +// CHECK: %[[VAL_36:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_22]] +// CHECK: store float %[[VAL_34]], float* %[[VAL_36]], align 4 +// CHECK: br label %[[VAL_27]] +// CHECK: } +// CHECK: ; Function Attrs: nounwind readnone +// CHECK: declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0 +// CHECK: ; Function Attrs: nounwind readnone +// CHECK: declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 +// CHECK: ; Function Attrs: nofree nosync nounwind willreturn +// CHECK: declare void @llvm.assume(i1 noundef) #1 -// CHECK: define internal void @region_0_4(float* dereferenceable(4) [[P_0_LHS_TYPED:%.*]], float* dereferenceable(4) [[P_0_RHS_TYPED:%.*]], i8* dereferenceable(1) [[OUTPUT_ARG:%.*]]) -// CHECK-NEXT: entry: -// CHECK-NEXT: [[COMPARE_3_TYPED:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARG_0_1_TYPED:%.*]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARG_1_2_TYPED:%.*]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = fcmp olt float [[TMP0]], [[TMP1]] -// CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i8 -// CHECK-NEXT: store i8 [[TMP3]], i8* [[COMPARE_3_TYPED]], align 1 -// CHECK-NEXT: [[LOAD_RET_VALUE:%.*]] = load i8, i8* [[COMPARE_3_TYPED]], align 1 -// CHECK-NEXT: store i8 [[LOAD_RET_VALUE]], i8* [[OUTPUT_ARG:%.*]], align 1 -// CHECK-NEXT: ret void +// CHECK: define internal void @region_0_4(float* dereferenceable(4) %[[VAL_0:.*]], float* dereferenceable(4) %[[VAL_1:.*]], i8* dereferenceable(1) %[[VAL_2:.*]]) { +// CHECK: entry: +// CHECK: %[[VAL_3:.*]] = alloca i8, align 1 +// CHECK: %[[VAL_4:.*]] = load float, float* %[[VAL_0]], align 4 +// CHECK: %[[VAL_5:.*]] = load float, float* %[[VAL_1]], align 4 +// CHECK: %[[VAL_6:.*]] = fcmp olt float %[[VAL_4]], %[[VAL_5]] +// CHECK: %[[VAL_7:.*]] = zext i1 %[[VAL_6]] to i8 +// CHECK: store i8 %[[VAL_7]], i8* %[[VAL_3]], align 1 +// CHECK: %[[VAL_8:.*]] = load i8, i8* %[[VAL_3]], align 1 +// CHECK: store i8 %[[VAL_8]], i8* %[[VAL_2]], align 1 +// CHECK: ret void +// CHECK: } -// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]]) { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x float]]* -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 -// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* -// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 -// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP4]] to i64 -// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 -// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP5]] to i64 -// CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 -// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP6]], [[THREAD_ID]] -// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) -// CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 -// CHECK-NEXT: [[TMP8:%.*]] = urem i64 [[TMP7]], 2 -// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 -// CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: br i1 [[TMP10]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] -// CHECK: sort.in_bounds-after: -// CHECK-NEXT: ret void -// CHECK: sort.in_bounds-true: -// CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP8]], 3 -// CHECK-NEXT: [[TMP12:%.*]] = icmp slt i64 [[TMP8]], [[TMP11]] -// CHECK-NEXT: [[TMP13:%.*]] = icmp slt i64 [[TMP11]], 3 -// CHECK-NEXT: [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]] -// CHECK-NEXT: br i1 [[TMP14]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] -// CHECK: smaller_comparison_index-after: -// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] -// CHECK: smaller_comparison_index-true: -// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] -// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP8]] -// CHECK-NEXT: call void @region_0_4(float* [[TMP15]], float* [[TMP16]], i8* [[COMPARE_RETURN_BUFFER]]) -// CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 -// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP17]], 0 -// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] -// CHECK: is_smaller_than-after: -// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] -// CHECK: is_smaller_than-true: -// CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[TMP15]], align 4 -// CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP16]], align 4 -// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP8]] -// CHECK-NEXT: store float [[TMP18]], float* [[TMP20]], align 4 -// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] -// CHECK-NEXT: store float [[TMP19]], float* [[TMP21]], align 4 -// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] +// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]]) { +// CHECK: entry: +// CHECK: %[[VAL_1:.*]] = alloca i8, align 1 +// CHECK: %[[VAL_2:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0 +// CHECK: %[[VAL_3:.*]] = bitcast i8* %[[VAL_2]] to [2 x [3 x float]]* +// CHECK: %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0 +// CHECK: %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x float]]* +// CHECK: %[[VAL_6:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK: %[[VAL_7:.*]] = zext i32 %[[VAL_6]] to i64 +// CHECK: %[[VAL_8:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK: %[[VAL_9:.*]] = zext i32 %[[VAL_8]] to i64 +// CHECK: %[[VAL_10:.*]] = mul nuw nsw i64 %[[VAL_7]], 4 +// CHECK: %[[VAL_11:.*]] = add nuw nsw i64 %[[VAL_10]], %[[VAL_9]] +// CHECK: %[[VAL_12:.*]] = icmp ult i64 %[[VAL_11]], 4 +// CHECK: call void @llvm.assume(i1 %[[VAL_12]]) +// CHECK: %[[VAL_13:.*]] = udiv i64 %[[VAL_11]], 1 +// CHECK: %[[VAL_14:.*]] = urem i64 %[[VAL_13]], 2 +// CHECK: %[[VAL_15:.*]] = udiv i64 %[[VAL_11]], 2 +// CHECK: %[[VAL_16:.*]] = icmp ult i64 %[[VAL_11]], 4 +// CHECK: br i1 %[[VAL_16]], label %[[VAL_17:.*]], label %[[VAL_18:.*]] +// CHECK: sort.in_bounds-after: ; preds = %[[VAL_19:.*]], %[[VAL_20:.*]] +// CHECK: ret void +// CHECK: sort.in_bounds-true: ; preds = %[[VAL_20]] +// CHECK: %[[VAL_21:.*]] = xor i64 %[[VAL_14]], 3 +// CHECK: %[[VAL_22:.*]] = icmp slt i64 %[[VAL_14]], %[[VAL_21]] +// CHECK: %[[VAL_23:.*]] = icmp slt i64 %[[VAL_21]], 3 +// CHECK: %[[VAL_24:.*]] = and i1 %[[VAL_22]], %[[VAL_23]] +// CHECK: br i1 %[[VAL_24]], label %[[VAL_25:.*]], label %[[VAL_19]] +// CHECK: smaller_comparison_index-after: ; preds = %[[VAL_26:.*]], %[[VAL_17]] +// CHECK: br label %[[VAL_18]] +// CHECK: smaller_comparison_index-true: ; preds = %[[VAL_17]] +// CHECK: %[[VAL_27:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]] +// CHECK: %[[VAL_28:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_14]] +// CHECK: call void @region_0_4(float* %[[VAL_27]], float* %[[VAL_28]], i8* %[[VAL_1]]) +// CHECK: %[[VAL_29:.*]] = load i8, i8* %[[VAL_1]], align 1 +// CHECK: %[[VAL_30:.*]] = icmp ne i8 %[[VAL_29]], 0 +// CHECK: br i1 %[[VAL_30]], label %[[VAL_31:.*]], label %[[VAL_26]] +// CHECK: is_smaller_than-after: ; preds = %[[VAL_31]], %[[VAL_25]] +// CHECK: br label %[[VAL_19]] +// CHECK: is_smaller_than-true: ; preds = %[[VAL_25]] +// CHECK: %[[VAL_32:.*]] = load float, float* %[[VAL_27]], align 4 +// CHECK: %[[VAL_33:.*]] = load float, float* %[[VAL_28]], align 4 +// CHECK: %[[VAL_34:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_14]] +// CHECK: store float %[[VAL_32]], float* %[[VAL_34]], align 4 +// CHECK: %[[VAL_35:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]] +// CHECK: store float %[[VAL_33]], float* %[[VAL_35]], align 4 +// CHECK: br label %[[VAL_26]] +// CHECK: } + +// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]]) { +// CHECK: entry: +// CHECK: %[[VAL_1:.*]] = alloca i8, align 1 +// CHECK: %[[VAL_2:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0 +// CHECK: %[[VAL_3:.*]] = bitcast i8* %[[VAL_2]] to [2 x [3 x float]]* +// CHECK: %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0 +// CHECK: %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x float]]* +// CHECK: %[[VAL_6:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK: %[[VAL_7:.*]] = zext i32 %[[VAL_6]] to i64 +// CHECK: %[[VAL_8:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK: %[[VAL_9:.*]] = zext i32 %[[VAL_8]] to i64 +// CHECK: %[[VAL_10:.*]] = mul nuw nsw i64 %[[VAL_7]], 4 +// CHECK: %[[VAL_11:.*]] = add nuw nsw i64 %[[VAL_10]], %[[VAL_9]] +// CHECK: %[[VAL_12:.*]] = icmp ult i64 %[[VAL_11]], 4 +// CHECK: call void @llvm.assume(i1 %[[VAL_12]]) +// CHECK: %[[VAL_13:.*]] = udiv i64 %[[VAL_11]], 1 +// CHECK: %[[VAL_14:.*]] = urem i64 %[[VAL_13]], 2 +// CHECK: %[[VAL_15:.*]] = udiv i64 %[[VAL_11]], 2 +// CHECK: %[[VAL_16:.*]] = icmp ult i64 %[[VAL_11]], 4 +// CHECK: br i1 %[[VAL_16]], label %[[VAL_17:.*]], label %[[VAL_18:.*]] +// CHECK: sort.in_bounds-after: ; preds = %[[VAL_19:.*]], %[[VAL_20:.*]] +// CHECK: ret void +// CHECK: sort.in_bounds-true: ; preds = %[[VAL_20]] +// CHECK: %[[VAL_21:.*]] = mul i64 %[[VAL_14]], 2 +// CHECK: %[[VAL_22:.*]] = xor i64 %[[VAL_21]], 1 +// CHECK: %[[VAL_23:.*]] = icmp slt i64 %[[VAL_21]], %[[VAL_22]] +// CHECK: %[[VAL_24:.*]] = icmp slt i64 %[[VAL_22]], 3 +// CHECK: %[[VAL_25:.*]] = and i1 %[[VAL_23]], %[[VAL_24]] +// CHECK: br i1 %[[VAL_25]], label %[[VAL_26:.*]], label %[[VAL_19]] +// CHECK: smaller_comparison_index-after: ; preds = %[[VAL_27:.*]], %[[VAL_17]] +// CHECK: br label %[[VAL_18]] +// CHECK: smaller_comparison_index-true: ; preds = %[[VAL_17]] +// CHECK: %[[VAL_28:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_22]] +// CHECK: %[[VAL_29:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]] +// CHECK: call void @region_0_4(float* %[[VAL_28]], float* %[[VAL_29]], i8* %[[VAL_1]]) +// CHECK: %[[VAL_30:.*]] = load i8, i8* %[[VAL_1]], align 1 +// CHECK: %[[VAL_31:.*]] = icmp ne i8 %[[VAL_30]], 0 +// CHECK: br i1 %[[VAL_31]], label %[[VAL_32:.*]], label %[[VAL_27]] +// CHECK: is_smaller_than-after: ; preds = %[[VAL_32]], %[[VAL_26]] +// CHECK: br label %[[VAL_19]] +// CHECK: is_smaller_than-true: ; preds = %[[VAL_26]] +// CHECK: %[[VAL_33:.*]] = load float, float* %[[VAL_28]], align 4 +// CHECK: %[[VAL_34:.*]] = load float, float* %[[VAL_29]], align 4 +// CHECK: %[[VAL_35:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]] +// CHECK: store float %[[VAL_33]], float* %[[VAL_35]], align 4 +// CHECK: %[[VAL_36:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_22]] +// CHECK: store float %[[VAL_34]], float* %[[VAL_36]], align 4 +// CHECK: br label %[[VAL_27]] +// CHECK: } -// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]]) { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x float]]* -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 -// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* -// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 -// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP4]] to i64 -// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 -// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP5]] to i64 -// CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 -// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP6]], [[THREAD_ID]] -// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) -// CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 -// CHECK-NEXT: [[TMP8:%.*]] = urem i64 [[TMP7]], 2 -// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 -// CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: br i1 [[TMP10]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] -// CHECK: sort.in_bounds-after: -// CHECK-NEXT: ret void -// CHECK: sort.in_bounds-true: -// CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP8]], 2 -// CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 1 -// CHECK-NEXT: [[TMP13:%.*]] = icmp slt i64 [[TMP11]], [[TMP12]] -// CHECK-NEXT: [[TMP14:%.*]] = icmp slt i64 [[TMP12]], 3 -// CHECK-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]] -// CHECK-NEXT: br i1 [[TMP15]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] -// CHECK: smaller_comparison_index-after: -// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] -// CHECK: smaller_comparison_index-true: -// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]] -// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] -// CHECK-NEXT: call void @region_0_4(float* [[TMP16]], float* [[TMP17]], i8* [[COMPARE_RETURN_BUFFER]]) -// CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 -// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP18]], 0 -// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] -// CHECK: is_smaller_than-after: -// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] -// CHECK: is_smaller_than-true: -// CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP16]], align 4 -// CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[TMP17]], align 4 -// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] -// CHECK-NEXT: store float [[TMP19]], float* [[TMP21]], align 4 -// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]] -// CHECK-NEXT: store float [[TMP20]], float* [[TMP22]], align 4 -// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] ENTRY main { x = f32[2, 3] parameter(0) ROOT sort = f32[2, 3] sort(x), dimensions={1}, to_apply=compare @@ -182,193 +193,204 @@ compare { ROOT lt = pred[] compare(p.1.lhs, p.1.rhs), direction=LT } -// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) -// CHECK-NEXT: entry: -// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x i32]]* -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 -// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0 -// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 -// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP6]] to i64 -// CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 -// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK-NEXT: [[TMP8:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 -// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP8]], [[THREAD_ID]] -// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) -// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 -// CHECK-NEXT: [[TMP10:%.*]] = urem i64 [[TMP9]], 2 -// CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 -// CHECK-NEXT: [[TMP12:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: br i1 [[TMP12]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] -// CHECK: sort.in_bounds-after: -// CHECK-NEXT: ret void -// CHECK: sort.in_bounds-true: -// CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP10]], 2 -// CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 1 -// CHECK-NEXT: [[TMP15:%.*]] = icmp slt i64 [[TMP13]], [[TMP14]] -// CHECK-NEXT: [[TMP16:%.*]] = icmp slt i64 [[TMP14]], 3 -// CHECK-NEXT: [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]] -// CHECK-NEXT: br i1 [[TMP17]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] -// CHECK: smaller_comparison_index-after: -// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] -// CHECK: smaller_comparison_index-true: -// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP14]] -// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]] -// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP14]] -// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]] -// CHECK-NEXT: call void @region_0_6(i32* [[TMP18]], i32* [[TMP19]], float* [[TMP20]], float* [[TMP21]], i8* [[COMPARE_RETURN_BUFFER]]) -// CHECK-NEXT: [[TMP22:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 -// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP22]], 0 -// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] -// CHECK: is_smaller_than-after: -// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] -// CHECK: is_smaller_than-true: -// CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP18]], align 4 -// CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP19]], align 4 -// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]] -// CHECK-NEXT: store i32 [[TMP23]], i32* [[TMP25]], align 4 -// CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP14]] -// CHECK-NEXT: store i32 [[TMP24]], i32* [[TMP26]], align 4 -// CHECK-NEXT: [[TMP27:%.*]] = load float, float* [[TMP20]], align 4 -// CHECK-NEXT: [[TMP28:%.*]] = load float, float* [[TMP21]], align 4 -// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]] -// CHECK-NEXT: store float [[TMP27]], float* [[TMP29]], align 4 -// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP14]] -// CHECK-NEXT: store float [[TMP28]], float* [[TMP30]], align 4 -// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] +// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]], i8* noalias align 64 dereferenceable(24) %[[VAL_1:.*]], i8* noalias align 64 dereferenceable(16) %[[VAL_2:.*]]) { +// CHECK: entry: +// CHECK: %[[VAL_3:.*]] = alloca i8, align 1 +// CHECK: %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0 +// CHECK: %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x i32]]* +// CHECK: %[[VAL_6:.*]] = getelementptr inbounds i8, i8* %[[VAL_1]], i64 0 +// CHECK: %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to [2 x [3 x float]]* +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds i8, i8* %[[VAL_2]], i64 0 +// CHECK: %[[VAL_9:.*]] = bitcast i8* %[[VAL_8]] to [2 x i8*]* +// CHECK: %[[VAL_10:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK: %[[VAL_11:.*]] = zext i32 %[[VAL_10]] to i64 +// CHECK: %[[VAL_12:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK: %[[VAL_13:.*]] = zext i32 %[[VAL_12]] to i64 +// CHECK: %[[VAL_14:.*]] = mul nuw nsw i64 %[[VAL_11]], 4 +// CHECK: %[[VAL_15:.*]] = add nuw nsw i64 %[[VAL_14]], %[[VAL_13]] +// CHECK: %[[VAL_16:.*]] = icmp ult i64 %[[VAL_15]], 4 +// CHECK: call void @llvm.assume(i1 %[[VAL_16]]) +// CHECK: %[[VAL_17:.*]] = udiv i64 %[[VAL_15]], 1 +// CHECK: %[[VAL_18:.*]] = urem i64 %[[VAL_17]], 2 +// CHECK: %[[VAL_19:.*]] = udiv i64 %[[VAL_15]], 2 +// CHECK: %[[VAL_20:.*]] = icmp ult i64 %[[VAL_15]], 4 +// CHECK: br i1 %[[VAL_20]], label %[[VAL_21:.*]], label %[[VAL_22:.*]] +// CHECK: sort.in_bounds-after: ; preds = %[[VAL_23:.*]], %[[VAL_24:.*]] +// CHECK: ret void +// CHECK: sort.in_bounds-true: ; preds = %[[VAL_24]] +// CHECK: %[[VAL_25:.*]] = mul i64 %[[VAL_18]], 2 +// CHECK: %[[VAL_26:.*]] = xor i64 %[[VAL_25]], 1 +// CHECK: %[[VAL_27:.*]] = icmp slt i64 %[[VAL_25]], %[[VAL_26]] +// CHECK: %[[VAL_28:.*]] = icmp slt i64 %[[VAL_26]], 3 +// CHECK: %[[VAL_29:.*]] = and i1 %[[VAL_27]], %[[VAL_28]] +// CHECK: br i1 %[[VAL_29]], label %[[VAL_30:.*]], label %[[VAL_23]] +// CHECK: smaller_comparison_index-after: ; preds = %[[VAL_31:.*]], %[[VAL_21]] +// CHECK: br label %[[VAL_22]] +// CHECK: smaller_comparison_index-true: ; preds = %[[VAL_21]] +// CHECK: %[[VAL_32:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]] +// CHECK: %[[VAL_33:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]] +// CHECK: %[[VAL_34:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]] +// CHECK: %[[VAL_35:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]] +// CHECK: call void @region_0_6(i32* %[[VAL_32]], i32* %[[VAL_33]], float* %[[VAL_34]], float* %[[VAL_35]], i8* %[[VAL_3]]) +// CHECK: %[[VAL_36:.*]] = load i8, i8* %[[VAL_3]], align 1 +// CHECK: %[[VAL_37:.*]] = icmp ne i8 %[[VAL_36]], 0 +// CHECK: br i1 %[[VAL_37]], label %[[VAL_38:.*]], label %[[VAL_31]] +// CHECK: is_smaller_than-after: ; preds = %[[VAL_38]], %[[VAL_30]] +// CHECK: br label %[[VAL_23]] +// CHECK: is_smaller_than-true: ; preds = %[[VAL_30]] +// CHECK: %[[VAL_39:.*]] = load i32, i32* %[[VAL_32]], align 4 +// CHECK: %[[VAL_40:.*]] = load i32, i32* %[[VAL_33]], align 4 +// CHECK: %[[VAL_41:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]] +// CHECK: store i32 %[[VAL_39]], i32* %[[VAL_41]], align 4 +// CHECK: %[[VAL_42:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]] +// CHECK: store i32 %[[VAL_40]], i32* %[[VAL_42]], align 4 +// CHECK: %[[VAL_43:.*]] = load float, float* %[[VAL_34]], align 4 +// CHECK: %[[VAL_44:.*]] = load float, float* %[[VAL_35]], align 4 +// CHECK: %[[VAL_45:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]] +// CHECK: store float %[[VAL_43]], float* %[[VAL_45]], align 4 +// CHECK: %[[VAL_46:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]] +// CHECK: store float %[[VAL_44]], float* %[[VAL_46]], align 4 +// CHECK: br label %[[VAL_31]] +// CHECK: } +// CHECK: ; Function Attrs: nounwind readnone +// CHECK: declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0 +// CHECK: ; Function Attrs: nounwind readnone +// CHECK: declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 +// CHECK: ; Function Attrs: nofree nosync nounwind willreturn +// CHECK: declare void @llvm.assume(i1 noundef) #1 -// CHECK: define internal void @region_0_6(i32* dereferenceable(4) [[P_0_LHS_TYPED:%.*]], i32* dereferenceable(4) [[P_0_RHS_TYPED:%.*]], float* dereferenceable(4) [[P_1_LHS_TYPED:%.*]], float* dereferenceable(4) [[P_1_RHS_TYPED:%.*]], i8* dereferenceable(1) [[OUTPUT_ARG:%.*]]) -// CHECK-NEXT: entry: -// CHECK-NEXT: [[COMPARE_5_TYPED:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARG_2_3_TYPED:%.*]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARG_3_4_TYPED:%.*]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = fcmp olt float [[TMP0]], [[TMP1]] -// CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i8 -// CHECK-NEXT: store i8 [[TMP3]], i8* [[COMPARE_5_TYPED]], align 1 -// CHECK-NEXT: [[LOAD_RET_VALUE:%.*]] = load i8, i8* [[COMPARE_5_TYPED]], align 1 -// CHECK-NEXT: store i8 [[LOAD_RET_VALUE]], i8* [[OUTPUT_ARG:%.*]], align 1 -// CHECK-NEXT: ret void +// CHECK: define internal void @region_0_6(i32* dereferenceable(4) %[[VAL_0:.*]], i32* dereferenceable(4) %[[VAL_1:.*]], float* dereferenceable(4) %[[VAL_2:.*]], float* dereferenceable(4) %[[VAL_3:.*]], i8* dereferenceable(1) %[[VAL_4:.*]]) { +// CHECK: entry: +// CHECK: %[[VAL_5:.*]] = alloca i8, align 1 +// CHECK: %[[VAL_6:.*]] = load float, float* %[[VAL_2]], align 4 +// CHECK: %[[VAL_7:.*]] = load float, float* %[[VAL_3]], align 4 +// CHECK: %[[VAL_8:.*]] = fcmp olt float %[[VAL_6]], %[[VAL_7]] +// CHECK: %[[VAL_9:.*]] = zext i1 %[[VAL_8]] to i8 +// CHECK: store i8 %[[VAL_9]], i8* %[[VAL_5]], align 1 +// CHECK: %[[VAL_10:.*]] = load i8, i8* %[[VAL_5]], align 1 +// CHECK: store i8 %[[VAL_10]], i8* %[[VAL_4]], align 1 +// CHECK: ret void +// CHECK: } -// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) -// CHECK-NEXT: entry: -// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x i32]]* -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 -// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0 -// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 -// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP6]] to i64 -// CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 -// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK-NEXT: [[TMP8:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 -// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP8]], [[THREAD_ID]] -// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) -// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 -// CHECK-NEXT: [[TMP10:%.*]] = urem i64 [[TMP9]], 2 -// CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 -// CHECK-NEXT: [[TMP12:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: br i1 [[TMP12]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] -// CHECK: sort.in_bounds-after: -// CHECK-NEXT: ret void -// CHECK: sort.in_bounds-true: -// CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP10]], 3 -// CHECK-NEXT: [[TMP14:%.*]] = icmp slt i64 [[TMP10]], [[TMP13]] -// CHECK-NEXT: [[TMP15:%.*]] = icmp slt i64 [[TMP13]], 3 -// CHECK-NEXT: [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]] -// CHECK-NEXT: br i1 [[TMP16]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] -// CHECK: smaller_comparison_index-after: -// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] -// CHECK: smaller_comparison_index-true: -// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]] -// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP10]] -// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]] -// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP10]] -// CHECK-NEXT: call void @region_0_6(i32* [[TMP17]], i32* [[TMP18]], float* [[TMP19]], float* [[TMP20]], i8* [[COMPARE_RETURN_BUFFER]]) -// CHECK-NEXT: [[TMP21:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 -// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP21]], 0 -// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] -// CHECK: is_smaller_than-after: -// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] -// CHECK: is_smaller_than-true: -// CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP18]], align 4 -// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP10]] -// CHECK-NEXT: store i32 [[TMP22]], i32* [[TMP24]], align 4 -// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]] -// CHECK-NEXT: store i32 [[TMP23]], i32* [[TMP25]], align 4 -// CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[TMP19]], align 4 -// CHECK-NEXT: [[TMP27:%.*]] = load float, float* [[TMP20]], align 4 -// CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP10]] -// CHECK-NEXT: store float [[TMP26]], float* [[TMP28]], align 4 -// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]] -// CHECK-NEXT: store float [[TMP27]], float* [[TMP29]], align 4 -// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] +// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]], i8* noalias align 64 dereferenceable(24) %[[VAL_1:.*]], i8* noalias align 64 dereferenceable(16) %[[VAL_2:.*]]) { +// CHECK: entry: +// CHECK: %[[VAL_3:.*]] = alloca i8, align 1 +// CHECK: %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0 +// CHECK: %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x i32]]* +// CHECK: %[[VAL_6:.*]] = getelementptr inbounds i8, i8* %[[VAL_1]], i64 0 +// CHECK: %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to [2 x [3 x float]]* +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds i8, i8* %[[VAL_2]], i64 0 +// CHECK: %[[VAL_9:.*]] = bitcast i8* %[[VAL_8]] to [2 x i8*]* +// CHECK: %[[VAL_10:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK: %[[VAL_11:.*]] = zext i32 %[[VAL_10]] to i64 +// CHECK: %[[VAL_12:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK: %[[VAL_13:.*]] = zext i32 %[[VAL_12]] to i64 +// CHECK: %[[VAL_14:.*]] = mul nuw nsw i64 %[[VAL_11]], 4 +// CHECK: %[[VAL_15:.*]] = add nuw nsw i64 %[[VAL_14]], %[[VAL_13]] +// CHECK: %[[VAL_16:.*]] = icmp ult i64 %[[VAL_15]], 4 +// CHECK: call void @llvm.assume(i1 %[[VAL_16]]) +// CHECK: %[[VAL_17:.*]] = udiv i64 %[[VAL_15]], 1 +// CHECK: %[[VAL_18:.*]] = urem i64 %[[VAL_17]], 2 +// CHECK: %[[VAL_19:.*]] = udiv i64 %[[VAL_15]], 2 +// CHECK: %[[VAL_20:.*]] = icmp ult i64 %[[VAL_15]], 4 +// CHECK: br i1 %[[VAL_20]], label %[[VAL_21:.*]], label %[[VAL_22:.*]] +// CHECK: sort.in_bounds-after: ; preds = %[[VAL_23:.*]], %[[VAL_24:.*]] +// CHECK: ret void +// CHECK: sort.in_bounds-true: ; preds = %[[VAL_24]] +// CHECK: %[[VAL_25:.*]] = xor i64 %[[VAL_18]], 3 +// CHECK: %[[VAL_26:.*]] = icmp slt i64 %[[VAL_18]], %[[VAL_25]] +// CHECK: %[[VAL_27:.*]] = icmp slt i64 %[[VAL_25]], 3 +// CHECK: %[[VAL_28:.*]] = and i1 %[[VAL_26]], %[[VAL_27]] +// CHECK: br i1 %[[VAL_28]], label %[[VAL_29:.*]], label %[[VAL_23]] +// CHECK: smaller_comparison_index-after: ; preds = %[[VAL_30:.*]], %[[VAL_21]] +// CHECK: br label %[[VAL_22]] +// CHECK: smaller_comparison_index-true: ; preds = %[[VAL_21]] +// CHECK: %[[VAL_31:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]] +// CHECK: %[[VAL_32:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_18]] +// CHECK: %[[VAL_33:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]] +// CHECK: %[[VAL_34:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_18]] +// CHECK: call void @region_0_6(i32* %[[VAL_31]], i32* %[[VAL_32]], float* %[[VAL_33]], float* %[[VAL_34]], i8* %[[VAL_3]]) +// CHECK: %[[VAL_35:.*]] = load i8, i8* %[[VAL_3]], align 1 +// CHECK: %[[VAL_36:.*]] = icmp ne i8 %[[VAL_35]], 0 +// CHECK: br i1 %[[VAL_36]], label %[[VAL_37:.*]], label %[[VAL_30]] +// CHECK: is_smaller_than-after: ; preds = %[[VAL_37]], %[[VAL_29]] +// CHECK: br label %[[VAL_23]] +// CHECK: is_smaller_than-true: ; preds = %[[VAL_29]] +// CHECK: %[[VAL_38:.*]] = load i32, i32* %[[VAL_31]], align 4 +// CHECK: %[[VAL_39:.*]] = load i32, i32* %[[VAL_32]], align 4 +// CHECK: %[[VAL_40:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_18]] +// CHECK: store i32 %[[VAL_38]], i32* %[[VAL_40]], align 4 +// CHECK: %[[VAL_41:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]] +// CHECK: store i32 %[[VAL_39]], i32* %[[VAL_41]], align 4 +// CHECK: %[[VAL_42:.*]] = load float, float* %[[VAL_33]], align 4 +// CHECK: %[[VAL_43:.*]] = load float, float* %[[VAL_34]], align 4 +// CHECK: %[[VAL_44:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_18]] +// CHECK: store float %[[VAL_42]], float* %[[VAL_44]], align 4 +// CHECK: %[[VAL_45:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]] +// CHECK: store float %[[VAL_43]], float* %[[VAL_45]], align 4 +// CHECK: br label %[[VAL_30]] +// CHECK: } + +// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]], i8* noalias align 64 dereferenceable(24) %[[VAL_1:.*]], i8* noalias align 64 dereferenceable(16) %[[VAL_2:.*]]) { +// CHECK: entry: +// CHECK: %[[VAL_3:.*]] = alloca i8, align 1 +// CHECK: %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0 +// CHECK: %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x i32]]* +// CHECK: %[[VAL_6:.*]] = getelementptr inbounds i8, i8* %[[VAL_1]], i64 0 +// CHECK: %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to [2 x [3 x float]]* +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds i8, i8* %[[VAL_2]], i64 0 +// CHECK: %[[VAL_9:.*]] = bitcast i8* %[[VAL_8]] to [2 x i8*]* +// CHECK: %[[VAL_10:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK: %[[VAL_11:.*]] = zext i32 %[[VAL_10]] to i64 +// CHECK: %[[VAL_12:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK: %[[VAL_13:.*]] = zext i32 %[[VAL_12]] to i64 +// CHECK: %[[VAL_14:.*]] = mul nuw nsw i64 %[[VAL_11]], 4 +// CHECK: %[[VAL_15:.*]] = add nuw nsw i64 %[[VAL_14]], %[[VAL_13]] +// CHECK: %[[VAL_16:.*]] = icmp ult i64 %[[VAL_15]], 4 +// CHECK: call void @llvm.assume(i1 %[[VAL_16]]) +// CHECK: %[[VAL_17:.*]] = udiv i64 %[[VAL_15]], 1 +// CHECK: %[[VAL_18:.*]] = urem i64 %[[VAL_17]], 2 +// CHECK: %[[VAL_19:.*]] = udiv i64 %[[VAL_15]], 2 +// CHECK: %[[VAL_20:.*]] = icmp ult i64 %[[VAL_15]], 4 +// CHECK: br i1 %[[VAL_20]], label %[[VAL_21:.*]], label %[[VAL_22:.*]] +// CHECK: sort.in_bounds-after: ; preds = %[[VAL_23:.*]], %[[VAL_24:.*]] +// CHECK: ret void +// CHECK: sort.in_bounds-true: ; preds = %[[VAL_24]] +// CHECK: %[[VAL_25:.*]] = mul i64 %[[VAL_18]], 2 +// CHECK: %[[VAL_26:.*]] = xor i64 %[[VAL_25]], 1 +// CHECK: %[[VAL_27:.*]] = icmp slt i64 %[[VAL_25]], %[[VAL_26]] +// CHECK: %[[VAL_28:.*]] = icmp slt i64 %[[VAL_26]], 3 +// CHECK: %[[VAL_29:.*]] = and i1 %[[VAL_27]], %[[VAL_28]] +// CHECK: br i1 %[[VAL_29]], label %[[VAL_30:.*]], label %[[VAL_23]] +// CHECK: smaller_comparison_index-after: ; preds = %[[VAL_31:.*]], %[[VAL_21]] +// CHECK: br label %[[VAL_22]] +// CHECK: smaller_comparison_index-true: ; preds = %[[VAL_21]] +// CHECK: %[[VAL_32:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]] +// CHECK: %[[VAL_33:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]] +// CHECK: %[[VAL_34:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]] +// CHECK: %[[VAL_35:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]] +// CHECK: call void @region_0_6(i32* %[[VAL_32]], i32* %[[VAL_33]], float* %[[VAL_34]], float* %[[VAL_35]], i8* %[[VAL_3]]) +// CHECK: %[[VAL_36:.*]] = load i8, i8* %[[VAL_3]], align 1 +// CHECK: %[[VAL_37:.*]] = icmp ne i8 %[[VAL_36]], 0 +// CHECK: br i1 %[[VAL_37]], label %[[VAL_38:.*]], label %[[VAL_31]] +// CHECK: is_smaller_than-after: ; preds = %[[VAL_38]], %[[VAL_30]] +// CHECK: br label %[[VAL_23]] +// CHECK: is_smaller_than-true: ; preds = %[[VAL_30]] +// CHECK: %[[VAL_39:.*]] = load i32, i32* %[[VAL_32]], align 4 +// CHECK: %[[VAL_40:.*]] = load i32, i32* %[[VAL_33]], align 4 +// CHECK: %[[VAL_41:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]] +// CHECK: store i32 %[[VAL_39]], i32* %[[VAL_41]], align 4 +// CHECK: %[[VAL_42:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]] +// CHECK: store i32 %[[VAL_40]], i32* %[[VAL_42]], align 4 +// CHECK: %[[VAL_43:.*]] = load float, float* %[[VAL_34]], align 4 +// CHECK: %[[VAL_44:.*]] = load float, float* %[[VAL_35]], align 4 +// CHECK: %[[VAL_45:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]] +// CHECK: store float %[[VAL_43]], float* %[[VAL_45]], align 4 +// CHECK: %[[VAL_46:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]] +// CHECK: store float %[[VAL_44]], float* %[[VAL_46]], align 4 +// CHECK: br label %[[VAL_31]] +// CHECK: } -// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) -// CHECK-NEXT: entry: -// CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x i32]]* -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 -// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0 -// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* -// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 -// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP6]] to i64 -// CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 -// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK-NEXT: [[TMP8:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 -// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP8]], [[THREAD_ID]] -// CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) -// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 -// CHECK-NEXT: [[TMP10:%.*]] = urem i64 [[TMP9]], 2 -// CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 -// CHECK-NEXT: [[TMP12:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: br i1 [[TMP12]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] -// CHECK: sort.in_bounds-after: -// CHECK-NEXT: ret void -// CHECK: sort.in_bounds-true: -// CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP10]], 2 -// CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], 1 -// CHECK-NEXT: [[TMP19:%.*]] = icmp slt i64 [[TMP17]], [[TMP18]] -// CHECK-NEXT: [[TMP20:%.*]] = icmp slt i64 [[TMP18]], 3 -// CHECK-NEXT: [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]] -// CHECK-NEXT: br i1 [[TMP21]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] -// CHECK: smaller_comparison_index-after: -// CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] -// CHECK: smaller_comparison_index-true: -// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP18]] -// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP17]] -// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP18]] -// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP17]] -// CHECK-NEXT: call void @region_0_6(i32* [[TMP22]], i32* [[TMP23]], float* [[TMP24]], float* [[TMP25]], i8* [[COMPARE_RETURN_BUFFER]]) -// CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 -// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP26]], 0 -// CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] -// CHECK: is_smaller_than-after: -// CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] -// CHECK: is_smaller_than-true: -// CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP22]], align 4 -// CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP23]], align 4 -// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP17]] -// CHECK-NEXT: store i32 [[TMP27]], i32* [[TMP29]], align 4 -// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP18]] -// CHECK-NEXT: store i32 [[TMP28]], i32* [[TMP30]], align 4 -// CHECK-NEXT: [[TMP31:%.*]] = load float, float* [[TMP24]], align 4 -// CHECK-NEXT: [[TMP32:%.*]] = load float, float* [[TMP25]], align 4 -// CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP17]] -// CHECK-NEXT: store float [[TMP31]], float* [[TMP33]], align 4 -// CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP18]] -// CHECK-NEXT: store float [[TMP32]], float* [[TMP34]], align 4 -// CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] ENTRY main { x = s32[2, 3] parameter(0) y = f32[2, 3] parameter(1)