diff --git a/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo b/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo
index 2b22144130a..8e4e8bfff81 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo
+++ b/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo
@@ -8,163 +8,174 @@ compare {
   ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
 }
 
-// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]])
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x float]]*
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]*
-// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
-// CHECK-NEXT:    [[BLOCK_ID:%.*]] = zext i32 [[TMP4]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
-// CHECK-NEXT:    [[THREAD_ID:%.*]] = zext i32 [[TMP5]] to i64
-// CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4
-// CHECK-NEXT:    [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP6]], [[THREAD_ID]]
-// CHECK-NEXT:    [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
-// CHECK-NEXT:    call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]])
-// CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[LINEAR_INDEX]], 1
-// CHECK-NEXT:    [[TMP8:%.*]] = urem i64 [[TMP7]], 2
-// CHECK-NEXT:    [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 2
-// CHECK-NEXT:    [[TMP10:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
-// CHECK-NEXT:    br i1 [[TMP10]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]]
-// CHECK:       sort.in_bounds-after:
-// CHECK-NEXT:    ret void
-// CHECK:       sort.in_bounds-true:
-// CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP8]], 2
-// CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 1
-// CHECK-NEXT:    [[TMP13:%.*]] = icmp slt i64 [[TMP11]], [[TMP12]]
-// CHECK-NEXT:    [[TMP14:%.*]] = icmp slt i64 [[TMP12]], 3
-// CHECK-NEXT:    [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
-// CHECK-NEXT:    br i1 [[TMP15]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]]
-// CHECK:       smaller_comparison_index-after:
-// CHECK-NEXT:    br label [[SORT_IN_BOUNDS_AFTER]]
-// CHECK:       smaller_comparison_index-true:
-// CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]]
-// CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]]
-// CHECK-NEXT:    call void @region_0_4(float* [[TMP16]], float* [[TMP17]], i8* [[COMPARE_RETURN_BUFFER]])
-// CHECK-NEXT:    [[TMP18:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1
-// CHECK-NEXT:    [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP18]], 0
-// CHECK-NEXT:    br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]]
-// CHECK:       is_smaller_than-after:
-// CHECK-NEXT:    br label [[SMALLER_COMPARISON_INDEX_AFTER]]
-// CHECK:       is_smaller_than-true:
-// CHECK-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP16]], align 4
-// CHECK-NEXT:    [[TMP20:%.*]] = load float, float* [[TMP17]], align 4
-// CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]]
-// CHECK-NEXT:    store float [[TMP19]], float* [[TMP21]], align 4
-// CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]]
-// CHECK-NEXT:    store float [[TMP20]], float* [[TMP22]], align 4
-// CHECK-NEXT:    br label [[IS_SMALLER_THAN_AFTER]]
+// CHECK:     define void @sort(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]]) {
+// CHECK:       entry:
+// CHECK:         %[[VAL_1:.*]] = alloca i8, align 1
+// CHECK:         %[[VAL_2:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
+// CHECK:         %[[VAL_3:.*]] = bitcast i8* %[[VAL_2]] to [2 x [3 x float]]*
+// CHECK:         %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
+// CHECK:         %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x float]]*
+// CHECK:         %[[VAL_6:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !{{[0-9]+}}
+// CHECK:         %[[VAL_7:.*]] = zext i32 %[[VAL_6]] to i64
+// CHECK:         %[[VAL_8:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !{{[0-9]+}}
+// CHECK:         %[[VAL_9:.*]] = zext i32 %[[VAL_8]] to i64
+// CHECK:         %[[VAL_10:.*]] = mul nuw nsw i64 %[[VAL_7]], 4
+// CHECK:         %[[VAL_11:.*]] = add nuw nsw i64 %[[VAL_10]], %[[VAL_9]]
+// CHECK:         %[[VAL_12:.*]] = icmp ult i64 %[[VAL_11]], 4
+// CHECK:         call void @llvm.assume(i1 %[[VAL_12]])
+// CHECK:         %[[VAL_13:.*]] = udiv i64 %[[VAL_11]], 1
+// CHECK:         %[[VAL_14:.*]] = urem i64 %[[VAL_13]], 2
+// CHECK:         %[[VAL_15:.*]] = udiv i64 %[[VAL_11]], 2
+// CHECK:         %[[VAL_16:.*]] = icmp ult i64 %[[VAL_11]], 4
+// CHECK:         br i1 %[[VAL_16]], label %[[VAL_17:.*]], label %[[VAL_18:.*]]
+// CHECK:       sort.in_bounds-after:                             ; preds = %[[VAL_19:.*]], %[[VAL_20:.*]]
+// CHECK:         ret void
+// CHECK:       sort.in_bounds-true:                              ; preds = %[[VAL_20]]
+// CHECK:         %[[VAL_21:.*]] = mul i64 %[[VAL_14]], 2
+// CHECK:         %[[VAL_22:.*]] = xor i64 %[[VAL_21]], 1
+// CHECK:         %[[VAL_23:.*]] = icmp slt i64 %[[VAL_21]], %[[VAL_22]]
+// CHECK:         %[[VAL_24:.*]] = icmp slt i64 %[[VAL_22]], 3
+// CHECK:         %[[VAL_25:.*]] = and i1 %[[VAL_23]], %[[VAL_24]]
+// CHECK:         br i1 %[[VAL_25]], label %[[VAL_26:.*]], label %[[VAL_19]]
+// CHECK:       smaller_comparison_index-after:                   ; preds = %[[VAL_27:.*]], %[[VAL_17]]
+// CHECK:         br label %[[VAL_18]]
+// CHECK:       smaller_comparison_index-true:                    ; preds = %[[VAL_17]]
+// CHECK:         %[[VAL_28:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_22]]
+// CHECK:         %[[VAL_29:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]]
+// CHECK:         call void @region_0_4(float* %[[VAL_28]], float* %[[VAL_29]], i8* %[[VAL_1]])
+// CHECK:         %[[VAL_30:.*]] = load i8, i8* %[[VAL_1]], align 1
+// CHECK:         %[[VAL_31:.*]] = icmp ne i8 %[[VAL_30]], 0
+// CHECK:         br i1 %[[VAL_31]], label %[[VAL_32:.*]], label %[[VAL_27]]
+// CHECK:       is_smaller_than-after:                            ; preds = %[[VAL_32]], %[[VAL_26]]
+// CHECK:         br label %[[VAL_19]]
+// CHECK:       is_smaller_than-true:                             ; preds = %[[VAL_26]]
+// CHECK:         %[[VAL_33:.*]] = load float, float* %[[VAL_28]], align 4
+// CHECK:         %[[VAL_34:.*]] = load float, float* %[[VAL_29]], align 4
+// CHECK:         %[[VAL_35:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]]
+// CHECK:         store float %[[VAL_33]], float* %[[VAL_35]], align 4
+// CHECK:         %[[VAL_36:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_22]]
+// CHECK:         store float %[[VAL_34]], float* %[[VAL_36]], align 4
+// CHECK:         br label %[[VAL_27]]
+// CHECK:       }
+// CHECK:       ; Function Attrs: {{.*}}
+// CHECK:       declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #{{[0-9]+}}
+// CHECK:       ; Function Attrs: {{.*}}
+// CHECK:       declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #{{[0-9]+}}
+// CHECK:       ; Function Attrs: {{.*}}
+// CHECK:       declare void @llvm.assume(i1{{.*}}) #{{[0-9]+}}
 
-// CHECK: define internal void @region_0_4(float* dereferenceable(4) [[P_0_LHS_TYPED:%.*]], float* dereferenceable(4) [[P_0_RHS_TYPED:%.*]], i8* dereferenceable(1) [[OUTPUT_ARG:%.*]])
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[COMPARE_3_TYPED:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[ARG_0_1_TYPED:%.*]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARG_1_2_TYPED:%.*]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt float [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i8
-// CHECK-NEXT:    store i8 [[TMP3]], i8* [[COMPARE_3_TYPED]], align 1
-// CHECK-NEXT:    [[LOAD_RET_VALUE:%.*]] = load i8, i8* [[COMPARE_3_TYPED]], align 1
-// CHECK-NEXT:    store i8 [[LOAD_RET_VALUE]], i8* [[OUTPUT_ARG:%.*]], align 1
-// CHECK-NEXT:    ret void
+// CHECK:     define internal void @region_0_4(float* dereferenceable(4) %[[LHS_PTR:.*]], float* dereferenceable(4) %[[RHS_PTR:.*]], i8* dereferenceable(1) %[[OUT_PTR:.*]]) {
+// CHECK:       entry:
+// CHECK:         %[[RESULT_SLOT:.*]] = alloca i8, align 1
+// CHECK:         %[[LHS:.*]] = load float, float* %[[LHS_PTR]], align 4
+// CHECK:         %[[RHS:.*]] = load float, float* %[[RHS_PTR]], align 4
+// CHECK:         %[[IS_LT:.*]] = fcmp olt float %[[LHS]], %[[RHS]]
+// CHECK:         %[[IS_LT_I8:.*]] = zext i1 %[[IS_LT]] to i8
+// CHECK:         store i8 %[[IS_LT_I8]], i8* %[[RESULT_SLOT]], align 1
+// CHECK:         %[[RET:.*]] = load i8, i8* %[[RESULT_SLOT]], align 1
+// CHECK:         store i8 %[[RET]], i8* %[[OUT_PTR]], align 1
+// CHECK:         ret void
+// CHECK:       }
 
-// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]]) {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x float]]*
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]*
-// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
-// CHECK-NEXT:    [[BLOCK_ID:%.*]] = zext i32 [[TMP4]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
-// CHECK-NEXT:    [[THREAD_ID:%.*]] = zext i32 [[TMP5]] to i64
-// CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4
-// CHECK-NEXT:    [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP6]], [[THREAD_ID]]
-// CHECK-NEXT:    [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
-// CHECK-NEXT:    call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]])
-// CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[LINEAR_INDEX]], 1
-// CHECK-NEXT:    [[TMP8:%.*]] = urem i64 [[TMP7]], 2
-// CHECK-NEXT:    [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 2
-// CHECK-NEXT:    [[TMP10:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
-// CHECK-NEXT:    br i1 [[TMP10]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]]
-// CHECK:       sort.in_bounds-after:
-// CHECK-NEXT:    ret void
-// CHECK:       sort.in_bounds-true:
-// CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP8]], 3
-// CHECK-NEXT:    [[TMP12:%.*]] = icmp slt i64 [[TMP8]], [[TMP11]]
-// CHECK-NEXT:    [[TMP13:%.*]] = icmp slt i64 [[TMP11]], 3
-// CHECK-NEXT:    [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]]
-// CHECK-NEXT:    br i1 [[TMP14]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]]
-// CHECK:       smaller_comparison_index-after:
-// CHECK-NEXT:    br label [[SORT_IN_BOUNDS_AFTER]]
-// CHECK:       smaller_comparison_index-true:
-// CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]]
-// CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP8]]
-// CHECK-NEXT:    call void @region_0_4(float* [[TMP15]], float* [[TMP16]], i8* [[COMPARE_RETURN_BUFFER]])
-// CHECK-NEXT:    [[TMP17:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1
-// CHECK-NEXT:    [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP17]], 0
-// CHECK-NEXT:    br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]]
-// CHECK:       is_smaller_than-after:
-// CHECK-NEXT:    br label [[SMALLER_COMPARISON_INDEX_AFTER]]
-// CHECK:       is_smaller_than-true:
-// CHECK-NEXT:    [[TMP18:%.*]] = load float, float* [[TMP15]], align 4
-// CHECK-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP16]], align 4
-// CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP8]]
-// CHECK-NEXT:    store float [[TMP18]], float* [[TMP20]], align 4
-// CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]]
-// CHECK-NEXT:    store float [[TMP19]], float* [[TMP21]], align 4
-// CHECK-NEXT:    br label [[IS_SMALLER_THAN_AFTER]]
+// CHECK:     define void @sort__1(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]]) {
+// CHECK:       entry:
+// CHECK:         %[[VAL_1:.*]] = alloca i8, align 1
+// CHECK:         %[[VAL_2:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
+// CHECK:         %[[VAL_3:.*]] = bitcast i8* %[[VAL_2]] to [2 x [3 x float]]*
+// CHECK:         %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
+// CHECK:         %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x float]]*
+// CHECK:         %[[VAL_6:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !{{[0-9]+}}
+// CHECK:         %[[VAL_7:.*]] = zext i32 %[[VAL_6]] to i64
+// CHECK:         %[[VAL_8:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !{{[0-9]+}}
+// CHECK:         %[[VAL_9:.*]] = zext i32 %[[VAL_8]] to i64
+// CHECK:         %[[VAL_10:.*]] = mul nuw nsw i64 %[[VAL_7]], 4
+// CHECK:         %[[VAL_11:.*]] = add nuw nsw i64 %[[VAL_10]], %[[VAL_9]]
+// CHECK:         %[[VAL_12:.*]] = icmp ult i64 %[[VAL_11]], 4
+// CHECK:         call void @llvm.assume(i1 %[[VAL_12]])
+// CHECK:         %[[VAL_13:.*]] = udiv i64 %[[VAL_11]], 1
+// CHECK:         %[[VAL_14:.*]] = urem i64 %[[VAL_13]], 2
+// CHECK:         %[[VAL_15:.*]] = udiv i64 %[[VAL_11]], 2
+// CHECK:         %[[VAL_16:.*]] = icmp ult i64 %[[VAL_11]], 4
+// CHECK:         br i1 %[[VAL_16]], label %[[VAL_17:.*]], label %[[VAL_18:.*]]
+// CHECK:       sort.in_bounds-after:                             ; preds = %[[VAL_19:.*]], %[[VAL_20:.*]]
+// CHECK:         ret void
+// CHECK:       sort.in_bounds-true:                              ; preds = %[[VAL_20]]
+// CHECK:         %[[VAL_21:.*]] = xor i64 %[[VAL_14]], 3
+// CHECK:         %[[VAL_22:.*]] = icmp slt i64 %[[VAL_14]], %[[VAL_21]]
+// CHECK:         %[[VAL_23:.*]] = icmp slt i64 %[[VAL_21]], 3
+// CHECK:         %[[VAL_24:.*]] = and i1 %[[VAL_22]], %[[VAL_23]]
+// CHECK:         br i1 %[[VAL_24]], label %[[VAL_25:.*]], label %[[VAL_19]]
+// CHECK:       smaller_comparison_index-after:                   ; preds = %[[VAL_26:.*]], %[[VAL_17]]
+// CHECK:         br label %[[VAL_18]]
+// CHECK:       smaller_comparison_index-true:                    ; preds = %[[VAL_17]]
+// CHECK:         %[[VAL_27:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]]
+// CHECK:         %[[VAL_28:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_14]]
+// CHECK:         call void @region_0_4(float* %[[VAL_27]], float* %[[VAL_28]], i8* %[[VAL_1]])
+// CHECK:         %[[VAL_29:.*]] = load i8, i8* %[[VAL_1]], align 1
+// CHECK:         %[[VAL_30:.*]] = icmp ne i8 %[[VAL_29]], 0
+// CHECK:         br i1 %[[VAL_30]], label %[[VAL_31:.*]], label %[[VAL_26]]
+// CHECK:       is_smaller_than-after:                            ; preds = %[[VAL_31]], %[[VAL_25]]
+// CHECK:         br label %[[VAL_19]]
+// CHECK:       is_smaller_than-true:                             ; preds = %[[VAL_25]]
+// CHECK:         %[[VAL_32:.*]] = load float, float* %[[VAL_27]], align 4
+// CHECK:         %[[VAL_33:.*]] = load float, float* %[[VAL_28]], align 4
+// CHECK:         %[[VAL_34:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_14]]
+// CHECK:         store float %[[VAL_32]], float* %[[VAL_34]], align 4
+// CHECK:         %[[VAL_35:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]]
+// CHECK:         store float %[[VAL_33]], float* %[[VAL_35]], align 4
+// CHECK:         br label %[[VAL_26]]
+// CHECK:       }
+
+// CHECK:     define void @sort__2(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]]) {
+// CHECK:       entry:
+// CHECK:         %[[VAL_1:.*]] = alloca i8, align 1
+// CHECK:         %[[VAL_2:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
+// CHECK:         %[[VAL_3:.*]] = bitcast i8* %[[VAL_2]] to [2 x [3 x float]]*
+// CHECK:         %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
+// CHECK:         %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x float]]*
+// CHECK:         %[[VAL_6:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !{{[0-9]+}}
+// CHECK:         %[[VAL_7:.*]] = zext i32 %[[VAL_6]] to i64
+// CHECK:         %[[VAL_8:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !{{[0-9]+}}
+// CHECK:         %[[VAL_9:.*]] = zext i32 %[[VAL_8]] to i64
+// CHECK:         %[[VAL_10:.*]] = mul nuw nsw i64 %[[VAL_7]], 4
+// CHECK:         %[[VAL_11:.*]] = add nuw nsw i64 %[[VAL_10]], %[[VAL_9]]
+// CHECK:         %[[VAL_12:.*]] = icmp ult i64 %[[VAL_11]], 4
+// CHECK:         call void @llvm.assume(i1 %[[VAL_12]])
+// CHECK:         %[[VAL_13:.*]] = udiv i64 %[[VAL_11]], 1
+// CHECK:         %[[VAL_14:.*]] = urem i64 %[[VAL_13]], 2
+// CHECK:         %[[VAL_15:.*]] = udiv i64 %[[VAL_11]], 2
+// CHECK:         %[[VAL_16:.*]] = icmp ult i64 %[[VAL_11]], 4
+// CHECK:         br i1 %[[VAL_16]], label %[[VAL_17:.*]], label %[[VAL_18:.*]]
+// CHECK:       sort.in_bounds-after:                             ; preds = %[[VAL_19:.*]], %[[VAL_20:.*]]
+// CHECK:         ret void
+// CHECK:       sort.in_bounds-true:                              ; preds = %[[VAL_20]]
+// CHECK:         %[[VAL_21:.*]] = mul i64 %[[VAL_14]], 2
+// CHECK:         %[[VAL_22:.*]] = xor i64 %[[VAL_21]], 1
+// CHECK:         %[[VAL_23:.*]] = icmp slt i64 %[[VAL_21]], %[[VAL_22]]
+// CHECK:         %[[VAL_24:.*]] = icmp slt i64 %[[VAL_22]], 3
+// CHECK:         %[[VAL_25:.*]] = and i1 %[[VAL_23]], %[[VAL_24]]
+// CHECK:         br i1 %[[VAL_25]], label %[[VAL_26:.*]], label %[[VAL_19]]
+// CHECK:       smaller_comparison_index-after:                   ; preds = %[[VAL_27:.*]], %[[VAL_17]]
+// CHECK:         br label %[[VAL_18]]
+// CHECK:       smaller_comparison_index-true:                    ; preds = %[[VAL_17]]
+// CHECK:         %[[VAL_28:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_22]]
+// CHECK:         %[[VAL_29:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]]
+// CHECK:         call void @region_0_4(float* %[[VAL_28]], float* %[[VAL_29]], i8* %[[VAL_1]])
+// CHECK:         %[[VAL_30:.*]] = load i8, i8* %[[VAL_1]], align 1
+// CHECK:         %[[VAL_31:.*]] = icmp ne i8 %[[VAL_30]], 0
+// CHECK:         br i1 %[[VAL_31]], label %[[VAL_32:.*]], label %[[VAL_27]]
+// CHECK:       is_smaller_than-after:                            ; preds = %[[VAL_32]], %[[VAL_26]]
+// CHECK:         br label %[[VAL_19]]
+// CHECK:       is_smaller_than-true:                             ; preds = %[[VAL_26]]
+// CHECK:         %[[VAL_33:.*]] = load float, float* %[[VAL_28]], align 4
+// CHECK:         %[[VAL_34:.*]] = load float, float* %[[VAL_29]], align 4
+// CHECK:         %[[VAL_35:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_21]]
+// CHECK:         store float %[[VAL_33]], float* %[[VAL_35]], align 4
+// CHECK:         %[[VAL_36:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_3]], i64 0, i64 %[[VAL_15]], i64 %[[VAL_22]]
+// CHECK:         store float %[[VAL_34]], float* %[[VAL_36]], align 4
+// CHECK:         br label %[[VAL_27]]
+// CHECK:       }
 
-// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]]) {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x float]]*
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]*
-// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
-// CHECK-NEXT:    [[BLOCK_ID:%.*]] = zext i32 [[TMP4]] to i64
-// CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
-// CHECK-NEXT:    [[THREAD_ID:%.*]] = zext i32 [[TMP5]] to i64
-// CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4
-// CHECK-NEXT:    [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP6]], [[THREAD_ID]]
-// CHECK-NEXT:    [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
-// CHECK-NEXT:    call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]])
-// CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[LINEAR_INDEX]], 1
-// CHECK-NEXT:    [[TMP8:%.*]] = urem i64 [[TMP7]], 2
-// CHECK-NEXT:    [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 2
-// CHECK-NEXT:    [[TMP10:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
-// CHECK-NEXT:    br i1 [[TMP10]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]]
-// CHECK:       sort.in_bounds-after:
-// CHECK-NEXT:    ret void
-// CHECK:       sort.in_bounds-true:
-// CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP8]], 2
-// CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 1
-// CHECK-NEXT:    [[TMP13:%.*]] = icmp slt i64 [[TMP11]], [[TMP12]]
-// CHECK-NEXT:    [[TMP14:%.*]] = icmp slt i64 [[TMP12]], 3
-// CHECK-NEXT:    [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
-// CHECK-NEXT:    br i1 [[TMP15]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]]
-// CHECK:       smaller_comparison_index-after:
-// CHECK-NEXT:    br label [[SORT_IN_BOUNDS_AFTER]]
-// CHECK:       smaller_comparison_index-true:
-// CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]]
-// CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]]
-// CHECK-NEXT:    call void @region_0_4(float* [[TMP16]], float* [[TMP17]], i8* [[COMPARE_RETURN_BUFFER]])
-// CHECK-NEXT:    [[TMP18:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1
-// CHECK-NEXT:    [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP18]], 0
-// CHECK-NEXT:    br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]]
-// CHECK:       is_smaller_than-after:
-// CHECK-NEXT:    br label [[SMALLER_COMPARISON_INDEX_AFTER]]
-// CHECK:       is_smaller_than-true:
-// CHECK-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP16]], align 4
-// CHECK-NEXT:    [[TMP20:%.*]] = load float, float* [[TMP17]], align 4
-// CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]]
-// CHECK-NEXT:    store float [[TMP19]], float* [[TMP21]], align 4
-// CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]]
-// CHECK-NEXT:    store float [[TMP20]], float* [[TMP22]], align 4
-// CHECK-NEXT:    br label [[IS_SMALLER_THAN_AFTER]]
 ENTRY main {
   x = f32[2, 3] parameter(0)
   ROOT sort = f32[2, 3] sort(x), dimensions={1}, to_apply=compare
@@ -182,193 +193,204 @@ compare {
   ROOT lt = pred[] compare(p.1.lhs, p.1.rhs), direction=LT
 }
 
-// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]])
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x i32]]*
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]*
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
-// CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
-// CHECK-NEXT:    [[BLOCK_ID:%.*]] = zext i32 [[TMP6]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
-// CHECK-NEXT:    [[THREAD_ID:%.*]] = zext i32 [[TMP7]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4
-// CHECK-NEXT:    [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP8]], [[THREAD_ID]]
-// CHECK-NEXT:    [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
-// CHECK-NEXT:    call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]])
-// CHECK-NEXT:    [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 1
-// CHECK-NEXT:    [[TMP10:%.*]] = urem i64 [[TMP9]], 2
-// CHECK-NEXT:    [[TMP11:%.*]] = udiv i64 [[LINEAR_INDEX]], 2
-// CHECK-NEXT:    [[TMP12:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
-// CHECK-NEXT:    br i1 [[TMP12]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]]
-// CHECK:       sort.in_bounds-after:
-// CHECK-NEXT:    ret void
-// CHECK:       sort.in_bounds-true:
-// CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP10]], 2
-// CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 1
-// CHECK-NEXT:    [[TMP15:%.*]] = icmp slt i64 [[TMP13]], [[TMP14]]
-// CHECK-NEXT:    [[TMP16:%.*]] = icmp slt i64 [[TMP14]], 3
-// CHECK-NEXT:    [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
-// CHECK-NEXT:    br i1 [[TMP17]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]]
-// CHECK:       smaller_comparison_index-after:
-// CHECK-NEXT:    br label [[SORT_IN_BOUNDS_AFTER]]
-// CHECK:       smaller_comparison_index-true:
-// CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP14]]
-// CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]]
-// CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP14]]
-// CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]]
-// CHECK-NEXT:    call void @region_0_6(i32* [[TMP18]], i32* [[TMP19]], float* [[TMP20]], float* [[TMP21]], i8* [[COMPARE_RETURN_BUFFER]])
-// CHECK-NEXT:    [[TMP22:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1
-// CHECK-NEXT:    [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP22]], 0
-// CHECK-NEXT:    br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]]
-// CHECK:       is_smaller_than-after:
-// CHECK-NEXT:    br label [[SMALLER_COMPARISON_INDEX_AFTER]]
-// CHECK:       is_smaller_than-true:
-// CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP18]], align 4
-// CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[TMP19]], align 4
-// CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]]
-// CHECK-NEXT:    store i32 [[TMP23]], i32* [[TMP25]], align 4
-// CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP14]]
-// CHECK-NEXT:    store i32 [[TMP24]], i32* [[TMP26]], align 4
-// CHECK-NEXT:    [[TMP27:%.*]] = load float, float* [[TMP20]], align 4
-// CHECK-NEXT:    [[TMP28:%.*]] = load float, float* [[TMP21]], align 4
-// CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]]
-// CHECK-NEXT:    store float [[TMP27]], float* [[TMP29]], align 4
-// CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP14]]
-// CHECK-NEXT:    store float [[TMP28]], float* [[TMP30]], align 4
-// CHECK-NEXT:    br label [[IS_SMALLER_THAN_AFTER]]
+// CHECK:     define void @sort(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]], i8* noalias align 64 dereferenceable(24) %[[VAL_1:.*]], i8* noalias align 64 dereferenceable(16) %[[VAL_2:.*]]) {
+// CHECK:       entry:
+// CHECK:         %[[VAL_3:.*]] = alloca i8, align 1
+// CHECK:         %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
+// CHECK:         %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x i32]]*
+// CHECK:         %[[VAL_6:.*]] = getelementptr inbounds i8, i8* %[[VAL_1]], i64 0
+// CHECK:         %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to [2 x [3 x float]]*
+// CHECK:         %[[VAL_8:.*]] = getelementptr inbounds i8, i8* %[[VAL_2]], i64 0
+// CHECK:         %[[VAL_9:.*]] = bitcast i8* %[[VAL_8]] to [2 x i8*]*
+// CHECK:         %[[VAL_10:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !{{[0-9]+}}
+// CHECK:         %[[VAL_11:.*]] = zext i32 %[[VAL_10]] to i64
+// CHECK:         %[[VAL_12:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !{{[0-9]+}}
+// CHECK:         %[[VAL_13:.*]] = zext i32 %[[VAL_12]] to i64
+// CHECK:         %[[VAL_14:.*]] = mul nuw nsw i64 %[[VAL_11]], 4
+// CHECK:         %[[VAL_15:.*]] = add nuw nsw i64 %[[VAL_14]], %[[VAL_13]]
+// CHECK:         %[[VAL_16:.*]] = icmp ult i64 %[[VAL_15]], 4
+// CHECK:         call void @llvm.assume(i1 %[[VAL_16]])
+// CHECK:         %[[VAL_17:.*]] = udiv i64 %[[VAL_15]], 1
+// CHECK:         %[[VAL_18:.*]] = urem i64 %[[VAL_17]], 2
+// CHECK:         %[[VAL_19:.*]] = udiv i64 %[[VAL_15]], 2
+// CHECK:         %[[VAL_20:.*]] = icmp ult i64 %[[VAL_15]], 4
+// CHECK:         br i1 %[[VAL_20]], label %[[VAL_21:.*]], label %[[VAL_22:.*]]
+// CHECK:       sort.in_bounds-after:                             ; preds = %[[VAL_23:.*]], %[[VAL_24:.*]]
+// CHECK:         ret void
+// CHECK:       sort.in_bounds-true:                              ; preds = %[[VAL_24]]
+// CHECK:         %[[VAL_25:.*]] = mul i64 %[[VAL_18]], 2
+// CHECK:         %[[VAL_26:.*]] = xor i64 %[[VAL_25]], 1
+// CHECK:         %[[VAL_27:.*]] = icmp slt i64 %[[VAL_25]], %[[VAL_26]]
+// CHECK:         %[[VAL_28:.*]] = icmp slt i64 %[[VAL_26]], 3
+// CHECK:         %[[VAL_29:.*]] = and i1 %[[VAL_27]], %[[VAL_28]]
+// CHECK:         br i1 %[[VAL_29]], label %[[VAL_30:.*]], label %[[VAL_23]]
+// CHECK:       smaller_comparison_index-after:                   ; preds = %[[VAL_31:.*]], %[[VAL_21]]
+// CHECK:         br label %[[VAL_22]]
+// CHECK:       smaller_comparison_index-true:                    ; preds = %[[VAL_21]]
+// CHECK:         %[[VAL_32:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
+// CHECK:         %[[VAL_33:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
+// CHECK:         %[[VAL_34:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
+// CHECK:         %[[VAL_35:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
+// CHECK:         call void @region_0_6(i32* %[[VAL_32]], i32* %[[VAL_33]], float* %[[VAL_34]], float* %[[VAL_35]], i8* %[[VAL_3]])
+// CHECK:         %[[VAL_36:.*]] = load i8, i8* %[[VAL_3]], align 1
+// CHECK:         %[[VAL_37:.*]] = icmp ne i8 %[[VAL_36]], 0
+// CHECK:         br i1 %[[VAL_37]], label %[[VAL_38:.*]], label %[[VAL_31]]
+// CHECK:       is_smaller_than-after:                            ; preds = %[[VAL_38]], %[[VAL_30]]
+// CHECK:         br label %[[VAL_23]]
+// CHECK:       is_smaller_than-true:                             ; preds = %[[VAL_30]]
+// CHECK:         %[[VAL_39:.*]] = load i32, i32* %[[VAL_32]], align 4
+// CHECK:         %[[VAL_40:.*]] = load i32, i32* %[[VAL_33]], align 4
+// CHECK:         %[[VAL_41:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
+// CHECK:         store i32 %[[VAL_39]], i32* %[[VAL_41]], align 4
+// CHECK:         %[[VAL_42:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
+// CHECK:         store i32 %[[VAL_40]], i32* %[[VAL_42]], align 4
+// CHECK:         %[[VAL_43:.*]] = load float, float* %[[VAL_34]], align 4
+// CHECK:         %[[VAL_44:.*]] = load float, float* %[[VAL_35]], align 4
+// CHECK:         %[[VAL_45:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
+// CHECK:         store float %[[VAL_43]], float* %[[VAL_45]], align 4
+// CHECK:         %[[VAL_46:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
+// CHECK:         store float %[[VAL_44]], float* %[[VAL_46]], align 4
+// CHECK:         br label %[[VAL_31]]
+// CHECK:       }
+// CHECK:       ; Function Attrs: nounwind readnone
+// CHECK:       declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
+// CHECK:       ; Function Attrs: nounwind readnone
+// CHECK:       declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+// CHECK:       ; Function Attrs: nofree nosync nounwind willreturn
+// CHECK:       declare void @llvm.assume(i1 noundef) #1
 
-// CHECK: define internal void @region_0_6(i32* dereferenceable(4) [[P_0_LHS_TYPED:%.*]], i32* dereferenceable(4) [[P_0_RHS_TYPED:%.*]], float* dereferenceable(4) [[P_1_LHS_TYPED:%.*]], float* dereferenceable(4) [[P_1_RHS_TYPED:%.*]], i8* dereferenceable(1) [[OUTPUT_ARG:%.*]])
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[COMPARE_5_TYPED:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[ARG_2_3_TYPED:%.*]], align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARG_3_4_TYPED:%.*]], align 4
-// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt float [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i8
-// CHECK-NEXT:    store i8 [[TMP3]], i8* [[COMPARE_5_TYPED]], align 1
-// CHECK-NEXT:    [[LOAD_RET_VALUE:%.*]] = load i8, i8* [[COMPARE_5_TYPED]], align 1
-// CHECK-NEXT:    store i8 [[LOAD_RET_VALUE]], i8* [[OUTPUT_ARG:%.*]], align 1
-// CHECK-NEXT:    ret void
+// CHECK:     define internal void @region_0_6(i32* dereferenceable(4) %[[VAL_0:.*]], i32* dereferenceable(4) %[[VAL_1:.*]], float* dereferenceable(4) %[[VAL_2:.*]], float* dereferenceable(4) %[[VAL_3:.*]], i8* dereferenceable(1) %[[VAL_4:.*]]) {
+// CHECK:       entry:
+// CHECK:         %[[VAL_5:.*]] = alloca i8, align 1
+// CHECK:         %[[VAL_6:.*]] = load float, float* %[[VAL_2]], align 4
+// CHECK:         %[[VAL_7:.*]] = load float, float* %[[VAL_3]], align 4
+// CHECK:         %[[VAL_8:.*]] = fcmp olt float %[[VAL_6]], %[[VAL_7]]
+// CHECK:         %[[VAL_9:.*]] = zext i1 %[[VAL_8]] to i8
+// CHECK:         store i8 %[[VAL_9]], i8* %[[VAL_5]], align 1
+// CHECK:         %[[VAL_10:.*]] = load i8, i8* %[[VAL_5]], align 1
+// CHECK:         store i8 %[[VAL_10]], i8* %[[VAL_4]], align 1
+// CHECK:         ret void
+// CHECK:       }
 
-// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]])
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x i32]]*
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]*
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
-// CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
-// CHECK-NEXT:    [[BLOCK_ID:%.*]] = zext i32 [[TMP6]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
-// CHECK-NEXT:    [[THREAD_ID:%.*]] = zext i32 [[TMP7]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4
-// CHECK-NEXT:    [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP8]], [[THREAD_ID]]
-// CHECK-NEXT:    [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
-// CHECK-NEXT:    call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]])
-// CHECK-NEXT:    [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 1
-// CHECK-NEXT:    [[TMP10:%.*]] = urem i64 [[TMP9]], 2
-// CHECK-NEXT:    [[TMP11:%.*]] = udiv i64 [[LINEAR_INDEX]], 2
-// CHECK-NEXT:    [[TMP12:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
-// CHECK-NEXT:    br i1 [[TMP12]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]]
-// CHECK:       sort.in_bounds-after:
-// CHECK-NEXT:    ret void
-// CHECK:       sort.in_bounds-true:
-// CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP10]], 3
-// CHECK-NEXT:    [[TMP14:%.*]] = icmp slt i64 [[TMP10]], [[TMP13]]
-// CHECK-NEXT:    [[TMP15:%.*]] = icmp slt i64 [[TMP13]], 3
-// CHECK-NEXT:    [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
-// CHECK-NEXT:    br i1 [[TMP16]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]]
-// CHECK:       smaller_comparison_index-after:
-// CHECK-NEXT:    br label [[SORT_IN_BOUNDS_AFTER]]
-// CHECK:       smaller_comparison_index-true:
-// CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]]
-// CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP10]]
-// CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]]
-// CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP10]]
-// CHECK-NEXT:    call void @region_0_6(i32* [[TMP17]], i32* [[TMP18]], float* [[TMP19]], float* [[TMP20]], i8* [[COMPARE_RETURN_BUFFER]])
-// CHECK-NEXT:    [[TMP21:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1
-// CHECK-NEXT:    [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP21]], 0
-// CHECK-NEXT:    br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]]
-// CHECK:       is_smaller_than-after:
-// CHECK-NEXT:    br label [[SMALLER_COMPARISON_INDEX_AFTER]]
-// CHECK:       is_smaller_than-true:
-// CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP17]], align 4
-// CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP18]], align 4
-// CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP10]]
-// CHECK-NEXT:    store i32 [[TMP22]], i32* [[TMP24]], align 4
-// CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]]
-// CHECK-NEXT:    store i32 [[TMP23]], i32* [[TMP25]], align 4
-// CHECK-NEXT:    [[TMP26:%.*]] = load float, float* [[TMP19]], align 4
-// CHECK-NEXT:    [[TMP27:%.*]] = load float, float* [[TMP20]], align 4
-// CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP10]]
-// CHECK-NEXT:    store float [[TMP26]], float* [[TMP28]], align 4
-// CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]]
-// CHECK-NEXT:    store float [[TMP27]], float* [[TMP29]], align 4
-// CHECK-NEXT:    br label [[IS_SMALLER_THAN_AFTER]]
+// CHECK:     define void @sort__1(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]], i8* noalias align 64 dereferenceable(24) %[[VAL_1:.*]], i8* noalias align 64 dereferenceable(16) %[[VAL_2:.*]]) {
+// CHECK:       entry:
+// CHECK:         %[[VAL_3:.*]] = alloca i8, align 1
+// CHECK:         %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
+// CHECK:         %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x i32]]*
+// CHECK:         %[[VAL_6:.*]] = getelementptr inbounds i8, i8* %[[VAL_1]], i64 0
+// CHECK:         %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to [2 x [3 x float]]*
+// CHECK:         %[[VAL_8:.*]] = getelementptr inbounds i8, i8* %[[VAL_2]], i64 0
+// CHECK:         %[[VAL_9:.*]] = bitcast i8* %[[VAL_8]] to [2 x i8*]*
+// CHECK:         %[[VAL_10:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
+// CHECK:         %[[VAL_11:.*]] = zext i32 %[[VAL_10]] to i64
+// CHECK:         %[[VAL_12:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
+// CHECK:         %[[VAL_13:.*]] = zext i32 %[[VAL_12]] to i64
+// CHECK:         %[[VAL_14:.*]] = mul nuw nsw i64 %[[VAL_11]], 4
+// CHECK:         %[[VAL_15:.*]] = add nuw nsw i64 %[[VAL_14]], %[[VAL_13]]
+// CHECK:         %[[VAL_16:.*]] = icmp ult i64 %[[VAL_15]], 4
+// CHECK:         call void @llvm.assume(i1 %[[VAL_16]])
+// CHECK:         %[[VAL_17:.*]] = udiv i64 %[[VAL_15]], 1
+// CHECK:         %[[VAL_18:.*]] = urem i64 %[[VAL_17]], 2
+// CHECK:         %[[VAL_19:.*]] = udiv i64 %[[VAL_15]], 2
+// CHECK:         %[[VAL_20:.*]] = icmp ult i64 %[[VAL_15]], 4
+// CHECK:         br i1 %[[VAL_20]], label %[[VAL_21:.*]], label %[[VAL_22:.*]]
+// CHECK:       sort.in_bounds-after:                             ; preds = %[[VAL_23:.*]], %[[VAL_24:.*]]
+// CHECK:         ret void
+// CHECK:       sort.in_bounds-true:                              ; preds = %[[VAL_24]]
+// CHECK:         %[[VAL_25:.*]] = xor i64 %[[VAL_18]], 3
+// CHECK:         %[[VAL_26:.*]] = icmp slt i64 %[[VAL_18]], %[[VAL_25]]
+// CHECK:         %[[VAL_27:.*]] = icmp slt i64 %[[VAL_25]], 3
+// CHECK:         %[[VAL_28:.*]] = and i1 %[[VAL_26]], %[[VAL_27]]
+// CHECK:         br i1 %[[VAL_28]], label %[[VAL_29:.*]], label %[[VAL_23]]
+// CHECK:       smaller_comparison_index-after:                   ; preds = %[[VAL_30:.*]], %[[VAL_21]]
+// CHECK:         br label %[[VAL_22]]
+// CHECK:       smaller_comparison_index-true:                    ; preds = %[[VAL_21]]
+// CHECK:         %[[VAL_31:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
+// CHECK:         %[[VAL_32:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_18]]
+// CHECK:         %[[VAL_33:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
+// CHECK:         %[[VAL_34:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_18]]
+// CHECK:         call void @region_0_6(i32* %[[VAL_31]], i32* %[[VAL_32]], float* %[[VAL_33]], float* %[[VAL_34]], i8* %[[VAL_3]])
+// CHECK:         %[[VAL_35:.*]] = load i8, i8* %[[VAL_3]], align 1
+// CHECK:         %[[VAL_36:.*]] = icmp ne i8 %[[VAL_35]], 0
+// CHECK:         br i1 %[[VAL_36]], label %[[VAL_37:.*]], label %[[VAL_30]]
+// CHECK:       is_smaller_than-after:                            ; preds = %[[VAL_37]], %[[VAL_29]]
+// CHECK:         br label %[[VAL_23]]
+// CHECK:       is_smaller_than-true:                             ; preds = %[[VAL_29]]
+// CHECK:         %[[VAL_38:.*]] = load i32, i32* %[[VAL_31]], align 4
+// CHECK:         %[[VAL_39:.*]] = load i32, i32* %[[VAL_32]], align 4
+// CHECK:         %[[VAL_40:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_18]]
+// CHECK:         store i32 %[[VAL_38]], i32* %[[VAL_40]], align 4
+// CHECK:         %[[VAL_41:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
+// CHECK:         store i32 %[[VAL_39]], i32* %[[VAL_41]], align 4
+// CHECK:         %[[VAL_42:.*]] = load float, float* %[[VAL_33]], align 4
+// CHECK:         %[[VAL_43:.*]] = load float, float* %[[VAL_34]], align 4
+// CHECK:         %[[VAL_44:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_18]]
+// CHECK:         store float %[[VAL_42]], float* %[[VAL_44]], align 4
+// CHECK:         %[[VAL_45:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
+// CHECK:         store float %[[VAL_43]], float* %[[VAL_45]], align 4
+// CHECK:         br label %[[VAL_30]]
+// CHECK:       }
+
+// CHECK:     define void @sort__2(i8* noalias align 64 dereferenceable(24) %[[VAL_0:.*]], i8* noalias align 64 dereferenceable(24) %[[VAL_1:.*]], i8* noalias align 64 dereferenceable(16) %[[VAL_2:.*]]) {
+// CHECK:       entry:
+// CHECK:         %[[VAL_3:.*]] = alloca i8, align 1
+// CHECK:         %[[VAL_4:.*]] = getelementptr inbounds i8, i8* %[[VAL_0]], i64 0
+// CHECK:         %[[VAL_5:.*]] = bitcast i8* %[[VAL_4]] to [2 x [3 x i32]]*
+// CHECK:         %[[VAL_6:.*]] = getelementptr inbounds i8, i8* %[[VAL_1]], i64 0
+// CHECK:         %[[VAL_7:.*]] = bitcast i8* %[[VAL_6]] to [2 x [3 x float]]*
+// CHECK:         %[[VAL_8:.*]] = getelementptr inbounds i8, i8* %[[VAL_2]], i64 0
+// CHECK:         %[[VAL_9:.*]] = bitcast i8* %[[VAL_8]] to [2 x i8*]*
+// CHECK:         %[[VAL_10:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
+// CHECK:         %[[VAL_11:.*]] = zext i32 %[[VAL_10]] to i64
+// CHECK:         %[[VAL_12:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
+// CHECK:         %[[VAL_13:.*]] = zext i32 %[[VAL_12]] to i64
+// CHECK:         %[[VAL_14:.*]] = mul nuw nsw i64 %[[VAL_11]], 4
+// CHECK:         %[[VAL_15:.*]] = add nuw nsw i64 %[[VAL_14]], %[[VAL_13]]
+// CHECK:         %[[VAL_16:.*]] = icmp ult i64 %[[VAL_15]], 4
+// CHECK:         call void @llvm.assume(i1 %[[VAL_16]])
+// CHECK:         %[[VAL_17:.*]] = udiv i64 %[[VAL_15]], 1
+// CHECK:         %[[VAL_18:.*]] = urem i64 %[[VAL_17]], 2
+// CHECK:         %[[VAL_19:.*]] = udiv i64 %[[VAL_15]], 2
+// CHECK:         %[[VAL_20:.*]] = icmp ult i64 %[[VAL_15]], 4
+// CHECK:         br i1 %[[VAL_20]], label %[[VAL_21:.*]], label %[[VAL_22:.*]]
+// CHECK:       sort.in_bounds-after:                             ; preds = %[[VAL_23:.*]], %[[VAL_24:.*]]
+// CHECK:         ret void
+// CHECK:       sort.in_bounds-true:                              ; preds = %[[VAL_24]]
+// CHECK:         %[[VAL_25:.*]] = mul i64 %[[VAL_18]], 2
+// CHECK:         %[[VAL_26:.*]] = xor i64 %[[VAL_25]], 1
+// CHECK:         %[[VAL_27:.*]] = icmp slt i64 %[[VAL_25]], %[[VAL_26]]
+// CHECK:         %[[VAL_28:.*]] = icmp slt i64 %[[VAL_26]], 3
+// CHECK:         %[[VAL_29:.*]] = and i1 %[[VAL_27]], %[[VAL_28]]
+// CHECK:         br i1 %[[VAL_29]], label %[[VAL_30:.*]], label %[[VAL_23]]
+// CHECK:       smaller_comparison_index-after:                   ; preds = %[[VAL_31:.*]], %[[VAL_21]]
+// CHECK:         br label %[[VAL_22]]
+// CHECK:       smaller_comparison_index-true:                    ; preds = %[[VAL_21]]
+// CHECK:         %[[VAL_32:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
+// CHECK:         %[[VAL_33:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
+// CHECK:         %[[VAL_34:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
+// CHECK:         %[[VAL_35:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
+// CHECK:         call void @region_0_6(i32* %[[VAL_32]], i32* %[[VAL_33]], float* %[[VAL_34]], float* %[[VAL_35]], i8* %[[VAL_3]])
+// CHECK:         %[[VAL_36:.*]] = load i8, i8* %[[VAL_3]], align 1
+// CHECK:         %[[VAL_37:.*]] = icmp ne i8 %[[VAL_36]], 0
+// CHECK:         br i1 %[[VAL_37]], label %[[VAL_38:.*]], label %[[VAL_31]]
+// CHECK:       is_smaller_than-after:                            ; preds = %[[VAL_38]], %[[VAL_30]]
+// CHECK:         br label %[[VAL_23]]
+// CHECK:       is_smaller_than-true:                             ; preds = %[[VAL_30]]
+// CHECK:         %[[VAL_39:.*]] = load i32, i32* %[[VAL_32]], align 4
+// CHECK:         %[[VAL_40:.*]] = load i32, i32* %[[VAL_33]], align 4
+// CHECK:         %[[VAL_41:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
+// CHECK:         store i32 %[[VAL_39]], i32* %[[VAL_41]], align 4
+// CHECK:         %[[VAL_42:.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* %[[VAL_5]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
+// CHECK:         store i32 %[[VAL_40]], i32* %[[VAL_42]], align 4
+// CHECK:         %[[VAL_43:.*]] = load float, float* %[[VAL_34]], align 4
+// CHECK:         %[[VAL_44:.*]] = load float, float* %[[VAL_35]], align 4
+// CHECK:         %[[VAL_45:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_25]]
+// CHECK:         store float %[[VAL_43]], float* %[[VAL_45]], align 4
+// CHECK:         %[[VAL_46:.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* %[[VAL_7]], i64 0, i64 %[[VAL_19]], i64 %[[VAL_26]]
+// CHECK:         store float %[[VAL_44]], float* %[[VAL_46]], align 4
+// CHECK:         br label %[[VAL_31]]
+// CHECK:       }
 
-// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]])
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x i32]]*
-// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]*
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
-// CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6
-// CHECK-NEXT:    [[BLOCK_ID:%.*]] = zext i32 [[TMP6]] to i64
-// CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7
-// CHECK-NEXT:    [[THREAD_ID:%.*]] = zext i32 [[TMP7]] to i64
-// CHECK-NEXT:    [[TMP8:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4
-// CHECK-NEXT:    [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP8]], [[THREAD_ID]]
-// CHECK-NEXT:    [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
-// CHECK-NEXT:    call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]])
-// CHECK-NEXT:    [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 1
-// CHECK-NEXT:    [[TMP10:%.*]] = urem i64 [[TMP9]], 2
-// CHECK-NEXT:    [[TMP11:%.*]] = udiv i64 [[LINEAR_INDEX]], 2
-// CHECK-NEXT:    [[TMP12:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4
-// CHECK-NEXT:    br i1 [[TMP12]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]]
-// CHECK:       sort.in_bounds-after:
-// CHECK-NEXT:    ret void
-// CHECK:       sort.in_bounds-true:
-// CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP10]], 2
-// CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP17]], 1
-// CHECK-NEXT:    [[TMP19:%.*]] = icmp slt i64 [[TMP17]], [[TMP18]]
-// CHECK-NEXT:    [[TMP20:%.*]] = icmp slt i64 [[TMP18]], 3
-// CHECK-NEXT:    [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]]
-// CHECK-NEXT:    br i1 [[TMP21]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]]
-// CHECK:       smaller_comparison_index-after:
-// CHECK-NEXT:    br label [[SORT_IN_BOUNDS_AFTER]]
-// CHECK:       smaller_comparison_index-true:
-// CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP18]]
-// CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP17]]
-// CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP18]]
-// CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP17]]
-// CHECK-NEXT:    call void @region_0_6(i32* [[TMP22]], i32* [[TMP23]], float* [[TMP24]], float* [[TMP25]], i8* [[COMPARE_RETURN_BUFFER]])
-// CHECK-NEXT:    [[TMP26:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1
-// CHECK-NEXT:    [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP26]], 0
-// CHECK-NEXT:    br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]]
-// CHECK:       is_smaller_than-after:
-// CHECK-NEXT:    br label [[SMALLER_COMPARISON_INDEX_AFTER]]
-// CHECK:       is_smaller_than-true:
-// CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP22]], align 4
-// CHECK-NEXT:    [[TMP28:%.*]] = load i32, i32* [[TMP23]], align 4
-// CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP17]]
-// CHECK-NEXT:    store i32 [[TMP27]], i32* [[TMP29]], align 4
-// CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP18]]
-// CHECK-NEXT:    store i32 [[TMP28]], i32* [[TMP30]], align 4
-// CHECK-NEXT:    [[TMP31:%.*]] = load float, float* [[TMP24]], align 4
-// CHECK-NEXT:    [[TMP32:%.*]] = load float, float* [[TMP25]], align 4
-// CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP17]]
-// CHECK-NEXT:    store float [[TMP31]], float* [[TMP33]], align 4
-// CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP18]]
-// CHECK-NEXT:    store float [[TMP32]], float* [[TMP34]], align 4
-// CHECK-NEXT:    br label [[IS_SMALLER_THAN_AFTER]]
 ENTRY main {
   x = s32[2, 3] parameter(0)
   y = f32[2, 3] parameter(1)