; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -o - -S -load-store-vectorizer -dce %s | FileCheck %s

; Make sure LoadStoreVectorizer vectorizes the loads below.
; In order to prove that the vectorization is safe, it tries to
; match nested adds and find an expression that adds a constant
; value to an existing index, where the result doesn't overflow.
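; For example, the loads below index %src at %v1 + (%v0 - 1), %v1 + %v0,
; %v1 + (%v0 + 1), and %v1 + (%v0 + 2): four consecutive byte addresses.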

target triple = "x86_64--"

define void @ld_v4i8_add_nsw(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nsw(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add nsw i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nsw i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add nsw i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add nsw i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}
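
; Same pattern as above, but using nuw adds and zext of the index.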
define void @ld_v4i8_add_nuw(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nuw(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nuw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nuw i32 %v0, -1
  %tmp1 = add nuw i32 %v1, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nuw i32 %v1, %v0
  %tmp6 = zext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nuw i32 %v0, 1
  %tmp10 = add nuw i32 %v1, %tmp9
  %tmp11 = zext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nuw i32 %v0, 2
  %tmp15 = add nuw i32 %v1, %tmp14
  %tmp16 = zext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

; Apply different operand orders for the nested add sequences
define void @ld_v4i8_add_nsw_operand_orders(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nsw_operand_orders(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add nsw i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nsw i32 %v0, %v1
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add nsw i32 %tmp9, %v1
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add nsw i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

; Apply different operand orders for the nested add sequences
define void @ld_v4i8_add_nuw_operand_orders(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nuw_operand_orders(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nuw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nuw i32 %v0, -1
  %tmp1 = add nuw i32 %v1, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nuw i32 %v0, %v1
  %tmp6 = zext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nuw i32 %v0, 1
  %tmp10 = add nuw i32 %tmp9, %v1
  %tmp11 = zext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nuw i32 %v0, 2
  %tmp15 = add nuw i32 %v1, %tmp14
  %tmp16 = zext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}
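
; No nsw/nuw flags on the adds below; instead the indices are known multiples
; of 4, so the loads at offsets 0, 1 and 2 from %v0 + %v1 are provably
; consecutive and become a <3 x i8> load, while the %v0 - 1 load stays scalar.
; In @ld_v4i8_add_known_bits1 the offsets are 0 through 3, so all four loads
; fold into a single <4 x i8> load.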
define void @ld_v4i8_add_known_bits(i32 %ind0, i32 %ind1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 4
; CHECK-NEXT:    [[TMP:%.*]] = add i32 [[V0]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <3 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, <3 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 4
  %tmp = add i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

define void @ld_v4i8_add_known_bits1(i32 %ind0, i32 %ind1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 4
  %tmp = add i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}
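
; Here the known bits on %v0 and %v1 come from llvm.assume calls rather than
; from the index computation itself.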
define void @ld_v4i8_add_known_bits_by_assume(i32 %ind0, i32 %ind1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits_by_assume(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 3
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 3
  %v1 = mul i32 %ind1, 3
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

declare void @llvm.assume(i1)
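
; The assumes are made directly on the function arguments; the %v0 - 1 load
; cannot be proven adjacent and stays scalar, so only a <3 x i8> load forms.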
define void @ld_v4i8_add_assume_on_arg(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_assume_on_arg(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0:%.*]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1:%.*]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <3 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, <3 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}
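
; As above, but with offsets 0 through 3, so all four loads are vectorized.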
define void @ld_v4i8_add_assume_on_arg1(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_assume_on_arg1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0:%.*]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1:%.*]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add nsw i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

; Address computations are partly separated by control flow, with llvm.assume
; placed in the second basic block
define void @ld_v2i8_add_different_contexts(i32 %ind0, i32 %ind1, i8* %src, <2 x i8>* %dst) {
; CHECK-LABEL: @ld_v2i8_add_different_contexts(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[V1]], 0
; CHECK-NEXT:    br i1 [[BIT_COND]], label [[BB_LOADS:%.*]], label [[BB_SKIP:%.*]]
; CHECK:       bb.loads:
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], <2 x i8>* [[DST:%.*]]
; CHECK-NEXT:    br label [[BB_SKIP]]
; CHECK:       bb.skip:
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %bit_cond = icmp eq i32 %v1, 0
  br i1 %bit_cond, label %bb.loads, label %bb.skip

bb.loads:
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, <2 x i8>* %dst
  br label %bb.skip

bb.skip:
  ret void
}

; Same as ld_v2i8_add_different_contexts but with llvm.assume placed between loads
define void @ld_v2i8_add_different_contexts1(i32 %ind0, i32 %ind1, i8* %src, <2 x i8>* %dst) {
; CHECK-LABEL: @ld_v2i8_add_different_contexts1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[V1]], 0
; CHECK-NEXT:    br i1 [[BIT_COND]], label [[BB_LOADS:%.*]], label [[BB_SKIP:%.*]]
; CHECK:       bb.loads:
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], <2 x i8>* [[DST:%.*]]
; CHECK-NEXT:    br label [[BB_SKIP]]
; CHECK:       bb.skip:
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %bit_cond = icmp eq i32 %v1, 0
  br i1 %bit_cond, label %bb.loads, label %bb.skip

bb.loads:
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, <2 x i8>* %dst
  br label %bb.skip

bb.skip:
  ret void
}

; llvm.assume is placed between loads in a single basic block
define void @ld_v2i8_add_context(i32 %ind0, i32 %ind1, i8* %src, <2 x i8>* %dst) {
; CHECK-LABEL: @ld_v2i8_add_context(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], <2 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %bit_cond = icmp eq i32 %tmp5, 0
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, <2 x i8>* %dst
  ret void
}

; Placing llvm.assume after all the loads and stores in the basic block still works
define void @ld_v2i8_add_context1(i32 %ind0, i32 %ind1, i8* %src, <2 x i8>* %dst) {
; CHECK-LABEL: @ld_v2i8_add_context1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], <2 x i8>* [[DST:%.*]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, <2 x i8>* %dst
  %bit_cond = icmp eq i32 %tmp5, 0
  call void @llvm.assume(i1 %bit_cond)
  ret void
}

; Make sure we don't vectorize the loads below: the sources of the sext
; instructions have neither the nsw flag nor known bits that would allow
; the vectorization.
define void @ld_v4i8_add_not_safe(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_not_safe(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[TMP7]], align 1
; CHECK-NEXT:    [[TMP9:%.*]] = add nsw i32 [[V0]], 1
; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[V1]], [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP11]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP12]], align 1
; CHECK-NEXT:    [[TMP14:%.*]] = add nsw i32 [[V0]], 2
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[V1]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = load i8, i8* [[TMP17]], align 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP8]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP13]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP18]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}