// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: powerpc-registered-target
// RUN: %clang_cc1 -flax-vector-conversions=none -no-opaque-pointers -triple powerpc64-unknown-unknown -emit-llvm %s \
// RUN:   -target-cpu pwr8 -o - | FileCheck %s -check-prefix=BE-PWR8
// RUN: %clang_cc1 -flax-vector-conversions=none -no-opaque-pointers -triple powerpc64le-unknown-unknown -emit-llvm %s \
// RUN:   -target-cpu pwr8 -o - | FileCheck %s -check-prefix=LE-PWR8

// RUN: %clang_cc1 -flax-vector-conversions=none -no-opaque-pointers -triple powerpc64-unknown-unknown -emit-llvm %s \
// RUN:   -target-cpu pwr9 -o - | FileCheck %s -check-prefix=BE-PWR9
// RUN: %clang_cc1 -flax-vector-conversions=none -no-opaque-pointers -triple powerpc64le-unknown-unknown -emit-llvm %s \
// RUN:   -target-cpu pwr9 -o - | FileCheck %s -check-prefix=LE-PWR9
// RUN: %clang_cc1 -flax-vector-conversions=none -no-opaque-pointers -triple powerpc-unknown-unknown -emit-llvm %s \
// RUN:   -target-cpu pwr9 -o - | FileCheck %s -check-prefix=BE32-PWR9

#include <altivec.h>
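
// This file checks the IR emitted for the __vec_ldrmb/__vec_strmb
// ("load/store right-most bytes with length") builtins across big- and
// little-endian PowerPC targets at -target-cpu pwr8 and pwr9; the
// BE-/LE-PWR8, BE-/LE-PWR9, and BE32-PWR9 prefixes in the RUN lines above
// select the expected lowering for each configuration.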
// BE-PWR8-LABEL: @test_ldrmb1(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i32 0
// BE-PWR8-NEXT:    [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP0]])
// BE-PWR8-NEXT:    [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP1]])
// BE-PWR8-NEXT:    [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP0]])
// BE-PWR8-NEXT:    [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]])
// BE-PWR8-NEXT:    [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>)
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8>
// BE-PWR8-NEXT:    ret <16 x i8> [[TMP2]]
//
// LE-PWR8-LABEL: @test_ldrmb1(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i32 0
// LE-PWR8-NEXT:    [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP0]])
// LE-PWR8-NEXT:    [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP1]])
// LE-PWR8-NEXT:    [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP0]])
// LE-PWR8-NEXT:    [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_HI]], <4 x i32> [[LD_LO]], <16 x i8> [[MASK1]])
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE1]] to <16 x i8>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
// LE-PWR8-NEXT:    ret <16 x i8> [[TMP3]]
//
// BE-PWR9-LABEL: @test_ldrmb1(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store i8* [[TMP0]], i8** [[__A_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 1, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
// BE-PWR9-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// BE-PWR9-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// BE-PWR9-NEXT:    ret <16 x i8> [[TMP14]]
//
// LE-PWR9-LABEL: @test_ldrmb1(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store i8* [[TMP0]], i8** [[__A_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 1, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
// LE-PWR9-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// LE-PWR9-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// LE-PWR9-NEXT:    ret <16 x i8> [[TMP14]]
//
// BE32-PWR9-LABEL: @test_ldrmb1(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i32 0
// BE32-PWR9-NEXT:    [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP0]])
// BE32-PWR9-NEXT:    [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP1]])
// BE32-PWR9-NEXT:    [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP0]])
// BE32-PWR9-NEXT:    [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]])
// BE32-PWR9-NEXT:    [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>)
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8>
// BE32-PWR9-NEXT:    ret <16 x i8> [[TMP2]]
//
vector unsigned char test_ldrmb1(char *ptr) { return __vec_ldrmb(ptr, 1); }
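
// Note (reading of the checks above): on PWR8 and on 32-bit pwr9, a
// length-1 __vec_ldrmb is expected to lower to a pair of lvx loads combined
// through an lvsl/lvsr mask and vperm, followed by a permute that places the
// loaded byte in the right-most lane; on 64-bit PWR9 it lowers to the
// length-encoding lxvll intrinsic instead, with the length carried in the
// top byte of the i64 operand (hence the shl by 56).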

// BE-PWR8-LABEL: @test_strmb1(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP3:%.*]] = extractelement <16 x i8> [[TMP1]], i64 15
// BE-PWR8-NEXT:    store i8 [[TMP3]], i8* [[TMP2]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb1(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP3:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
// LE-PWR8-NEXT:    store i8 [[TMP3]], i8* [[TMP2]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb1(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 1, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb1(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 1, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb1(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = extractelement <16 x i8> [[TMP1]], i64 15
// BE32-PWR9-NEXT:    store i8 [[TMP3]], i8* [[TMP2]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb1(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 1, data);
}
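
// Note: for __vec_strmb the 64-bit PWR9 targets use stxvll (again with the
// length shifted into bits 63:56), while PWR8 and the 32-bit target
// decompose the partial store into power-of-two scalar stores taken from
// the right-most vector elements; on little endian those scalars pass
// through llvm.bswap.* so the bytes land in memory in big-endian
// (right-to-left) order, as the following tests show for lengths 2-7.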

// BE-PWR8-LABEL: @test_strmb2(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i16*
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 7
// BE-PWR8-NEXT:    store i16 [[TMP5]], i16* [[TMP4]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb2(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i16*
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
// LE-PWR8-NEXT:    store i16 [[TMP6]], i16* [[TMP4]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb2(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 2, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb2(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 2, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb2(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i16*
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 7
// BE32-PWR9-NEXT:    store i16 [[TMP5]], i16* [[TMP4]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb2(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 2, data);
}

// BE-PWR8-LABEL: @test_strmb3(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// BE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i16*
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 7
// BE-PWR8-NEXT:    store i16 [[TMP5]], i16* [[TMP4]], align 1
// BE-PWR8-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 13
// BE-PWR8-NEXT:    store i8 [[TMP7]], i8* [[TMP6]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb3(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// LE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i16*
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
// LE-PWR8-NEXT:    store i16 [[TMP6]], i16* [[TMP4]], align 1
// LE-PWR8-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP8:%.*]] = extractelement <16 x i8> [[TMP1]], i64 2
// LE-PWR8-NEXT:    store i8 [[TMP8]], i8* [[TMP7]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb3(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 3, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb3(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 3, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb3(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// BE32-PWR9-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i16*
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP2]], i64 7
// BE32-PWR9-NEXT:    store i16 [[TMP5]], i16* [[TMP4]], align 1
// BE32-PWR9-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 13
// BE32-PWR9-NEXT:    store i8 [[TMP7]], i8* [[TMP6]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb3(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 3, data);
}
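
// For length 3 the non-stxvll lowering splits the store as 2 + 1: an i16
// store at offset 1 plus an i8 store at offset 0, extracted from the
// right-most halfword/byte of the source vector (with bswap on LE). The
// remaining tests follow the same pattern, e.g. 5 = 4 + 1 and 6 = 4 + 2.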

// BE-PWR8-LABEL: @test_strmb4(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32*
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
// BE-PWR8-NEXT:    store i32 [[TMP5]], i32* [[TMP4]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb4(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32*
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
// LE-PWR8-NEXT:    store i32 [[TMP6]], i32* [[TMP4]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb4(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 4, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb4(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 4, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb4(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32*
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
// BE32-PWR9-NEXT:    store i32 [[TMP5]], i32* [[TMP4]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb4(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 4, data);
}

// BE-PWR8-LABEL: @test_strmb5(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// BE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32*
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
// BE-PWR8-NEXT:    store i32 [[TMP5]], i32* [[TMP4]], align 1
// BE-PWR8-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 11
// BE-PWR8-NEXT:    store i8 [[TMP7]], i8* [[TMP6]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb5(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// LE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32*
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
// LE-PWR8-NEXT:    store i32 [[TMP6]], i32* [[TMP4]], align 1
// LE-PWR8-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP8:%.*]] = extractelement <16 x i8> [[TMP1]], i64 4
// LE-PWR8-NEXT:    store i8 [[TMP8]], i8* [[TMP7]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb5(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 5, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb5(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 5, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb5(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// BE32-PWR9-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32*
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
// BE32-PWR9-NEXT:    store i32 [[TMP5]], i32* [[TMP4]], align 1
// BE32-PWR9-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 11
// BE32-PWR9-NEXT:    store i8 [[TMP7]], i8* [[TMP6]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb5(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 5, data);
}

// BE-PWR8-LABEL: @test_strmb6(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2
// BE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32*
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
// BE-PWR8-NEXT:    store i32 [[TMP5]], i32* [[TMP4]], align 1
// BE-PWR8-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE-PWR8-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16*
// BE-PWR8-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5
// BE-PWR8-NEXT:    store i16 [[TMP9]], i16* [[TMP8]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb6(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2
// LE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32*
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
// LE-PWR8-NEXT:    store i32 [[TMP6]], i32* [[TMP4]], align 1
// LE-PWR8-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// LE-PWR8-NEXT:    [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16*
// LE-PWR8-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP7]], i64 2
// LE-PWR8-NEXT:    [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
// LE-PWR8-NEXT:    store i16 [[TMP11]], i16* [[TMP9]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb6(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 6, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb6(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 6, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb6(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2
// BE32-PWR9-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32*
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
// BE32-PWR9-NEXT:    store i32 [[TMP5]], i32* [[TMP4]], align 1
// BE32-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE32-PWR9-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16*
// BE32-PWR9-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5
// BE32-PWR9-NEXT:    store i16 [[TMP9]], i16* [[TMP8]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb6(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 6, data);
}
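
// Reading note (hand-written, not autogenerated): for a 6-byte store the
// PWR8 and 32-bit paths decompose __vec_strmb into natural-width scalar
// stores (one i32 plus one i16), byte-reversing each piece via llvm.bswap
// on little-endian, while the 64-bit PWR9 targets funnel every length
// through the vector path: build a permute mask with lvsl on address
// 16 - n, rotate the data with vperm, then issue one length-specified
// stxvll. A rough semantic model of the intrinsic, inferred from the
// checks above ("store the rightmost n bytes of the big-endian register
// image"); a model only, not the real altivec.h implementation:
//
//   static void strmb_model(unsigned char *p, unsigned n,
//                           vector unsigned char d) {
//     unsigned char img[16];
//     for (int i = 0; i < 16; ++i)
//   #ifdef __LITTLE_ENDIAN__
//       img[i] = d[15 - i];            // element 0 is the rightmost byte
//   #else
//       img[i] = d[i];                 // element 15 is the rightmost byte
//   #endif
//     for (unsigned i = 0; i < n; ++i)
//       p[i] = img[16 - n + i];        // copy the rightmost n bytes
//   }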

// BE-PWR8-LABEL: @test_strmb7(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3
// BE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32*
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
// BE-PWR8-NEXT:    store i32 [[TMP5]], i32* [[TMP4]], align 1
// BE-PWR8-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE-PWR8-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// BE-PWR8-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16*
// BE-PWR8-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5
// BE-PWR8-NEXT:    store i16 [[TMP9]], i16* [[TMP8]], align 1
// BE-PWR8-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 9
// BE-PWR8-NEXT:    store i8 [[TMP11]], i8* [[TMP10]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb7(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3
// LE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32*
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
// LE-PWR8-NEXT:    store i32 [[TMP6]], i32* [[TMP4]], align 1
// LE-PWR8-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// LE-PWR8-NEXT:    [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// LE-PWR8-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16*
// LE-PWR8-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP7]], i64 2
// LE-PWR8-NEXT:    [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
// LE-PWR8-NEXT:    store i16 [[TMP11]], i16* [[TMP9]], align 1
// LE-PWR8-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP13:%.*]] = extractelement <16 x i8> [[TMP1]], i64 6
// LE-PWR8-NEXT:    store i8 [[TMP13]], i8* [[TMP12]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb7(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 7, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb7(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 7, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb7(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3
// BE32-PWR9-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i32*
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 3
// BE32-PWR9-NEXT:    store i32 [[TMP5]], i32* [[TMP4]], align 1
// BE32-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE32-PWR9-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// BE32-PWR9-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16*
// BE32-PWR9-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 5
// BE32-PWR9-NEXT:    store i16 [[TMP9]], i16* [[TMP8]], align 1
// BE32-PWR9-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 9
// BE32-PWR9-NEXT:    store i8 [[TMP11]], i8* [[TMP10]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb7(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 7, data);
}
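
// Reading note (hand-written): 7 is the first length needing three scalar
// pieces on the non-stxvll paths; the checks show a greedy power-of-two
// split into 4 + 2 + 1 bytes stored from the highest offset down, with the
// little-endian side byte-reversing each multi-byte piece (llvm.bswap.i32,
// llvm.bswap.i16) so both endiannesses leave identical bytes in memory.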

// BE-PWR8-LABEL: @test_strmb8(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE-PWR8-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb8(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
// LE-PWR8-NEXT:    store i64 [[TMP6]], i64* [[TMP4]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb8(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 8, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb8(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 8, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb8(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE32-PWR9-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb8(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 8, data);
}
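
// Reading note (hand-written): length 8 collapses to a single unaligned
// i64 store on the scalar paths (bswap'd on LE), while the 64-bit PWR9
// targets still go through stxvll. The stxvll/lxvll checks place the byte
// count in the most-significant byte of the length operand, which is why
// every PWR9 block computes `shl i64 %len, 56`; a minimal sketch of that
// encoding (an inference from the IR above, not a quoted ISA definition):
//
//   static unsigned long long xvll_len(unsigned long long nbytes) {
//     return nbytes << 56;   // length is consumed from the top byte
//   }
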
// BE-PWR8-LABEL: @test_ldrmb9(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i32 8
// BE-PWR8-NEXT:    [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP0]])
// BE-PWR8-NEXT:    [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP1]])
// BE-PWR8-NEXT:    [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP0]])
// BE-PWR8-NEXT:    [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]])
// BE-PWR8-NEXT:    [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> <i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24>)
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8>
// BE-PWR8-NEXT:    ret <16 x i8> [[TMP2]]
//
// LE-PWR8-LABEL: @test_ldrmb9(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i32 8
// LE-PWR8-NEXT:    [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP0]])
// LE-PWR8-NEXT:    [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP1]])
// LE-PWR8-NEXT:    [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP0]])
// LE-PWR8-NEXT:    [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_HI]], <4 x i32> [[LD_LO]], <16 x i8> [[MASK1]])
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE1]] to <16 x i8>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
// LE-PWR8-NEXT:    ret <16 x i8> [[TMP3]]
//
// BE-PWR9-LABEL: @test_ldrmb9(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store i8* [[TMP0]], i8** [[__A_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 9, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
// BE-PWR9-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// BE-PWR9-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// BE-PWR9-NEXT:    ret <16 x i8> [[TMP14]]
//
// LE-PWR9-LABEL: @test_ldrmb9(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store i8* [[TMP0]], i8** [[__A_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 9, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
// LE-PWR9-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// LE-PWR9-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// LE-PWR9-NEXT:    ret <16 x i8> [[TMP14]]
//
// BE32-PWR9-LABEL: @test_ldrmb9(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i32 8
// BE32-PWR9-NEXT:    [[LD_LO:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP0]])
// BE32-PWR9-NEXT:    [[LD_HI:%.*]] = call <4 x i32> @llvm.ppc.altivec.lvx(i8* [[TMP1]])
// BE32-PWR9-NEXT:    [[MASK1:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP0]])
// BE32-PWR9-NEXT:    [[SHUFFLE1:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[LD_LO]], <4 x i32> [[LD_HI]], <16 x i8> [[MASK1]])
// BE32-PWR9-NEXT:    [[SHUFFLE2:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> zeroinitializer, <4 x i32> [[SHUFFLE1]], <16 x i8> <i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24>)
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE2]] to <16 x i8>
// BE32-PWR9-NEXT:    ret <16 x i8> [[TMP2]]
//
vector unsigned char test_ldrmb9(char *ptr) { return __vec_ldrmb(ptr, 9); }
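
// Reading note (hand-written): the load direction mirrors the stores. On
// 64-bit PWR9, __vec_ldrmb is a length-specified lxvll followed by a vperm
// whose mask comes from lvsr on address 16 - n, right-justifying the n
// loaded bytes. On PWR8 and BE32 it is two lvx loads covering the range,
// a vperm keyed by lvsl (lvsr on LE) to splice them, and a final shuffle
// against zeroinitializer keeping only the requested bytes: the
// <i8 9 ... i8 24> mask above selects seven zero bytes followed by the
// nine loaded ones.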

// BE-PWR8-LABEL: @test_strmb9(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// BE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE-PWR8-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE-PWR8-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 7
// BE-PWR8-NEXT:    store i8 [[TMP7]], i8* [[TMP6]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb9(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// LE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
// LE-PWR8-NEXT:    store i64 [[TMP6]], i64* [[TMP4]], align 1
// LE-PWR8-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP8:%.*]] = extractelement <16 x i8> [[TMP1]], i64 8
// LE-PWR8-NEXT:    store i8 [[TMP8]], i8* [[TMP7]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb9(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 9, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb9(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 9, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb9(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// BE32-PWR9-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE32-PWR9-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE32-PWR9-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP7:%.*]] = extractelement <16 x i8> [[TMP1]], i64 7
// BE32-PWR9-NEXT:    store i8 [[TMP7]], i8* [[TMP6]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb9(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 9, data);
}
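
// Reading note (hand-written): the 9-byte store is the mirror of the
// 9-byte load above: one i64 store at ptr+1 plus a single byte at ptr.
// That byte is vector element 7 on big-endian but element 8 on
// little-endian, consistent with the register-image model sketched after
// test_strmb6.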

// BE-PWR8-LABEL: @test_strmb10(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2
// BE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE-PWR8-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE-PWR8-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE-PWR8-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16*
// BE-PWR8-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3
// BE-PWR8-NEXT:    store i16 [[TMP9]], i16* [[TMP8]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb10(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2
// LE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
// LE-PWR8-NEXT:    store i64 [[TMP6]], i64* [[TMP4]], align 1
// LE-PWR8-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// LE-PWR8-NEXT:    [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16*
// LE-PWR8-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP7]], i64 4
// LE-PWR8-NEXT:    [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
// LE-PWR8-NEXT:    store i16 [[TMP11]], i16* [[TMP9]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb10(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 10, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb10(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 10, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb10(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2
// BE32-PWR9-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE32-PWR9-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE32-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE32-PWR9-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16*
// BE32-PWR9-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3
// BE32-PWR9-NEXT:    store i16 [[TMP9]], i16* [[TMP8]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb10(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 10, data);
}
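
// Reading note (hand-written): 10 bytes splits as 8 + 2. The halfword
// index differs by endianness (element 3 on BE, element 4 plus
// llvm.bswap.i16 on LE) even though both variants store bytes 6 and 7 of
// the big-endian register image at ptr.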

// BE-PWR8-LABEL: @test_strmb11(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3
// BE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE-PWR8-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE-PWR8-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE-PWR8-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// BE-PWR8-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16*
// BE-PWR8-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3
// BE-PWR8-NEXT:    store i16 [[TMP9]], i16* [[TMP8]], align 1
// BE-PWR8-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 5
// BE-PWR8-NEXT:    store i8 [[TMP11]], i8* [[TMP10]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb11(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3
// LE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
// LE-PWR8-NEXT:    store i64 [[TMP6]], i64* [[TMP4]], align 1
// LE-PWR8-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// LE-PWR8-NEXT:    [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// LE-PWR8-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16*
// LE-PWR8-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP7]], i64 4
// LE-PWR8-NEXT:    [[TMP11:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
// LE-PWR8-NEXT:    store i16 [[TMP11]], i16* [[TMP9]], align 1
// LE-PWR8-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP13:%.*]] = extractelement <16 x i8> [[TMP1]], i64 10
// LE-PWR8-NEXT:    store i8 [[TMP13]], i8* [[TMP12]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb11(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 11, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb11(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 11, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb11(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3
// BE32-PWR9-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE32-PWR9-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE32-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE32-PWR9-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// BE32-PWR9-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i16*
// BE32-PWR9-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP6]], i64 3
// BE32-PWR9-NEXT:    store i16 [[TMP9]], i16* [[TMP8]], align 1
// BE32-PWR9-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 5
// BE32-PWR9-NEXT:    store i8 [[TMP11]], i8* [[TMP10]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb11(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 11, data);
}
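
// Informal note (comment only, not a FileCheck directive): for
// __vec_strmb(ptr, 11, data) the BE32 block above splits the 11-byte
// right-most store into power-of-two pieces, 11 = 8 + 2 + 1, i.e. an
// unaligned i64 store at ptr+3, an i16 store at ptr+1 and an i8 store at
// ptr+0. The PWR9 paths instead build a permute mask with lvsl, shuffle
// the data with vperm and issue one stxvll whose i64 length operand holds
// the byte count in its top byte (11 << 56).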

// BE-PWR8-LABEL: @test_strmb12(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 4
// BE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE-PWR8-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE-PWR8-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE-PWR8-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32*
// BE-PWR8-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
// BE-PWR8-NEXT:    store i32 [[TMP9]], i32* [[TMP8]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb12(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 4
// LE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
// LE-PWR8-NEXT:    store i64 [[TMP6]], i64* [[TMP4]], align 1
// LE-PWR8-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// LE-PWR8-NEXT:    [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
// LE-PWR8-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2
// LE-PWR8-NEXT:    [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
// LE-PWR8-NEXT:    store i32 [[TMP11]], i32* [[TMP9]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb12(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 12, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb12(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 12, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb12(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 4
// BE32-PWR9-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE32-PWR9-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE32-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE32-PWR9-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32*
// BE32-PWR9-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
// BE32-PWR9-NEXT:    store i32 [[TMP9]], i32* [[TMP8]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb12(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 12, data);
}
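
// Informal note (comment only): 12 = 8 + 4, so the PWR8 and BE32 paths
// above emit one unaligned i64 store at ptr+4 (vector bytes 8..15) and one
// i32 store at ptr+0 (vector bytes 4..7). Little endian byte-swaps each
// extracted piece (llvm.bswap.i64 / llvm.bswap.i32) and takes the elements
// from the opposite end of the vector. A scalar sketch of the BE lowering:
//   uint64_t hi = /* <2 x i64> element 1 */;  // vector bytes 8..15
//   uint32_t lo = /* <4 x i32> element 1 */;  // vector bytes 4..7
//   memcpy(ptr + 4, &hi, 8);                  // fills ptr[4..11]
//   memcpy(ptr,     &lo, 4);                  // fills ptr[0..3]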

// BE-PWR8-LABEL: @test_strmb13(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 5
// BE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE-PWR8-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE-PWR8-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE-PWR8-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// BE-PWR8-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32*
// BE-PWR8-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
// BE-PWR8-NEXT:    store i32 [[TMP9]], i32* [[TMP8]], align 1
// BE-PWR8-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 3
// BE-PWR8-NEXT:    store i8 [[TMP11]], i8* [[TMP10]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb13(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 5
// LE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
// LE-PWR8-NEXT:    store i64 [[TMP6]], i64* [[TMP4]], align 1
// LE-PWR8-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// LE-PWR8-NEXT:    [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// LE-PWR8-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
// LE-PWR8-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2
// LE-PWR8-NEXT:    [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
// LE-PWR8-NEXT:    store i32 [[TMP11]], i32* [[TMP9]], align 1
// LE-PWR8-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP13:%.*]] = extractelement <16 x i8> [[TMP1]], i64 12
// LE-PWR8-NEXT:    store i8 [[TMP13]], i8* [[TMP12]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb13(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 13, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb13(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 13, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb13(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 5
// BE32-PWR9-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE32-PWR9-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE32-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE32-PWR9-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// BE32-PWR9-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32*
// BE32-PWR9-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
// BE32-PWR9-NEXT:    store i32 [[TMP9]], i32* [[TMP8]], align 1
// BE32-PWR9-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP11:%.*]] = extractelement <16 x i8> [[TMP1]], i64 3
// BE32-PWR9-NEXT:    store i8 [[TMP11]], i8* [[TMP10]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb13(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 13, data);
}
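
// Informal note (comment only): 13 = 8 + 4 + 1, giving stores at ptr+5
// (i64), ptr+1 (i32) and ptr+0 (i8). On big endian the trailing byte is
// vector element 3 (16 - 13); on little endian it is element 12, and the
// wider pieces are byte-swapped before being stored.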

// BE-PWR8-LABEL: @test_strmb14(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 6
// BE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE-PWR8-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE-PWR8-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE-PWR8-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2
// BE-PWR8-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32*
// BE-PWR8-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
// BE-PWR8-NEXT:    store i32 [[TMP9]], i32* [[TMP8]], align 1
// BE-PWR8-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE-PWR8-NEXT:    [[TMP11:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i16*
// BE-PWR8-NEXT:    [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1
// BE-PWR8-NEXT:    store i16 [[TMP13]], i16* [[TMP12]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb14(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 6
// LE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
// LE-PWR8-NEXT:    store i64 [[TMP6]], i64* [[TMP4]], align 1
// LE-PWR8-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// LE-PWR8-NEXT:    [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2
// LE-PWR8-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
// LE-PWR8-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2
// LE-PWR8-NEXT:    [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
// LE-PWR8-NEXT:    store i32 [[TMP11]], i32* [[TMP9]], align 1
// LE-PWR8-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// LE-PWR8-NEXT:    [[TMP13:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i16*
// LE-PWR8-NEXT:    [[TMP15:%.*]] = extractelement <8 x i16> [[TMP12]], i64 6
// LE-PWR8-NEXT:    [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
// LE-PWR8-NEXT:    store i16 [[TMP16]], i16* [[TMP14]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb14(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 14, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb14(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 14, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb14(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 6
// BE32-PWR9-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE32-PWR9-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE32-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE32-PWR9-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 2
// BE32-PWR9-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32*
// BE32-PWR9-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
// BE32-PWR9-NEXT:    store i32 [[TMP9]], i32* [[TMP8]], align 1
// BE32-PWR9-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE32-PWR9-NEXT:    [[TMP11:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i16*
// BE32-PWR9-NEXT:    [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1
// BE32-PWR9-NEXT:    store i16 [[TMP13]], i16* [[TMP12]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb14(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 14, data);
}
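
// Informal note (comment only): 14 = 8 + 4 + 2, giving stores at ptr+6
// (i64), ptr+2 (i32) and ptr+0 (i16). Little endian byte-swaps each piece
// (llvm.bswap.i64/i32/i16) and reads the halfword from <8 x i16> element 6
// where big endian reads element 1.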

// BE-PWR8-LABEL: @test_strmb15(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 7
// BE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE-PWR8-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE-PWR8-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE-PWR8-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3
// BE-PWR8-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32*
// BE-PWR8-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
// BE-PWR8-NEXT:    store i32 [[TMP9]], i32* [[TMP8]], align 1
// BE-PWR8-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE-PWR8-NEXT:    [[TMP11:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// BE-PWR8-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i16*
// BE-PWR8-NEXT:    [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1
// BE-PWR8-NEXT:    store i16 [[TMP13]], i16* [[TMP12]], align 1
// BE-PWR8-NEXT:    [[TMP14:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE-PWR8-NEXT:    [[TMP15:%.*]] = extractelement <16 x i8> [[TMP1]], i64 1
// BE-PWR8-NEXT:    store i8 [[TMP15]], i8* [[TMP14]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb15(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// LE-PWR8-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 7
// LE-PWR8-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// LE-PWR8-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
// LE-PWR8-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
// LE-PWR8-NEXT:    store i64 [[TMP6]], i64* [[TMP4]], align 1
// LE-PWR8-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// LE-PWR8-NEXT:    [[TMP8:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3
// LE-PWR8-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
// LE-PWR8-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2
// LE-PWR8-NEXT:    [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
// LE-PWR8-NEXT:    store i32 [[TMP11]], i32* [[TMP9]], align 1
// LE-PWR8-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// LE-PWR8-NEXT:    [[TMP13:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// LE-PWR8-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i16*
// LE-PWR8-NEXT:    [[TMP15:%.*]] = extractelement <8 x i16> [[TMP12]], i64 6
// LE-PWR8-NEXT:    [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
// LE-PWR8-NEXT:    store i16 [[TMP16]], i16* [[TMP14]], align 1
// LE-PWR8-NEXT:    [[TMP17:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// LE-PWR8-NEXT:    [[TMP18:%.*]] = extractelement <16 x i8> [[TMP1]], i64 14
// LE-PWR8-NEXT:    store i8 [[TMP18]], i8* [[TMP17]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb15(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 15, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb15(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 15, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb15(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// BE32-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP0]], i64 7
// BE32-PWR9-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i64*
// BE32-PWR9-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
// BE32-PWR9-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 1
// BE32-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// BE32-PWR9-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP0]], i64 3
// BE32-PWR9-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to i32*
// BE32-PWR9-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP6]], i64 1
// BE32-PWR9-NEXT:    store i32 [[TMP9]], i32* [[TMP8]], align 1
// BE32-PWR9-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// BE32-PWR9-NEXT:    [[TMP11:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
// BE32-PWR9-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i16*
// BE32-PWR9-NEXT:    [[TMP13:%.*]] = extractelement <8 x i16> [[TMP10]], i64 1
// BE32-PWR9-NEXT:    store i16 [[TMP13]], i16* [[TMP12]], align 1
// BE32-PWR9-NEXT:    [[TMP14:%.*]] = getelementptr i8, i8* [[TMP0]], i64 0
// BE32-PWR9-NEXT:    [[TMP15:%.*]] = extractelement <16 x i8> [[TMP1]], i64 1
// BE32-PWR9-NEXT:    store i8 [[TMP15]], i8* [[TMP14]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb15(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 15, data);
}
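
// Informal note (comment only): 15 = 8 + 4 + 2 + 1, so the pieces land at
// ptr+7 (i64), ptr+3 (i32), ptr+1 (i16) and ptr+0 (i8). Big endian takes
// the trailing byte from vector element 1, little endian from element 14
// after byte-swapping the wider pieces. The PWR9 stxvll form is identical
// to the shorter lengths; only the encoded count (15 << 56) changes.
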
// BE-PWR8-LABEL: @test_ldrmb16(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// BE-PWR8-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
// BE-PWR8-NEXT:    ret <16 x i8> [[TMP2]]
//
// LE-PWR8-LABEL: @test_ldrmb16(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// LE-PWR8-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
// LE-PWR8-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> [[TMP2]], <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// LE-PWR8-NEXT:    ret <16 x i8> [[TMP3]]
//
// BE-PWR9-LABEL: @test_ldrmb16(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store i8* [[TMP0]], i8** [[__A_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 16, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
// BE-PWR9-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// BE-PWR9-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// BE-PWR9-NEXT:    ret <16 x i8> [[TMP14]]
//
// LE-PWR9-LABEL: @test_ldrmb16(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store i8* [[TMP0]], i8** [[__A_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 16, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
// LE-PWR9-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// LE-PWR9-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// LE-PWR9-NEXT:    ret <16 x i8> [[TMP14]]
//
// BE32-PWR9-LABEL: @test_ldrmb16(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
// BE32-PWR9-NEXT:    ret <16 x i8> [[TMP2]]
//
vector unsigned char test_ldrmb16(char *ptr) { return __vec_ldrmb(ptr, 16); }
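
// Informal note (comment only): at the full length of 16 bytes the PWR8
// and BE32 paths degenerate to a single align-1 <16 x i8> load, with an
// extra byte-reversing shufflevector on little endian so the result is in
// right-most-byte order. The PWR9 paths still use the generic sequence,
// lxvll with 16 << 56 followed by an lvsr-built mask and vperm.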

// BE-PWR8-LABEL: @test_strmb16(
// BE-PWR8-NEXT:  entry:
// BE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// BE-PWR8-NEXT:    store <16 x i8> [[TMP1]], <16 x i8>* [[TMP2]], align 1
// BE-PWR8-NEXT:    ret void
//
// LE-PWR8-LABEL: @test_strmb16(
// LE-PWR8-NEXT:  entry:
// LE-PWR8-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR8-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR8-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR8-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR8-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// LE-PWR8-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// LE-PWR8-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* [[TMP2]], align 1
// LE-PWR8-NEXT:    ret void
//
// BE-PWR9-LABEL: @test_strmb16(
// BE-PWR9-NEXT:  entry:
// BE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    store i64 16, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT:    ret void
//
// LE-PWR9-LABEL: @test_strmb16(
// LE-PWR9-NEXT:  entry:
// LE-PWR9-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
// LE-PWR9-NEXT:    [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 8
// LE-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// LE-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// LE-PWR9-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// LE-PWR9-NEXT:    store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    store i64 16, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT:    [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// LE-PWR9-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT:    [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT:    call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT:    ret void
//
// BE32-PWR9-LABEL: @test_strmb16(
// BE32-PWR9-NEXT:  entry:
// BE32-PWR9-NEXT:    [[PTR_ADDR:%.*]] = alloca i8*, align 4
// BE32-PWR9-NEXT:    [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE32-PWR9-NEXT:    store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    store <16 x i8> [[DATA:%.*]], <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 4
// BE32-PWR9-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[DATA_ADDR]], align 16
// BE32-PWR9-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
// BE32-PWR9-NEXT:    store <16 x i8> [[TMP1]], <16 x i8>* [[TMP2]], align 1
// BE32-PWR9-NEXT:    ret void
//
void test_strmb16(char *ptr, vector unsigned char data) {
  __vec_strmb(ptr, 16, data);
}
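
// Informal note (comment only): the 16-byte store mirrors the 16-byte
// load. PWR8 and BE32 emit one align-1 <16 x i8> store (byte-reversed
// first on little endian), while the PWR9 paths keep the generic
// lvsl/vperm/stxvll sequence rather than special-casing the full length.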