; Compiler projects using llvm
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512,AVX512-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512,AVX512-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512NOTDQ,AVX512NOTDQ-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512NOTDQ,AVX512NOTDQ-FAST-PERLANE

; Splat element 4 of an <8 x i1> mask loaded from %a0 into a <2 x i1>
; predicate, then store select(predicate, %a1, %a2) to %a3.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v8i1_broadcast_4_v2i1(ptr %a0,<2 x double> %a1,<2 x double> %a2,ptr %a3) {
; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $4, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $4, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, ptr %a0
    %d1 = shufflevector <8 x i1> %d0, <8 x i1> poison, <2 x i32> <i32 4, i32 4>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, ptr %a3
    ret void
}
; Splat element 7 (the last one) of an <8 x i1> mask loaded from %a0 into a
; <2 x i1> predicate, then store select(predicate, %a1, %a2) to %a3.
; The expected code shifts the mask by 6 and splats the upper 64-bit lane.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v8i1_broadcast_7_v2i1(ptr %a0,<2 x double> %a1,<2 x double> %a2,ptr %a3) {
; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $6, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $6, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, ptr %a0
    %d1 = shufflevector <8 x i1> %d0, <8 x i1> poison, <2 x i32> <i32 7, i32 7>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, ptr %a3
    ret void
}
; Splat element 8 of a <16 x i1> mask loaded from %a0 into a <2 x i1>
; predicate, then store select(predicate, %a1, %a2) to %a3.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v16i1_broadcast_8_v2i1(ptr %a0,<2 x double> %a1,<2 x double> %a2,ptr %a3) {
; AVX512-LABEL: load_v16i1_broadcast_8_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $8, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, ptr %a0
    %d1 = shufflevector <16 x i1> %d0, <16 x i1> poison, <2 x i32> <i32 8, i32 8>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, ptr %a3
    ret void
}
; Splat element 8 of a <16 x i1> mask loaded from %a0 into a <4 x i1>
; predicate, then store select(predicate, %a1, %a2) of <4 x float> to %a3.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v16i1_broadcast_8_v4i1(ptr %a0,<4 x float> %a1,<4 x float> %a2,ptr %a3) {
; AVX512-LABEL: load_v16i1_broadcast_8_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $8, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, ptr %a0
    %d1 = shufflevector <16 x i1> %d0, <16 x i1> poison, <4 x i32> <i32 8, i32 8, i32 8, i32 8>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, ptr %a3
    ret void
}
; Splat element 15 (the last one) of a <16 x i1> mask loaded from %a0 into a
; <2 x i1> predicate, then store select(predicate, %a1, %a2) to %a3.
; The expected code shifts the mask by 14 and splats the upper 64-bit lane.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v16i1_broadcast_15_v2i1(ptr %a0,<2 x double> %a1,<2 x double> %a2,ptr %a3) {
; AVX512-LABEL: load_v16i1_broadcast_15_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $14, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $14, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, ptr %a0
    %d1 = shufflevector <16 x i1> %d0, <16 x i1> poison, <2 x i32> <i32 15, i32 15>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, ptr %a3
    ret void
}
; Splat element 15 (the last one) of a <16 x i1> mask loaded from %a0 into a
; <4 x i1> predicate, then store select(predicate, %a1, %a2) of <4 x float>
; to %a3. The expected code shifts the mask by 12 and splats 32-bit lane 3.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v16i1_broadcast_15_v4i1(ptr %a0,<4 x float> %a1,<4 x float> %a2,ptr %a3) {
; AVX512-LABEL: load_v16i1_broadcast_15_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $12, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $12, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, ptr %a0
    %d1 = shufflevector <16 x i1> %d0, <16 x i1> poison, <4 x i32> <i32 15, i32 15, i32 15, i32 15>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, ptr %a3
    ret void
}
; Splat element 16 of a <32 x i1> mask loaded from %a0 into a <2 x i1>
; predicate, then store select(predicate, %a1, %a2) to %a3.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v32i1_broadcast_16_v2i1(ptr %a0,<2 x double> %a1,<2 x double> %a2,ptr %a3) {
; AVX512-LABEL: load_v32i1_broadcast_16_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $16, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, ptr %a0
    %d1 = shufflevector <32 x i1> %d0, <32 x i1> poison, <2 x i32> <i32 16, i32 16>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, ptr %a3
    ret void
}
; Splat element 16 of a <32 x i1> mask loaded from %a0 into a <4 x i1>
; predicate, then store select(predicate, %a1, %a2) of <4 x float> to %a3.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v32i1_broadcast_16_v4i1(ptr %a0,<4 x float> %a1,<4 x float> %a2,ptr %a3) {
; AVX512-LABEL: load_v32i1_broadcast_16_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $16, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, ptr %a0
    %d1 = shufflevector <32 x i1> %d0, <32 x i1> poison, <4 x i32> <i32 16, i32 16, i32 16, i32 16>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, ptr %a3
    ret void
}
; Splat element 16 of a <32 x i1> mask loaded from %a0 into an <8 x i1>
; predicate, then store select(predicate, %a1, %a2) of <8 x float> to %a3.
; The expected code loads the mask directly from byte offset 2 instead of
; loading all 32 bits and shifting.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v32i1_broadcast_16_v8i1(ptr %a0,<8 x float> %a1,<8 x float> %a2,ptr %a3) {
; AVX512-LABEL: load_v32i1_broadcast_16_v8i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 2(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %ymm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %ymm2
; AVX512-NEXT:    vpmovd2m %ymm2, %k1
; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 2(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %ymm2
; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, ptr %a0
    %d1 = shufflevector <32 x i1> %d0, <32 x i1> poison, <8 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, ptr %a3
    ret void
}
; Splat element 31 (the last one) of a <32 x i1> mask loaded from %a0 into a
; <2 x i1> predicate, then store select(predicate, %a1, %a2) to %a3.
; The expected code shifts the mask by 30 and splats the upper 64-bit lane.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v32i1_broadcast_31_v2i1(ptr %a0,<2 x double> %a1,<2 x double> %a2,ptr %a3) {
; AVX512-LABEL: load_v32i1_broadcast_31_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $30, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $30, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, ptr %a0
    %d1 = shufflevector <32 x i1> %d0, <32 x i1> poison, <2 x i32> <i32 31, i32 31>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, ptr %a3
    ret void
}
; Splat element 31 (the last one) of a <32 x i1> mask loaded from %a0 into a
; <4 x i1> predicate, then store select(predicate, %a1, %a2) of <4 x float>
; to %a3. The expected code shifts the mask by 28 and splats 32-bit lane 3.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v32i1_broadcast_31_v4i1(ptr %a0,<4 x float> %a1,<4 x float> %a2,ptr %a3) {
; AVX512-LABEL: load_v32i1_broadcast_31_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $28, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $28, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, ptr %a0
    %d1 = shufflevector <32 x i1> %d0, <32 x i1> poison, <4 x i32> <i32 31, i32 31, i32 31, i32 31>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, ptr %a3
    ret void
}
; Splat element 31 (the last one) of a <32 x i1> mask loaded from %a0 into an
; <8 x i1> predicate, then store select(predicate, %a1, %a2) of <8 x float>
; to %a3. The expected code loads the mask byte at offset 3 and splats lane 7.
; The splat differs per run configuration: with fast-variable-crosslane-shuffle
; it is one vpermd using an all-7 index vector; with only
; fast-variable-perlane-shuffle it is vpshufd followed by vpermq.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v32i1_broadcast_31_v8i1(ptr %a0,<8 x float> %a1,<8 x float> %a2,ptr %a3) {
; AVX512-FAST-LABEL: load_v32i1_broadcast_31_v8i1:
; AVX512-FAST:       # %bb.0:
; AVX512-FAST-NEXT:    kmovb 3(%rdi), %k0
; AVX512-FAST-NEXT:    vpmovm2d %k0, %ymm2
; AVX512-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512-FAST-NEXT:    vpermd %ymm2, %ymm3, %ymm2
; AVX512-FAST-NEXT:    vpmovd2m %ymm2, %k1
; AVX512-FAST-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512-FAST-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512-FAST-NEXT:    vzeroupper
; AVX512-FAST-NEXT:    retq
;
; AVX512-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1:
; AVX512-FAST-PERLANE:       # %bb.0:
; AVX512-FAST-PERLANE-NEXT:    kmovb 3(%rdi), %k0
; AVX512-FAST-PERLANE-NEXT:    vpmovm2d %k0, %ymm2
; AVX512-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
; AVX512-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX512-FAST-PERLANE-NEXT:    vpmovd2m %ymm2, %k1
; AVX512-FAST-PERLANE-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512-FAST-PERLANE-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512-FAST-PERLANE-NEXT:    vzeroupper
; AVX512-FAST-PERLANE-NEXT:    retq
;
; AVX512NOTDQ-FAST-LABEL: load_v32i1_broadcast_31_v8i1:
; AVX512NOTDQ-FAST:       # %bb.0:
; AVX512NOTDQ-FAST-NEXT:    movzbl 3(%rdi), %eax
; AVX512NOTDQ-FAST-NEXT:    kmovd %eax, %k1
; AVX512NOTDQ-FAST-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-FAST-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-FAST-NEXT:    vpermd %ymm2, %ymm3, %ymm2
; AVX512NOTDQ-FAST-NEXT:    vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-FAST-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-FAST-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-FAST-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-NEXT:    retq
;
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1:
; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT:    movzbl 3(%rdi), %eax
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovd %eax, %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-PERLANE-NEXT:    retq
    %d0 = load <32 x i1>, ptr %a0
    %d1 = shufflevector <32 x i1> %d0, <32 x i1> poison, <8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, ptr %a3
    ret void
}
; Splat element 32 of a <64 x i1> mask loaded from %a0 into a <2 x i1>
; predicate, then store select(predicate, %a1, %a2) to %a3.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v64i1_broadcast_32_v2i1(ptr %a0,<2 x double> %a1,<2 x double> %a2,ptr %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $32, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0, <64 x i1> poison, <2 x i32> <i32 32, i32 32>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, ptr %a3
    ret void
}
; Splat element 32 of a <64 x i1> mask loaded from %a0 into a <4 x i1>
; predicate, then store select(predicate, %a1, %a2) of <4 x float> to %a3.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v64i1_broadcast_32_v4i1(ptr %a0,<4 x float> %a1,<4 x float> %a2,ptr %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $32, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0, <64 x i1> poison, <4 x i32> <i32 32, i32 32, i32 32, i32 32>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, ptr %a3
    ret void
}
; Splat element 32 of a <64 x i1> mask loaded from %a0 into an <8 x i1>
; predicate, then store select(predicate, %a1, %a2) of <8 x float> to %a3.
; The expected code loads the mask directly from byte offset 4 instead of
; loading all 64 bits and shifting.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v64i1_broadcast_32_v8i1(ptr %a0,<8 x float> %a1,<8 x float> %a2,ptr %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v8i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 4(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %ymm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %ymm2
; AVX512-NEXT:    vpmovd2m %ymm2, %k1
; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %ymm2
; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0, <64 x i1> poison, <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, ptr %a3
    ret void
}
; Splat element 32 of a <64 x i1> mask loaded from %a0 into a <16 x i1>
; predicate, then store select(predicate, %a1, %a2) of <16 x float> to %a3.
; The expected code loads a 16-bit mask directly from byte offset 4.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v64i1_broadcast_32_v16i1(ptr %a0,<16 x float> %a1,<16 x float> %a2,ptr %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v16i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw 4(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %zmm2
; AVX512-NEXT:    vpbroadcastd %xmm2, %zmm2
; AVX512-NEXT:    vpmovd2m %zmm2, %k1
; AVX512-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovaps %zmm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %zmm2
; AVX512NOTDQ-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %zmm1, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0, <64 x i1> poison, <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
    store <16 x float> %d2, ptr %a3
    ret void
}
; Splat element 63 (the last one) of a <64 x i1> mask loaded from %a0 into a
; <2 x i1> predicate, then store select(predicate, %a1, %a2) to %a3.
; The expected code shifts the mask by 62 and splats the upper 64-bit lane.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v64i1_broadcast_63_v2i1(ptr %a0,<2 x double> %a1,<2 x double> %a2,ptr %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v2i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $62, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm2, %k1
; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $62, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0, <64 x i1> poison, <2 x i32> <i32 63, i32 63>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, ptr %a3
    ret void
}
; Splat element 63 (the last one) of a <64 x i1> mask loaded from %a0 into a
; <4 x i1> predicate, then store select(predicate, %a1, %a2) of <4 x float>
; to %a3. The expected code shifts the mask by 60 and splats 32-bit lane 3.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v64i1_broadcast_63_v4i1(ptr %a0,<4 x float> %a1,<4 x float> %a2,ptr %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $60, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm2, %k1
; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $60, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0, <64 x i1> poison, <4 x i32> <i32 63, i32 63, i32 63, i32 63>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, ptr %a3
    ret void
}
; Splat element 63 (the last one) of a <64 x i1> mask loaded from %a0 into an
; <8 x i1> predicate, then store select(predicate, %a1, %a2) of <8 x float>
; to %a3. The expected code loads the mask byte at offset 7 and splats lane 7.
; The splat differs per run configuration: with fast-variable-crosslane-shuffle
; it is one vpermd using an all-7 index vector; with only
; fast-variable-perlane-shuffle it is vpshufd followed by vpermq.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v64i1_broadcast_63_v8i1(ptr %a0,<8 x float> %a1,<8 x float> %a2,ptr %a3) {
; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v8i1:
; AVX512-FAST:       # %bb.0:
; AVX512-FAST-NEXT:    kmovb 7(%rdi), %k0
; AVX512-FAST-NEXT:    vpmovm2d %k0, %ymm2
; AVX512-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512-FAST-NEXT:    vpermd %ymm2, %ymm3, %ymm2
; AVX512-FAST-NEXT:    vpmovd2m %ymm2, %k1
; AVX512-FAST-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512-FAST-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512-FAST-NEXT:    vzeroupper
; AVX512-FAST-NEXT:    retq
;
; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1:
; AVX512-FAST-PERLANE:       # %bb.0:
; AVX512-FAST-PERLANE-NEXT:    kmovb 7(%rdi), %k0
; AVX512-FAST-PERLANE-NEXT:    vpmovm2d %k0, %ymm2
; AVX512-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
; AVX512-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX512-FAST-PERLANE-NEXT:    vpmovd2m %ymm2, %k1
; AVX512-FAST-PERLANE-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512-FAST-PERLANE-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512-FAST-PERLANE-NEXT:    vzeroupper
; AVX512-FAST-PERLANE-NEXT:    retq
;
; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v8i1:
; AVX512NOTDQ-FAST:       # %bb.0:
; AVX512NOTDQ-FAST-NEXT:    movzbl 7(%rdi), %eax
; AVX512NOTDQ-FAST-NEXT:    kmovd %eax, %k1
; AVX512NOTDQ-FAST-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-FAST-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-FAST-NEXT:    vpermd %ymm2, %ymm3, %ymm2
; AVX512NOTDQ-FAST-NEXT:    vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-FAST-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-FAST-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-FAST-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-NEXT:    retq
;
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1:
; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT:    movzbl 7(%rdi), %eax
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovd %eax, %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-PERLANE-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0, <64 x i1> poison, <8 x i32> <i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, ptr %a3
    ret void
}
; Splat element 63 (the last one) of a <64 x i1> mask loaded from %a0 into a
; <16 x i1> predicate, then store select(predicate, %a1, %a2) of <16 x float>
; to %a3. The expected code loads a 16-bit mask from byte offset 6 and splats
; lane 15.
; The splat differs per run configuration: with fast-variable-crosslane-shuffle
; it is one vpermd using an all-15 index vector; with only
; fast-variable-perlane-shuffle it is vpshufd followed by vshufi64x2.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v64i1_broadcast_63_v16i1(ptr %a0,<16 x float> %a1,<16 x float> %a2,ptr %a3) {
; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512-FAST:       # %bb.0:
; AVX512-FAST-NEXT:    kmovw 6(%rdi), %k0
; AVX512-FAST-NEXT:    vpmovm2d %k0, %zmm2
; AVX512-FAST-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-FAST-NEXT:    vpermd %zmm2, %zmm3, %zmm2
; AVX512-FAST-NEXT:    vpmovd2m %zmm2, %k1
; AVX512-FAST-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512-FAST-NEXT:    vmovaps %zmm1, (%rsi)
; AVX512-FAST-NEXT:    vzeroupper
; AVX512-FAST-NEXT:    retq
;
; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512-FAST-PERLANE:       # %bb.0:
; AVX512-FAST-PERLANE-NEXT:    kmovw 6(%rdi), %k0
; AVX512-FAST-PERLANE-NEXT:    vpmovm2d %k0, %zmm2
; AVX512-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[6,7,6,7,6,7,6,7]
; AVX512-FAST-PERLANE-NEXT:    vpmovd2m %zmm2, %k1
; AVX512-FAST-PERLANE-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512-FAST-PERLANE-NEXT:    vmovaps %zmm1, (%rsi)
; AVX512-FAST-PERLANE-NEXT:    vzeroupper
; AVX512-FAST-PERLANE-NEXT:    retq
;
; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512NOTDQ-FAST:       # %bb.0:
; AVX512NOTDQ-FAST-NEXT:    kmovw 6(%rdi), %k1
; AVX512NOTDQ-FAST-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-FAST-NEXT:    vpermd %zmm2, %zmm3, %zmm2
; AVX512NOTDQ-FAST-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512NOTDQ-FAST-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512NOTDQ-FAST-NEXT:    vmovaps %zmm1, (%rsi)
; AVX512NOTDQ-FAST-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-NEXT:    retq
;
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovw 6(%rdi), %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[6,7,6,7,6,7,6,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovaps %zmm1, (%rsi)
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-PERLANE-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0, <64 x i1> poison, <16 x i32> <i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
    store <16 x float> %d2, ptr %a3
    ret void
}
; Extract element 1 of a <2 x i1> loaded from %a0 as a <1 x i1> and store it
; to %a1. The expected code shifts the mask right by 1, then uses a
; shift-left/shift-right pair to keep only bit 0 before storing one byte.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v2i1_broadcast_1_v1i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $1, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v2i1_broadcast_1_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $1, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <2 x i1>, ptr %a0
    %d1 = shufflevector <2 x i1> %d0, <2 x i1> poison, <1 x i32> <i32 1>
    store <1 x i1> %d1, ptr %a1
    ret void
}
; Extract element 1 of a <3 x i1> (non-power-of-two mask) loaded from %a0 as
; a <1 x i1> and store it to %a1. The expected code tests the bit in a
; general-purpose register, materializes 0 or 255 with a conditional move,
; and only then moves the value into a mask register for the shifts.
; The shuffle mask only indexes the first operand, so the placeholder second
; operand is written as poison instead of undef.
define void @load_v3i1_broadcast_1_v1i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movzbl (%rdi), %eax
; AVX512-NEXT:    shrb %al
; AVX512-NEXT:    xorl %ecx, %ecx
; AVX512-NEXT:    testb $1, %al
; AVX512-NEXT:    movl $255, %eax
; AVX512-NEXT:    cmovel %ecx, %eax
; AVX512-NEXT:    kmovd %eax, %k0
; AVX512-NEXT:    kshiftrb $1, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v3i1_broadcast_1_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    shrb %al
; AVX512NOTDQ-NEXT:    xorl %ecx, %ecx
; AVX512NOTDQ-NEXT:    testb $1, %al
; AVX512NOTDQ-NEXT:    movl $255, %eax
; AVX512NOTDQ-NEXT:    cmovel %ecx, %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $1, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <3 x i1>, ptr %a0
    %d1 = shufflevector <3 x i1> %d0, <3 x i1> poison, <1 x i32> <i32 1>
    store <1 x i1> %d1, ptr %a1
    ret void
}
; Loads a <3 x i1> mask from %a0, selects element 2 as a <1 x i1>, and stores
; it to %a1. The expected code tests bit 2 directly in memory (testb $4) and
; uses cmov to pick 0 or 255 before moving the value into a k-register.
define void @load_v3i1_broadcast_2_v1i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    testb $4, (%rdi)
; AVX512-NEXT:    movl $255, %ecx
; AVX512-NEXT:    cmovel %eax, %ecx
; AVX512-NEXT:    kmovd %ecx, %k0
; AVX512-NEXT:    kshiftrb $2, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    xorl %eax, %eax
; AVX512NOTDQ-NEXT:    testb $4, (%rdi)
; AVX512NOTDQ-NEXT:    movl $255, %ecx
; AVX512NOTDQ-NEXT:    cmovel %eax, %ecx
; AVX512NOTDQ-NEXT:    kmovd %ecx, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $2, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <3 x i1>, ptr %a0
    %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 2>
    store <1 x i1> %d1, ptr %a1
    ret void
}
; Loads a <4 x i1> mask from %a0, selects element 2 as a <1 x i1>, and stores
; it to %a1. The expected code shifts the mask right by 2 and then isolates
; bit 0 with a shift-left/shift-right pair in a k-register.
define void @load_v4i1_broadcast_2_v1i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $2, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v4i1_broadcast_2_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $2, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <4 x i1>, ptr %a0
    %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 2>
    store <1 x i1> %d1, ptr %a1
    ret void
}
; Loads a <4 x i1> mask from %a0, selects element 3 (the last one) as a
; <1 x i1>, and stores it to %a1. The expected code shifts the mask right by 3
; and then isolates bit 0 with a shift-left/shift-right pair in a k-register.
define void @load_v4i1_broadcast_3_v1i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $3, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v4i1_broadcast_3_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $3, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <4 x i1>, ptr %a0
    %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 3>
    store <1 x i1> %d1, ptr %a1
    ret void
}
; Loads a <8 x i1> mask from %a0, selects element 4 as a <1 x i1>, and stores
; it to %a1. The expected code shifts the mask right by 4 and then isolates
; bit 0 with a shift-left/shift-right pair in a k-register.
define void @load_v8i1_broadcast_4_v1i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $4, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $4, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, ptr %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 4>
    store <1 x i1> %d1, ptr %a1
    ret void
}
; Loads a <8 x i1> mask from %a0, broadcasts element 4 into both lanes of a
; <2 x i1>, and stores the result to %a1. The expected code shifts the mask
; right by 4, expands it to a vector of qwords, broadcasts the low qword, and
; converts the vector back to a mask before storing one byte.
define void @load_v8i1_broadcast_4_v2i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $4, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $4, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, ptr %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
    store <2 x i1> %d1, ptr %a1
    ret void
}
; Loads a <8 x i1> mask from %a0, selects element 7 (the last one) as a
; <1 x i1>, and stores it to %a1. The expected code shifts the mask right by 7
; and then isolates bit 0 with a shift-left/shift-right pair in a k-register.
define void @load_v8i1_broadcast_7_v1i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $7, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, ptr %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 7>
    store <1 x i1> %d1, ptr %a1
    ret void
}
; Loads a <8 x i1> mask from %a0, broadcasts element 7 into both lanes of a
; <2 x i1>, and stores the result to %a1. The expected code shifts the mask
; right by 6 (leaving element 7 in bit 1), expands it to a vector of qwords,
; and replicates the upper qword with vpshufd [2,3,2,3] before converting back
; to a mask.
define void @load_v8i1_broadcast_7_v2i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb (%rdi), %k0
; AVX512-NEXT:    kshiftrb $6, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movzbl (%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $6, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <8 x i1>, ptr %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
    store <2 x i1> %d1, ptr %a1
    ret void
}
; Loads a <16 x i1> mask from %a0, selects element 8 as a <1 x i1>, and stores
; it to %a1. Both run configurations load the mask with kmovw and shift it
; right by 8; only the final bit isolation and store differ (byte-sized
; k-register ops with avx512dq, word-sized ops plus a GPR store without it).
define void @load_v16i1_broadcast_8_v1i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v16i1_broadcast_8_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $8, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, ptr %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 8>
    store <1 x i1> %d1, ptr %a1
    ret void
}
; Loads a <16 x i1> mask from %a0, broadcasts element 8 into both lanes of a
; <2 x i1>, and stores the result to %a1. The expected code shifts the mask
; right by 8, expands it to a vector of qwords, broadcasts the low qword, and
; converts the vector back to a mask before storing one byte.
define void @load_v16i1_broadcast_8_v2i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v16i1_broadcast_8_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $8, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, ptr %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
    store <2 x i1> %d1, ptr %a1
    ret void
}
; Loads a <16 x i1> mask from %a0, broadcasts element 8 into all four lanes of
; a <4 x i1>, and stores the result to %a1. The expected code shifts the mask
; right by 8, expands it to a vector of dwords, broadcasts the low dword, and
; converts the vector back to a mask before storing one byte.
define void @load_v16i1_broadcast_8_v4i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v16i1_broadcast_8_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $8, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, ptr %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
    store <4 x i1> %d1, ptr %a1
    ret void
}
; Loads a <16 x i1> mask from %a0, selects element 15 (the last one) as a
; <1 x i1>, and stores it to %a1. The expected code shifts the mask right by
; 15 and then isolates bit 0 with a shift-left/shift-right pair in a
; k-register.
define void @load_v16i1_broadcast_15_v1i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v16i1_broadcast_15_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $15, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, ptr %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 15>
    store <1 x i1> %d1, ptr %a1
    ret void
}
; Loads a <16 x i1> mask from %a0, broadcasts element 15 into both lanes of a
; <2 x i1>, and stores the result to %a1. The expected code shifts the mask
; right by 14 (leaving element 15 in bit 1), expands it to a vector of qwords,
; and replicates the upper qword with vpshufd [2,3,2,3] before converting back
; to a mask.
define void @load_v16i1_broadcast_15_v2i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v16i1_broadcast_15_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $14, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $14, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, ptr %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
    store <2 x i1> %d1, ptr %a1
    ret void
}
; Loads a <16 x i1> mask from %a0, broadcasts element 15 into all four lanes
; of a <4 x i1>, and stores the result to %a1. The expected code shifts the
; mask right by 12 (leaving element 15 in bit 3), expands it to a vector of
; dwords, and replicates dword 3 with vpshufd [3,3,3,3] before converting back
; to a mask.
define void @load_v16i1_broadcast_15_v4i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v16i1_broadcast_15_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw (%rdi), %k0
; AVX512-NEXT:    kshiftrw $12, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrw $12, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <16 x i1>, ptr %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
    store <4 x i1> %d1, ptr %a1
    ret void
}
; Loads a <32 x i1> mask from %a0, selects element 16 as a <1 x i1>, and
; stores it to %a1. The expected code loads the mask with kmovd, shifts it
; right by 16, and then isolates bit 0 with a shift-left/shift-right pair.
define void @load_v32i1_broadcast_16_v1i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $16, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, ptr %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 16>
    store <1 x i1> %d1, ptr %a1
    ret void
}
; Loads a <32 x i1> mask from %a0, broadcasts element 16 into both lanes of a
; <2 x i1>, and stores the result to %a1. The expected code shifts the mask
; right by 16, expands it to a vector of qwords, broadcasts the low qword, and
; converts the vector back to a mask before storing one byte.
define void @load_v32i1_broadcast_16_v2i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $16, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, ptr %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
    store <2 x i1> %d1, ptr %a1
    ret void
}
; Loads a <32 x i1> mask from %a0, broadcasts element 16 into all four lanes
; of a <4 x i1>, and stores the result to %a1. The expected code shifts the
; mask right by 16, expands it to a vector of dwords, broadcasts the low
; dword, and converts the vector back to a mask before storing one byte.
define void @load_v32i1_broadcast_16_v4i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $16, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, ptr %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
    store <4 x i1> %d1, ptr %a1
    ret void
}
; Loads a <32 x i1> mask from %a0, broadcasts element 16 into all eight lanes
; of a <8 x i1>, and stores the result to %a1. Instead of loading the whole
; mask and shifting, the expected code loads directly from offset 2 (kmovb
; with avx512dq, kmovw without) and broadcasts dword 0 across a ymm register.
define void @load_v32i1_broadcast_16_v8i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 2(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 2(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512NOTDQ-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, ptr %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
    store <8 x i1> %d1, ptr %a1
    ret void
}
; Loads a <32 x i1> mask from %a0, selects element 31 (the last one) as a
; <1 x i1>, and stores it to %a1. The expected code loads the mask with kmovd,
; shifts it right by 31, and then isolates bit 0 with a shift-left/shift-right
; pair.
define void @load_v32i1_broadcast_31_v1i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $31, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $31, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, ptr %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 31>
    store <1 x i1> %d1, ptr %a1
    ret void
}
; Loads a <32 x i1> mask from %a0, broadcasts element 31 into both lanes of a
; <2 x i1>, and stores the result to %a1. The expected code shifts the mask
; right by 30 (leaving element 31 in bit 1), expands it to a vector of qwords,
; and replicates the upper qword with vpshufd [2,3,2,3] before converting back
; to a mask.
define void @load_v32i1_broadcast_31_v2i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $30, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $30, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, ptr %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
    store <2 x i1> %d1, ptr %a1
    ret void
}
; Loads a <32 x i1> mask from %a0, broadcasts element 31 into all four lanes
; of a <4 x i1>, and stores the result to %a1. The expected code shifts the
; mask right by 28 (leaving element 31 in bit 3), expands it to a vector of
; dwords, and replicates dword 3 with vpshufd [3,3,3,3] before converting back
; to a mask.
define void @load_v32i1_broadcast_31_v4i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovd (%rdi), %k0
; AVX512-NEXT:    kshiftrd $28, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrd $28, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <32 x i1>, ptr %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
    store <4 x i1> %d1, ptr %a1
    ret void
}
; Loads a <32 x i1> mask from %a0, broadcasts element 31 into all eight lanes
; of a <8 x i1>, and stores the result to %a1. All four run configurations
; read only the byte at offset 3 and broadcast dword 7 of the expanded ymm
; vector, but the shuffle differs per tuning: the run lines with
; +fast-variable-crosslane-shuffle expect a single vpermd with an all-7 index
; vector, while the perlane-only run lines expect vpshufd followed by vpermq.
define void @load_v32i1_broadcast_31_v8i1_store(ptr %a0,ptr %a1) {
; AVX512-FAST-LABEL: load_v32i1_broadcast_31_v8i1_store:
; AVX512-FAST:       # %bb.0:
; AVX512-FAST-NEXT:    kmovb 3(%rdi), %k0
; AVX512-FAST-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512-FAST-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-FAST-NEXT:    kmovb %k0, (%rsi)
; AVX512-FAST-NEXT:    vzeroupper
; AVX512-FAST-NEXT:    retq
;
; AVX512-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1_store:
; AVX512-FAST-PERLANE:       # %bb.0:
; AVX512-FAST-PERLANE-NEXT:    kmovb 3(%rdi), %k0
; AVX512-FAST-PERLANE-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
; AVX512-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512-FAST-PERLANE-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-FAST-PERLANE-NEXT:    kmovb %k0, (%rsi)
; AVX512-FAST-PERLANE-NEXT:    vzeroupper
; AVX512-FAST-PERLANE-NEXT:    retq
;
; AVX512NOTDQ-FAST-LABEL: load_v32i1_broadcast_31_v8i1_store:
; AVX512NOTDQ-FAST:       # %bb.0:
; AVX512NOTDQ-FAST-NEXT:    movzbl 3(%rdi), %eax
; AVX512NOTDQ-FAST-NEXT:    kmovd %eax, %k1
; AVX512NOTDQ-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512NOTDQ-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-FAST-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-FAST-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-FAST-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-NEXT:    retq
;
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1_store:
; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT:    movzbl 3(%rdi), %eax
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovd %eax, %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-FAST-PERLANE-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-PERLANE-NEXT:    retq
    %d0 = load <32 x i1>, ptr %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
    store <8 x i1> %d1, ptr %a1
    ret void
}
; Loads a <64 x i1> mask from %a0, selects element 32 as a <1 x i1>, and
; stores it to %a1. The expected code loads the mask with kmovq, shifts it
; right by 32, and then isolates bit 0 with a shift-left/shift-right pair.
define void @load_v64i1_broadcast_32_v1i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $32, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 32>
    store <1 x i1> %d1, ptr %a1
    ret void
}
; Loads a <64 x i1> mask from %a0, broadcasts element 32 into both lanes of a
; <2 x i1>, and stores the result to %a1. The expected code shifts the mask
; right by 32, expands it to a vector of qwords, broadcasts the low qword, and
; converts the vector back to a mask before storing one byte.
define void @load_v64i1_broadcast_32_v2i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $32, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
    store <2 x i1> %d1, ptr %a1
    ret void
}
; Loads a <64 x i1> mask from %a0, broadcasts element 32 into all four lanes
; of a <4 x i1>, and stores the result to %a1. The expected code shifts the
; mask right by 32, expands it to a vector of dwords, broadcasts the low
; dword, and converts the vector back to a mask before storing one byte.
define void @load_v64i1_broadcast_32_v4i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $32, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
    store <4 x i1> %d1, ptr %a1
    ret void
}
; Loads a <64 x i1> mask from %a0, broadcasts element 32 into all eight lanes
; of a <8 x i1>, and stores the result to %a1. Instead of loading the whole
; mask and shifting, the expected code loads directly from offset 4 (kmovb
; with avx512dq, kmovw without) and broadcasts dword 0 across a ymm register.
define void @load_v64i1_broadcast_32_v8i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 4(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512NOTDQ-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
    store <8 x i1> %d1, ptr %a1
    ret void
}
; Loads a <64 x i1> mask from %a0, broadcasts element 32 into all sixteen
; lanes of a <16 x i1>, and stores the result to %a1. The expected code loads
; 16 mask bits directly from offset 4, expands them into a zmm vector of
; dwords (vpmovm2d with avx512dq, zero-masked vpternlogd $255 without),
; broadcasts dword 0, and stores the resulting mask with kmovw.
define void @load_v64i1_broadcast_32_v16i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw 4(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %zmm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %zmm0
; AVX512-NEXT:    vpmovd2m %zmm0, %k0
; AVX512-NEXT:    kmovw %k0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %zmm0
; AVX512NOTDQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT:    kmovw %k0, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
    store <16 x i1> %d1, ptr %a1
    ret void
}
; Loads a <64 x i1> mask from %a0, selects element 63 (the last one) as a
; <1 x i1>, and stores it to %a1. The expected code loads the mask with kmovq,
; shifts it right by 63, and then isolates bit 0 with a shift-left/shift-right
; pair.
define void @load_v64i1_broadcast_63_v1i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $63, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $63, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 63>
    store <1 x i1> %d1, ptr %a1
    ret void
}
; Loads a <64 x i1> mask from %a0, broadcasts element 63 into both lanes of a
; <2 x i1>, and stores the result to %a1. The expected code shifts the mask
; right by 62 (leaving element 63 in bit 1), expands it to a vector of qwords,
; and replicates the upper qword with vpshufd [2,3,2,3] before converting back
; to a mask.
define void @load_v64i1_broadcast_63_v2i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $62, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $62, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
    store <2 x i1> %d1, ptr %a1
    ret void
}
; Loads a <64 x i1> mask from %a0, broadcasts element 63 into all four lanes
; of a <4 x i1>, and stores the result to %a1. The expected code shifts the
; mask right by 60 (leaving element 63 in bit 3), expands it to a vector of
; dwords, and replicates dword 3 with vpshufd [3,3,3,3] before converting back
; to a mask.
define void @load_v64i1_broadcast_63_v4i1_store(ptr %a0,ptr %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $60, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $60, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
    store <4 x i1> %d1, ptr %a1
    ret void
}
; Test: load a <64 x i1> mask from %a0, splat its last element (index 63) into
; every lane of an <8 x i1> vector, and store that vector to %a1.
;
; The prefixed check lines inside the function are FileCheck expectations; the
; file header states they are autogenerated by utils/update_llc_test_checks.py,
; so regenerate them with that script instead of editing them by hand.
;
; In every configuration only the last byte of the mask (offset 7 from %rdi)
; is expected to be loaded, expanded to a ymm dword vector, and lane 7 then
; broadcast. The broadcast differs by tuning flag (see the RUN lines):
;   * with +fast-variable-crosslane-shuffle: one vpermd using an all-7 index
;     vector;
;   * with only +fast-variable-perlane-shuffle: an in-lane vpshufd followed by
;     a vpermq that replicates the upper 128-bit half.
; Without +avx512dq the mask<->vector conversions and the byte load/store are
; expected to use movzbl/kmovd, vpcmpeqd/vmovdqa32, vptestmd and movb instead
; of kmovb/vpmovm2d/vpmovd2m.
define void @load_v64i1_broadcast_63_v8i1_store(ptr %a0,ptr %a1) {
; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512-FAST:       # %bb.0:
; AVX512-FAST-NEXT:    kmovb 7(%rdi), %k0
; AVX512-FAST-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512-FAST-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-FAST-NEXT:    kmovb %k0, (%rsi)
; AVX512-FAST-NEXT:    vzeroupper
; AVX512-FAST-NEXT:    retq
;
; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512-FAST-PERLANE:       # %bb.0:
; AVX512-FAST-PERLANE-NEXT:    kmovb 7(%rdi), %k0
; AVX512-FAST-PERLANE-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
; AVX512-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512-FAST-PERLANE-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-FAST-PERLANE-NEXT:    kmovb %k0, (%rsi)
; AVX512-FAST-PERLANE-NEXT:    vzeroupper
; AVX512-FAST-PERLANE-NEXT:    retq
;
; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512NOTDQ-FAST:       # %bb.0:
; AVX512NOTDQ-FAST-NEXT:    movzbl 7(%rdi), %eax
; AVX512NOTDQ-FAST-NEXT:    kmovd %eax, %k1
; AVX512NOTDQ-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512NOTDQ-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-FAST-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-FAST-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-FAST-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-NEXT:    retq
;
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT:    movzbl 7(%rdi), %eax
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovd %eax, %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-FAST-PERLANE-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-PERLANE-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    ; Every shuffle index is 63, so each result lane is a copy of %d0's last element.
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
    store <8 x i1> %d1, ptr %a1
    ret void
}
; Test: load a <64 x i1> mask from %a0, splat its last element (index 63) into
; every lane of a <16 x i1> vector, and store that vector to %a1.
;
; The prefixed check lines inside the function are FileCheck expectations; the
; file header states they are autogenerated by utils/update_llc_test_checks.py,
; so regenerate them with that script instead of editing them by hand.
;
; In every configuration only the last 16 bits of the mask (offset 6 from
; %rdi) are expected to be loaded with kmovw, expanded to a zmm dword vector,
; and lane 15 then broadcast. The broadcast differs by tuning flag (see the
; RUN lines):
;   * with +fast-variable-crosslane-shuffle: one vpermd using an all-15 index
;     vector;
;   * with only +fast-variable-perlane-shuffle: an in-lane vpshufd followed by
;     a vshufi64x2 that replicates the top 128-bit lane.
; Without +avx512dq the mask is expected to be expanded with a zero-masked
; vpternlogd $255 and converted back with vptestmd instead of
; vpmovm2d/vpmovd2m.
define void @load_v64i1_broadcast_63_v16i1_store(ptr %a0,ptr %a1) {
; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512-FAST:       # %bb.0:
; AVX512-FAST-NEXT:    kmovw 6(%rdi), %k0
; AVX512-FAST-NEXT:    vpmovm2d %k0, %zmm0
; AVX512-FAST-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512-FAST-NEXT:    vpmovd2m %zmm0, %k0
; AVX512-FAST-NEXT:    kmovw %k0, (%rsi)
; AVX512-FAST-NEXT:    vzeroupper
; AVX512-FAST-NEXT:    retq
;
; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512-FAST-PERLANE:       # %bb.0:
; AVX512-FAST-PERLANE-NEXT:    kmovw 6(%rdi), %k0
; AVX512-FAST-PERLANE-NEXT:    vpmovm2d %k0, %zmm0
; AVX512-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
; AVX512-FAST-PERLANE-NEXT:    vpmovd2m %zmm0, %k0
; AVX512-FAST-PERLANE-NEXT:    kmovw %k0, (%rsi)
; AVX512-FAST-PERLANE-NEXT:    vzeroupper
; AVX512-FAST-PERLANE-NEXT:    retq
;
; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512NOTDQ-FAST:       # %bb.0:
; AVX512NOTDQ-FAST-NEXT:    kmovw 6(%rdi), %k1
; AVX512NOTDQ-FAST-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-FAST-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512NOTDQ-FAST-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-FAST-NEXT:    kmovw %k0, (%rsi)
; AVX512NOTDQ-FAST-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-NEXT:    retq
;
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512NOTDQ-FAST-PERLANE:       # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovw 6(%rdi), %k1
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-FAST-PERLANE-NEXT:    kmovw %k0, (%rsi)
; AVX512NOTDQ-FAST-PERLANE-NEXT:    vzeroupper
; AVX512NOTDQ-FAST-PERLANE-NEXT:    retq
    %d0 = load <64 x i1>, ptr %a0
    ; Every shuffle index is 63, so each result lane is a copy of %d0's last element.
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
    store <16 x i1> %d1, ptr %a1
    ret void
}