; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle %s -o - | FileCheck --check-prefixes=CHECK,CHECK-FAST %s
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-perlane-shuffle %s -o - | FileCheck --check-prefixes=CHECK,CHECK-FAST-PERLANE %s

; FIXME: All cases here should be fixed by PR34380

; Every function below narrows a wide vector of i16 to a shorter one with a
; single shufflevector that picks arbitrary source lanes. Three flavours are
; used for each shuffle mask:
;   test_*          - returns the shuffle result directly.
;   test_masked_*   - per lane, keeps the shuffle result where %mask is zero
;                     and takes the lane from %vec2 otherwise.
;   test_masked_z_* - per lane, keeps the shuffle result where %mask is zero
;                     and yields zero otherwise.
; The *_mem_* variants load the source vector through %vp instead of receiving
; it as an argument.

; ---- <16 x i16> -> <8 x i16>, source passed as an argument ----

define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,6,12,4,7,9,14,8]
; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,6,12,4,7,9,14,8]
; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,6,12,4,7,9,14,8]
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,12,9,4,14,15,12,14]
; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,12,9,4,14,15,12,14]
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,11,14,10,7,1,6,9]
; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,11,14,10,7,1,6,9]
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [14,15,7,13,4,12,8,0]
; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [14,15,7,13,4,12,8,0]
; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,7,13,4,12,8,0]
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

; ---- <16 x i16> -> <8 x i16>, source loaded through %vp ----

define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9]
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,7,13,3,5,13,3,9]
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9]
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,15,12,7,1,5,8,14]
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14]
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1]
; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm3
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1]
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2]
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm1, %xmm0
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [9,7,9,6,9,4,3,2]
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm3
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2]
; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    retq
  %vec = load <16 x i16>, ptr %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

; ---- <32 x i16> -> <16 x i16>, source passed as an argument ----

define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
; CHECK-NEXT:    vpermi2w %ymm0, %ymm2, %ymm1
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
; CHECK-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    retq
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm4
; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vpermt2w %ymm2, %ymm3, %ymm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

; ---- <32 x i16> -> <8 x i16>, source passed as an argument ----

define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpermt2w %ymm0, %ymm2, %ymm1
; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; CHECK-NEXT:    vpermt2w %ymm0, %ymm3, %ymm4
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermt2w %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; CHECK-NEXT:    vpermt2w %ymm4, %ymm3, %ymm0
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; CHECK-NEXT:    vpermt2w %ymm4, %ymm3, %ymm0
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT:    vpermt2w %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
  ret <8 x i16> %res
}
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; CHECK-NEXT:    vpermt2w %ymm4, %ymm3, %ymm0
; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
  ret <8 x i16> %res
}

define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30]
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT:    vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}

; ---- <32 x i16> -> <16 x i16>, source loaded through %vp ----

define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
; CHECK-NEXT:    vpermi2w (%rdi), %ymm2, %ymm3
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(ptr %vp) {
; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
  ret <16 x i16> %res
}
define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}

define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-NEXT:    retq
  %vec = load <32 x i16>, ptr %vp
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}

; ---- <32 x i16> -> <8 x i16>, source loaded through %vp ----

define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1
= [16,17,5,1,14,14,13,17] ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0 ; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1> ret <8 x i16> %res } define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17] ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 ret <8 x i16> %res } define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17] ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer ret <8 x i16> %res } define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; 
CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 ret <8 x i16> %res } define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer ret <8 x i16> %res } define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10] ; CHECK-NEXT: vmovdqa (%rdi), %ymm3 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 
10> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 ret <8 x i16> %res } define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10] ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer ret <8 x i16> %res } define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9] ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9> ret <8 x i16> %res } define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9] ; CHECK-NEXT: vmovdqa (%rdi), %ymm3 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 
19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 ret <8 x i16> %res } define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9] ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer ret <8 x i16> %res } define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) { ; CHECK-LABEL: test_16xi16_to_8xi16_E84C94EF: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [14,8,4,12,9,4,14,15] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15> ret <8 x i16> %res } define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,3,2] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2> ret <4 x i32> %res } define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: 
test_masked_8xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,0,3,2] ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,3,2] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,7,3] ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,7,3] ; 
CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1] ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [5,3,2,5] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; 
CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5> ret <4 x i32> %res } define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,3,2,5] ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,3,2,5] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 16(%rdi), %xmm0 ; CHECK-NEXT: vshufps $7, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[3,1],mem[0,0] ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0> ret <4 x i32> %res } define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: 
test_masked_8xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 ; CHECK-NEXT: vshufps $7, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[3,1],mem[0,0] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 ; CHECK-NEXT: vshufps $7, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[3,1],mem[0,0] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,0,0,3] ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: 
test_masked_z_8xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,0,0,3] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,7,0] ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,7,0] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: 
test_8xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm1 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [5,1,2,7] ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7> ret <4 x i32> %res } define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,2,7] ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,1,2,7] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,13,11,14,7,10,1,6] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed 
$zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6> ret <8 x i32> %res } define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,13,11,14,7,10,1,6] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ret <8 x i32> %res } define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,13,11,14,7,10,1,6] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,15,3,2,3,6,8] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, 
i32 6, i32 8> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ret <8 x i32> %res } define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [2,15,15,2,6,10,14,7] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ret <8 x i32> %res } define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7> %cmp = 
icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3> ret <8 x i32> %res } define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [14,5,7,7,10,3,9,3] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ret <8 x i32> %res } define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: 
test_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12> ret <4 x i32> %res } define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,3,4] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; 
CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,1,3,4] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,13,0] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,13,0] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, 
%zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,0,13] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13> ret <4 x i32> %res } define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,0,13] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,0,13] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13> %cmp = icmp eq <4 x i32> %mask, zeroinitializer 
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4] ; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12> ret <8 x i32> %res } define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd 32(%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ret <8 x i32> %res } define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; 
CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15] ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ret <8 x i32> %res } define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10] ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ret <8 x i32> %res } define <8 x i32> 
@test_masked_z_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12] ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12> ret <8 x i32> %res } define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12] ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 ret <8 x i32> %res } define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: 
test_masked_z_16xi32_to_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [13,0,0,6] ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6> ret <4 x i32> %res } define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6] ; CHECK-NEXT: vmovdqa (%rdi), %ymm3 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # 
%bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6] ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <15,5,3,2,u,u,u,u> ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <15,5,3,2,u,u,u,u> ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> 
@test_masked_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9] ; CHECK-NEXT: vmovdqa (%rdi), %ymm3 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9] ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,4,3,6] ; CHECK-NEXT: vpermi2d (%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2> ret <4 x i32> %res } define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # 
%bb.0: ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,3,6] ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 ret <4 x i32> %res } define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,4,3,6] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) { ; CHECK-FAST-LABEL: test_16xi32_to_4xi32_perm_mask9: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [12,9,4,10] ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-FAST-NEXT: vzeroupper ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_16xi32_to_4xi32_perm_mask9: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <4,1,u,2> ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; CHECK-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,3] ; CHECK-FAST-PERLANE-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 ; 
CHECK-FAST-PERLANE-NEXT: vzeroupper ; CHECK-FAST-PERLANE-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10> ret <4 x i32> %res } define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) { ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3] ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0> ret <2 x i64> %res } define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3] ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 ret <2 x i64> %res } define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3] ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer ret <2 x i64> %res } define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; CHECK-NEXT: vptestnmq 
%xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 ret <2 x i64> %res } define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3] ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer ret <2 x i64> %res } define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm0 ; CHECK-NEXT: vunpckhpd 16(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[1] ; CHECK-NEXT: retq %vec = load <4 x i64>, ptr %vp %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3> ret <2 x i64> %res } define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm2 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpunpckhqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[1] ; CHECK-NEXT: retq %vec = load <4 x i64>, ptr %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 ret <2 x i64> %res } define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) 
{ ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %xmm1 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpunpckhqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[1] ; CHECK-NEXT: retq %vec = load <4 x i64>, ptr %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer ret <2 x i64> %res } define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 ; CHECK-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3] ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <4 x i64>, ptr %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 ret <2 x i64> %res } define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 ; CHECK-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3] ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <4 x i64>, ptr %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer ret <2 x i64> %res } define <4 x i64> @test_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) { ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: 
vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1] ; CHECK-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5> ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1] ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,4,6,1] ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask1: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,1] ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,1] ; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,3,6,3] ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: 
test_masked_8xi64_to_4xi64_perm_mask2: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3] ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,3,6,3] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3] ; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) { ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask3: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [6,0,0,7] ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-FAST-NEXT: # kill: def $ymm0 
killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask3: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; CHECK-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,3] ; CHECK-FAST-PERLANE-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7> ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask3: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,0,0,7] ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask3: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,0,3] ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; 
CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,0,3] ; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask4: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,7,7,5] ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask4: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,3,1] ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,7,5] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 
; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,3,1] ; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,1,0,6] ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,1,0,6] ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x 
i64> %res } define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) { ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask6: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,3] ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask6: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,1,3] ; CHECK-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; CHECK-FAST-PERLANE-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3> ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask6: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [7,6,5,3] ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask6: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,2,1,3] ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-PERLANE-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: 
test_masked_z_8xi64_to_4xi64_perm_mask6: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,3] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3] ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} ; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4] ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [2,0,3,4] ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermt2q %ymm2, %ymm3, %ymm0 {%k1} {z} 
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) { ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0> ret <2 x i64> %res } define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; CHECK-NEXT: valignq {{.*#+}} xmm1 {%k1} = xmm3[1],xmm0[0] ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 ret <2 x i64> %res } define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: valignq {{.*#+}} xmm0 {%k1} {z} = xmm2[1],xmm0[0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer ret <2 x i64> %res } define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x 
i64> %vec2, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 ret <2 x i64> %res } define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3] ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer ret <2 x i64> %res } define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermpd $136, (%rdi), %ymm0 # ymm0 = mem[0,2,0,2] ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq $136, (%rdi), %ymm0 {%k1} # ymm0 {%k1} = mem[0,2,0,2] ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> %cmp = icmp 
eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermq $136, (%rdi), %ymm0 {%k1} {z} # ymm0 {%k1} {z} = mem[0,2,0,2] ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,4] ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-FAST-PERLANE-NEXT: vpblendd $3, (%rdi), %ymm2, %ymm2 # ymm2 = mem[0,1],ymm2[2,3,4,5,6,7] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0] ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: ; CHECK-FAST: # %bb.0: ; 
CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,4] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-FAST-PERLANE-NEXT: vpblendd $3, (%rdi), %ymm1, %ymm1 # ymm1 = mem[0,1],ymm1[2,3,4,5,6,7] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0] ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,5,5,1] ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,0] ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select 
<4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [3,5,5,1] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,0] ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) { ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mem_mask3: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,0,2] ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mem_mask3: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm0, %ymm0 # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,1,3] ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, ptr %vp %res = shufflevector <8 x i64> %vec, 
<8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2> ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [7,0,0,2] ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm2, %ymm2 # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,3] ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm1, %ymm1 # ymm1 = 
mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,3] ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,6,1] ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,6,1] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; 
CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,7,1] ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-FAST-PERLANE-NEXT: vpblendd $192, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1] ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-FAST-PERLANE-NEXT: vpblendd $192, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1] ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 
1> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) { ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,2,3,2] ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2> ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,2,3,2] ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,2,3,2] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %vec2, 
<4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,1,5] ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,2,0,1] ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 ret <4 x i64> %res } define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,1,5] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,2,0,1] ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 
7, i32 7, i32 5, i32 1> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 32(%rdi), %xmm0 ; CHECK-NEXT: vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3] ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1> ret <2 x i64> %res } define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %xmm2 ; CHECK-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3] ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 ret <2 x i64> %res } define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %xmm1 ; CHECK-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3] ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer ret <2 x i64> %res } define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) { ; CHECK-LABEL: 
test_masked_8xi64_to_2xi64_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 ret <2 x i64> %res } define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer ret <2 x i64> %res } define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5> ret <4 x float> %res } define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 ; CHECK-NEXT: vxorps %xmm4, 
%xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 ; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1] ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 ret <4 x float> %res } define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,0] ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 ret <4 x float> %res } define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: 
vmovaps {{.*#+}} xmm2 = [1,3,5,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,2,7,0] ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 ret <4 x float> %res } define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,2,7,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> 
%vec) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,5,2] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2> ret <4 x float> %res } define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,3,5,2] ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 ret <4 x float> %res } define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,5,2] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 ; CHECK-NEXT: 
vmovaps {{.*#+}} xmm0 = [2,6,0,1] ; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5> ret <4 x float> %res } define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,0,1] ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 ret <4 x float> %res } define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,0,1] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = 
[2,7,7,2] ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 ret <4 x float> %res } define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,7,7,2] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm2 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,1,3,7] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 ret <4 x float> %res } define <4 x float> 
@test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm2 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,1,3,7] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm1 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,3,5,3] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3> ret <4 x float> %res } define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm2 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,3] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 ret <4 x float> %res } define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) 
{ ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm2 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [1,3,5,3] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7> ret <8 x float> %res } define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,12,10,8,2,11,7] ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ret <8 x float> %res } define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: 
test_masked_z_16xfloat_to_8xfloat_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [10,12,3,12,4,15,1,14] ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ret <8 x float> %res } define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x 
i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4] ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ret <8 x float> %res } define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [12,14,9,0,12,4,5,8] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8> ret <8 x float> 
%res } define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [12,14,9,0,12,4,5,8] ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ret <8 x float> %res } define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,8,9,10] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10> ret <4 x float> %res } define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x 
float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [4,8,9,10] ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 ret <4 x float> %res } define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [4,8,9,10] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-FAST-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm3 = [8,6,10,6] ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm3, %zmm0 ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm2, %k1 ; CHECK-FAST-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-FAST-NEXT: vzeroupper ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1: ; CHECK-FAST-PERLANE: # %bb.0: ; 
CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = [0,6,2,6] ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm0, %xmm3, %xmm4 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm0, %xmm2, %k1 ; CHECK-FAST-PERLANE-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1} ; CHECK-FAST-PERLANE-NEXT: vzeroupper ; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 ret <4 x float> %res } define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) { ; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [8,6,10,6] ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-FAST-NEXT: vzeroupper ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm3 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,2,6] ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm4, %xmm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z} ; CHECK-FAST-PERLANE-NEXT: vzeroupper ; CHECK-FAST-PERLANE-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> 
%res } define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 ret <4 x float> %res } define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [10,2,11,6] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6> ret <4 x float> %res } define <4 x 
float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [10,2,11,6] ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 ret <4 x float> %res } define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [10,2,11,6] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %ymm1 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4] ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4> ret <8 x float> %res } define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> 
%vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %ymm2 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4] ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ret <8 x float> %res } define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %ymm2 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %ymm2 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8] ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf 
= shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ret <8 x float> %res } define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %ymm2 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-FAST-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1] ; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 ; CHECK-FAST-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-FAST-NEXT: vcmpeqps %ymm2, %ymm1, %k1 ; CHECK-FAST-NEXT: vmovaps %ymm3, %ymm0 {%k1} ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 ; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1] ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps 
%ymm2, %ymm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm0 {%k1} ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ret <8 x float> %res } define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %mask) { ; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqps %ymm3, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-FAST-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 ; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm4, %ymm0, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z} ; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 32(%rdi), %ymm1 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = 
[7,5,3,3,11,4,12,9] ; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1> ret <8 x float> %res } define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9] ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 ret <8 x float> %res } define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; 
CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm1 # ymm1 = mem[3,1,2,3] ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,7,3] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11> ret <4 x float> %res } define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3] ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,7,3] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 ret <4 x float> %res } define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3] ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,7,3] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11> %cmp = fcmp oeq <4 x float> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> 
@test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <0,10,6,15,u,u,u,u>
; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %vec = load <16 x float>, ptr %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
  ret <4 x float> %res
}

; Zero-masked: lanes where %mask compares oeq to zero receive elements <8, 2, 14, 7>
; of the <16 x float> loaded from %vp; all other lanes are zeroed.
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <0,10,6,15,u,u,u,u>
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %vec = load <16 x float>, ptr %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
  ret <4 x float> %res
}

; Merge-masked: lanes where %mask compares oeq to zero receive elements <12, 6, 12, 6>
; of the <16 x float> loaded from %vp; all other lanes keep %vec2.
define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [60129542148,60129542148]
; CHECK-NEXT: # xmm2 = mem[0,0]
; CHECK-NEXT: vmovaps 32(%rdi), %ymm3
; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm3
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %vec = load <16 x float>, ptr %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
  ret <4 x float> %res
}

; Zero-masked: lanes where %mask compares oeq to zero receive elements <12, 6, 12, 6>
; of the <16 x float> loaded from %vp; all other lanes are zeroed.
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [60129542148,60129542148]
; CHECK-NEXT: # xmm2 = mem[0,0]
; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %vec = load <16 x float>, ptr %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
  ret <4 x float> %res
}

; Unmasked: returns elements <3, 3, 15, 9> of the <16 x float> loaded from %vp.
define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,15,9]
; CHECK-NEXT: vmovaps (%rdi), %ymm0
; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %vec = load <16 x float>, ptr %vp
  %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
  ret <4 x float> %res
}

; Merge-masked: lanes where %mask compares oeq to zero receive elements <3, 3, 15, 9>
; of the <16 x float> loaded from %vp; all other lanes keep %vec2.
define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9]
; CHECK-NEXT: vmovaps (%rdi), %ymm3
; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %vec = load <16 x float>, ptr %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
  ret <4 x float> %res
}

; Zero-masked: lanes where %mask compares oeq to zero receive elements <3, 3, 15, 9>
; of the <16 x float> loaded from %vp; all other lanes are zeroed.
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9]
; CHECK-NEXT: vmovaps (%rdi), %ymm1
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %vec = load <16 x float>, ptr %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
  ret <4 x float> %res
}

; Unmasked: returns elements <2, 0> of %vec.
define <2 x double> @test_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec) {
; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
  ret <2 x double> %res
}

; Merge-masked: lanes where %mask compares oeq to zero receive elements <2, 0> of %vec;
; all other lanes keep %vec2.
define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
  ret <2 x double> %res
}

; Zero-masked: lanes where %mask compares oeq to zero receive elements <2, 0> of %vec;
; all other lanes are zeroed.
define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3]
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
  ret <2 x double> %res
}

; Merge-masked: lanes where %mask compares oeq to zero receive elements <1, 3> of %vec;
; all other lanes keep %vec2.
define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
  ret <2 x double> %res
}

; Zero-masked: lanes where %mask compares oeq to zero receive elements <1, 3> of %vec;
; all other lanes are zeroed.
define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,2,3]
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
  ret <2 x double> %res
}

; Unmasked: returns elements <2, 1> of the <4 x double> loaded from %vp.
define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm0
; CHECK-NEXT: vblendps $3, 16(%rdi), %xmm0, %xmm0 # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: retq
  %vec = load <4 x double>, ptr %vp
  %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
  ret <2 x double> %res
}

; Merge-masked: lanes where %mask compares oeq to zero receive elements <2, 1>
; of the <4 x double> loaded from %vp; all other lanes keep %vec2.
define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %xmm2
; CHECK-NEXT: vblendpd $1, 16(%rdi), %xmm2, %xmm2 # xmm2 = mem[0],xmm2[1]
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
  %vec = load <4 x double>, ptr %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
  ret <2 x double> %res
}

; Zero-masked: lanes where %mask compares oeq to zero receive elements <2, 1>
; of the <4 x double> loaded from %vp; all other lanes are zeroed.
define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %xmm1
; CHECK-NEXT: vblendpd $1, 16(%rdi), %xmm1, %xmm1 # xmm1 = mem[0],xmm1[1]
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
  %vec = load <4 x double>, ptr %vp
  %shuf = shufflevector <4
x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
  ret <2 x double> %res
}

; Merge-masked: lanes where %mask compares oeq to zero receive elements <2, 0>
; of the <4 x double> loaded from %vp; all other lanes keep %vec2.
define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd 16(%rdi), %xmm2
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
; CHECK-NEXT: vunpcklpd (%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
; CHECK-NEXT: retq
  %vec = load <4 x double>, ptr %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
  ret <2 x double> %res
}

; Zero-masked: lanes where %mask compares oeq to zero receive elements <2, 0>
; of the <4 x double> loaded from %vp; all other lanes are zeroed.
define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd 16(%rdi), %xmm1
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
; CHECK-NEXT: vunpcklpd (%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
; CHECK-NEXT: retq
  %vec = load <4 x double>, ptr %vp
  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
  ret <2 x double> %res
}

; Unmasked: returns elements <7, 3, 7, 3> of %vec.
define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,3,7,3]
; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7,
i32 3>
  ret <4 x double> %res
}

; Merge-masked: lanes where %mask compares oeq to zero receive elements <7, 3, 7, 3> of %vec;
; all other lanes keep %vec2.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [7,3,7,3]
; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked: lanes where %mask compares oeq to zero receive elements <7, 3, 7, 3> of %vec;
; all other lanes are zeroed.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [7,3,7,3]
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Merge-masked: lanes where %mask compares oeq to zero receive elements <2, 0, 7, 6> of %vec;
; all other lanes keep %vec2.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,0,7,6]
; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
  %cmp = fcmp oeq <4 x double>
%mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked: lanes where %mask compares oeq to zero receive elements <2, 0, 7, 6> of %vec;
; all other lanes are zeroed.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,0,7,6]
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Merge-masked: lanes where %mask compares oeq to zero receive elements <2, 3, 2, 0> of %vec;
; all other lanes keep %vec2.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,0]
; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked: lanes where %mask compares oeq to zero receive elements <2, 3, 2, 0> of %vec;
; all other lanes are zeroed.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,0]
; CHECK-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x
double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Unmasked: returns elements <0, 2, 1, 4> of %vec.
define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,1,4]
; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
  ret <4 x double> %res
}

; Merge-masked: lanes where %mask compares oeq to zero receive elements <0, 2, 1, 4> of %vec;
; all other lanes keep %vec2.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,2,1,4]
; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked: lanes where %mask compares oeq to zero receive elements <0, 2, 1, 4> of %vec;
; all other lanes are zeroed.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,1,4]
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Merge-masked: lanes where %mask compares oeq to zero receive elements <1, 1, 5, 5> of %vec;
; all other lanes keep %vec2. Each RUN configuration has its own assertion block for this test.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double>
%vec2, <4 x double> %mask) {
; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
; CHECK-FAST: # %bb.0:
; CHECK-FAST-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm4 = [1,1,5,5]
; CHECK-FAST-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
; CHECK-FAST-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; CHECK-FAST-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
; CHECK-FAST-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-FAST-NEXT: retq
;
; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
; CHECK-FAST-PERLANE: # %bb.0:
; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
; CHECK-FAST-PERLANE-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked: lanes where %mask compares oeq to zero receive elements <1, 1, 5, 5> of %vec;
; all other lanes are zeroed. Each RUN configuration has its own assertion block for this test.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
; CHECK-FAST: # %bb.0:
; CHECK-FAST-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,1,5,5]
; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; CHECK-FAST-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
; CHECK-FAST-NEXT: vpermt2pd %ymm2, %ymm3, %ymm0 {%k1} {z}
; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-FAST-NEXT: retq
;
; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
; CHECK-FAST-PERLANE: # %bb.0:
; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
; CHECK-FAST-PERLANE-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Merge-masked: lanes where %mask compares oeq to zero receive elements <2, 6, 2, 2> of %vec;
; all other lanes keep %vec2.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,6,2,2]
; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked: lanes where %mask compares oeq to zero receive elements <2, 6, 2, 2> of %vec;
; all other lanes are zeroed.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,6,2,2]
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Unmasked: returns elements <5, 0, 7, 0> of %vec.
; Each RUN configuration has its own assertion block for this test.
define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
; CHECK-FAST: # %bb.0:
; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [5,8,7,8]
; CHECK-FAST-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0
; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-FAST-NEXT: retq
;
; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
; CHECK-FAST-PERLANE: # %bb.0:
; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; CHECK-FAST-PERLANE-NEXT: retq
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
  ret <4 x double> %res
}

; Merge-masked: lanes where %mask compares oeq to zero receive elements <5, 0, 7, 0> of %vec;
; all other lanes keep %vec2. Each RUN configuration has its own assertion block for this test.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
; CHECK-FAST: # %bb.0:
; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [5,8,7,8]
; CHECK-FAST-NEXT: vpermi2pd %zmm0, %zmm0, %zmm3
; CHECK-FAST-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; CHECK-FAST-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
; CHECK-FAST-NEXT: vblendmpd %ymm3, %ymm1, %ymm0 {%k1}
; CHECK-FAST-NEXT: retq
;
; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
; CHECK-FAST-PERLANE: # %bb.0:
; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3
; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
; CHECK-FAST-PERLANE-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked: lanes where %mask compares oeq to zero receive elements <5, 0, 7, 0> of %vec;
; all other lanes are zeroed. Each RUN configuration has its own assertion block for this test.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
; CHECK-FAST: # %bb.0:
; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [5,8,7,8]
; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-FAST-NEXT: vpermt2pd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-FAST-NEXT: retq
;
; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
; CHECK-FAST-PERLANE: # %bb.0:
; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2
; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
; CHECK-FAST-PERLANE-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Merge-masked: lanes where %mask compares oeq to zero receive elements <3, 5, 0, 6> of %vec;
; all other lanes keep %vec2. Each RUN configuration has its own assertion block for this test.
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
; CHECK-FAST: # %bb.0:
; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,5,0,6]
; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0
; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-FAST-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-FAST-NEXT: retq
;
; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
; CHECK-FAST-PERLANE: # %bb.0:
; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3
; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3]
; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],ymm3[1],ymm0[2],ymm3[2]
; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
; CHECK-FAST-PERLANE-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
  ret <4 x double> %res
}

; Zero-masked: lanes where %mask compares oeq to zero receive elements <3, 5, 0, 6> of %vec;
; all other lanes are zeroed. Each RUN configuration has its own assertion block for this test.
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
; CHECK-FAST: # %bb.0:
; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [3,5,0,6]
; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-FAST-NEXT: retq
;
; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
; CHECK-FAST-PERLANE: # %bb.0:
; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2
; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3]
; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm2[1],ymm0[2],ymm2[2]
; CHECK-FAST-PERLANE-NEXT: retq
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Unmasked: returns elements <0, 6> of %vec.
define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
  ret <2 x double> %res
}

; Merge-masked: lanes where %mask compares oeq to zero receive elements <0, 6> of %vec;
; all other lanes keep %vec2.
define <2 x double>
@test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 ; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0] ; CHECK-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6> %cmp = fcmp oeq <2 x double> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2 ret <2 x double> %res } define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6> %cmp = fcmp oeq <2 x double> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer ret <2 x double> %res } define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd {{.*#+}} xmm3 = [3,7] ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7> %cmp = fcmp 
oeq <2 x double> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2 ret <2 x double> %res } define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd {{.*#+}} xmm2 = [3,7] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7> %cmp = fcmp oeq <2 x double> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer ret <2 x double> %res } define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %ymm1 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,7,2] ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2> ret <4 x double> %res } define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %ymm2 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,6,7,2] ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> 
%shuf, <4 x double> %vec2 ret <4 x double> %res } define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %ymm2 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,6,7,2] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,4,2,6] ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 ; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1} ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm2 # ymm2 = mem[0,3,2,3] ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm2, %ymm0 {%k1} ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 ret <4 x double> 
%res } define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,6] ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z} ; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm1 # ymm1 = mem[0,3,2,3] ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 ; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z} ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,2,3,4] ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 ; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1} ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm2 ; CHECK-FAST-PERLANE-NEXT: vperm2f128 $33, 32(%rdi), %ymm2, %ymm3 # ymm3 = ymm2[2,3],mem[0,1] ; 
CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm2[1],ymm3[0],ymm2[3],ymm3[2] ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 ret <4 x double> %res } define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [1,2,3,4] ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-FAST-NEXT: retq ; ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm1 ; CHECK-FAST-PERLANE-NEXT: vperm2f128 $33, 32(%rdi), %ymm1, %ymm2 # ymm2 = ymm1[2,3],mem[0,1] ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 ; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1],ymm2[0],ymm1[3],ymm2[2] ; CHECK-FAST-PERLANE-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %ymm1 ; CHECK-NEXT: vmovapd 
{{.*#+}} ymm0 = [4,2,1,0] ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0> ret <4 x double> %res } define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %ymm2 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [4,2,1,0] ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 ret <4 x double> %res } define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %ymm2 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4,2,1,0] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 ; 
CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,4,1,5] ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 ret <4 x double> %res } define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [2,4,1,5] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %ymm2 ; CHECK-NEXT: vperm2f128 $33, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[2,3],mem[0,1] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 
ret <4 x double> %res } define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %ymm1 ; CHECK-NEXT: vperm2f128 $33, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[2,3],mem[0,1] ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 ; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd 32(%rdi), %ymm1 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [0,2,4,1] ; CHECK-NEXT: vpermi2pd (%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5> ret <4 x double> %res } define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,2,4,1] ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 ret <4 x double> %res } define <4 
x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,1] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask7: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %ymm2 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vunpcklpd 40(%rdi){1to4}, %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 ret <4 x double> %res } define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(ptr %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %ymm1 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 ; CHECK-NEXT: vunpcklpd 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer %res = 
select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %xmm0 ; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[0] ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6> ret <2 x double> %res } define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %xmm2 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 ; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[0] ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6> %cmp = fcmp oeq <2 x double> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2 ret <2 x double> %res } define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %xmm1 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1 ; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[0] ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6> %cmp = fcmp oeq <2 x double> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer ret <2 x double> %res } define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x 
double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 ; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0] ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4> %cmp = fcmp oeq <2 x double> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2 ret <2 x double> %res } define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0] ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1 ; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0] ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4> %cmp = fcmp oeq <2 x double> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer ret <2 x double> %res } ; PR35977 define void @test_zext_v8i8_to_v8i16(ptr %arg, ptr %arg1) { ; CHECK-LABEL: test_zext_v8i8_to_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vpmovzxbw (%rdi), %xmm0 # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, (%rsi) ; CHECK-NEXT: retq %tmp2 = load <8 x i8>, ptr %arg %tmp3 = extractelement <8 x i8> %tmp2, i32 0 %tmp4 = zext i8 %tmp3 to i16 %tmp5 = insertelement <8 x i16> undef, i16 %tmp4, i32 0 %tmp6 = extractelement <8 x i8> %tmp2, i32 1 %tmp7 = zext i8 %tmp6 to i16 %tmp8 = insertelement <8 x i16> %tmp5, i16 %tmp7, i32 1 %tmp9 = extractelement <8 x i8> 
%tmp2, i32 2 %tmp10 = zext i8 %tmp9 to i16 %tmp11 = insertelement <8 x i16> %tmp8, i16 %tmp10, i32 2 %tmp12 = extractelement <8 x i8> %tmp2, i32 3 %tmp13 = zext i8 %tmp12 to i16 %tmp14 = insertelement <8 x i16> %tmp11, i16 %tmp13, i32 3 %tmp15 = extractelement <8 x i8> %tmp2, i32 4 %tmp16 = zext i8 %tmp15 to i16 %tmp17 = insertelement <8 x i16> %tmp14, i16 %tmp16, i32 4 %tmp18 = extractelement <8 x i8> %tmp2, i32 5 %tmp19 = zext i8 %tmp18 to i16 %tmp20 = insertelement <8 x i16> %tmp17, i16 %tmp19, i32 5 %tmp21 = extractelement <8 x i8> %tmp2, i32 6 %tmp22 = zext i8 %tmp21 to i16 %tmp23 = insertelement <8 x i16> %tmp20, i16 %tmp22, i32 6 %tmp24 = extractelement <8 x i8> %tmp2, i32 7 %tmp25 = zext i8 %tmp24 to i16 %tmp26 = insertelement <8 x i16> %tmp23, i16 %tmp25, i32 7 %tmp27 = shl <8 x i16> %tmp26, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> store <8 x i16> %tmp27, ptr %arg1 ret void }