; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X64
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X86

define <8 x half> @broadcastph128(ptr %x) {
; X64-LABEL: broadcastph128:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpbroadcastw (%eax), %xmm0
; X86-NEXT: retl
  %l1 = load half, ptr %x, align 2
  %vec = insertelement <8 x half> undef, half %l1, i32 0
  %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %res
}

define <16 x half> @broadcastph256(ptr %x) {
; X64-LABEL: broadcastph256:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw (%rdi), %ymm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpbroadcastw (%eax), %ymm0
; X86-NEXT: retl
  %l1 = load half, ptr %x, align 2
  %vec = insertelement <16 x half> undef, half %l1, i32 0
  %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer
  ret <16 x half> %res
}

define <32 x half> @broadcastph512(ptr %x) {
; X64-LABEL: broadcastph512:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw (%rdi), %zmm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph512:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpbroadcastw (%eax), %zmm0
; X86-NEXT: retl
  %l1 = load half, ptr %x, align 2
  %vec = insertelement <32 x half> undef, half %l1, i32 0
  %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer
  ret <32 x half> %res
}

define <8 x half> @broadcastph128_scalar(half %x) {
; X64-LABEL: broadcastph128_scalar:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph128_scalar:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
  %vec = insertelement <8 x half> undef, half %x, i32 0
  %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %res
}

define <16 x half> @broadcastph256_scalar(half %x) {
; X64-LABEL: broadcastph256_scalar:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %ymm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph256_scalar:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0
; X86-NEXT: retl
  %vec = insertelement <16 x half> undef, half %x, i32 0
  %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer
  ret <16 x half> %res
}

define <32 x half> @broadcastph512_scalar(half %x) {
; X64-LABEL: broadcastph512_scalar:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph512_scalar:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0
; X86-NEXT: retl
  %vec = insertelement <32 x half> undef, half %x, i32 0
  %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer
  ret <32 x half> %res
}

define <8 x half> @broadcastph128_reg(<8 x half> %x) {
; CHECK-LABEL: broadcastph128_reg:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x half> %x, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %res
}

define <16 x half> @broadcastph256_reg(<16 x half> %x) {
; CHECK-LABEL: broadcastph256_reg:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x half> %x, <16 x half> undef, <16 x i32> zeroinitializer
  ret <16 x half> %res
}

define <32 x half> @broadcastph512_reg(<32 x half> %x) {
; CHECK-LABEL: broadcastph512_reg:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <32 x half> %x, <32 x half> undef, <32 x i32> zeroinitializer
  ret <32 x half> %res
}

define i16 @test1(half %x) {
; X64-LABEL: test1:
; X64: # %bb.0:
; X64-NEXT: vmovw %xmm0, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
; X86-LABEL: test1:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
  %res = bitcast half %x to i16
  ret i16 %res
}

define <8 x i16> @test2(i16 %x) {
; X64-LABEL: test2:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test2:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
  %res = insertelement <8 x i16>undef, i16 %x, i32 0
  ret <8 x i16>%res
}

define <8 x i16> @test4(ptr %x) {
; X64-LABEL: test4:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test4:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpbroadcastw (%eax), %xmm0
; X86-NEXT: retl
  %y = load i16, ptr %x
  %res = insertelement <8 x i16>undef, i16 %y, i32 0
  ret <8 x i16>%res
}

define void @test5(half %x, ptr %y) {
; X64-LABEL: test5:
; X64: # %bb.0:
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: test5:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
  store half %x, ptr %y, align 2
  ret void
}

define half @test7(ptr %x) {
; X64-LABEL: test7:
; X64: # %bb.0:
; X64-NEXT: vmovsh (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test7:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh (%eax), %xmm0
; X86-NEXT: retl
  %y = load i16, ptr %x
  %res = bitcast i16 %y to half
  ret half %res
}

define <8 x i16> @test10(ptr %x) {
; X64-LABEL: test10:
; X64: # %bb.0:
; X64-NEXT: vmovw (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test10:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovw (%eax), %xmm0
; X86-NEXT: retl
  %y = load i16, ptr %x, align 2
  %res = insertelement <8 x i16>zeroinitializer, i16 %y, i32 0
  ret <8 x i16>%res
}

define <16 x i16> @test10b(ptr %x) {
; X64-LABEL: test10b:
; X64: # %bb.0:
; X64-NEXT: vmovw (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test10b:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovw (%eax), %xmm0
; X86-NEXT: retl
  %y = load i16, ptr %x, align 2
  %res = insertelement <16 x i16>zeroinitializer, i16 %y, i32 0
  ret <16 x i16>%res
}

define <32 x i16> @test10c(ptr %x) {
; X64-LABEL: test10c:
; X64: # %bb.0:
; X64-NEXT: vmovw (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test10c:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovw (%eax), %xmm0
; X86-NEXT: retl
  %y = load i16, ptr %x, align 2
  %res = insertelement <32 x i16>zeroinitializer, i16 %y, i32 0
  ret <32 x i16>%res
}

define <8 x half> @test11(ptr %x) {
; X64-LABEL: test11:
; X64: # %bb.0:
; X64-NEXT: vmovsh (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test11:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh (%eax), %xmm0
; X86-NEXT: retl
  %y = load half, ptr %x, align 2
  %res = insertelement <8 x half>zeroinitializer, half %y, i32 0
  ret <8 x half>%res
}

define <16 x half> @test11b(ptr %x) {
; X64-LABEL: test11b:
; X64: # %bb.0:
; X64-NEXT: vmovsh (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test11b:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh (%eax), %xmm0
; X86-NEXT: retl
  %y = load half, ptr %x, align 2
  %res = insertelement <16 x half>zeroinitializer, half %y, i32 0
  ret <16 x half>%res
}

define <32 x half> @test11c(ptr %x) {
; X64-LABEL: test11c:
; X64: # %bb.0:
; X64-NEXT: vmovsh (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test11c:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh (%eax), %xmm0
; X86-NEXT: retl
  %y = load half, ptr %x, align 2
  %res = insertelement <32 x half>zeroinitializer, half %y, i32 0
  ret <32 x half>%res
}

define <8 x half> @test14(half %x) {
; X64-LABEL: test14:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test14:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
  %res = insertelement <8 x half>zeroinitializer, half %x, i32 0
  ret <8 x half>%res
}

define <16 x half> @test14b(half %x) {
; X64-LABEL: test14b:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test14b:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
  %res = insertelement <16 x half>zeroinitializer, half %x, i32 0
  ret <16 x half>%res
}

define <32 x half> @test14c(half %x) {
; X64-LABEL: test14c:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test14c:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
  %res = insertelement <32 x half>zeroinitializer, half %x, i32 0
  ret <32 x half>%res
}

define <8 x i16> @test15(i16 %x) {
; X64-LABEL: test15:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test15:
; X86: # %bb.0:
; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
  %res = insertelement <8 x i16>zeroinitializer, i16 %x, i32 0
  ret <8 x i16>%res
}

define <16 x i16> @test16(i16 %x) {
; X64-LABEL: test16:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test16:
; X86: # %bb.0:
; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
  %res = insertelement <16 x i16>zeroinitializer, i16 %x, i32 0
  ret <16 x i16>%res
}

define <32 x i16> @test17(i16 %x) {
; X64-LABEL: test17:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test17:
; X86: # %bb.0:
; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
  %res = insertelement <32 x i16>zeroinitializer, i16 %x, i32 0
  ret <32 x i16>%res
}

define <8 x i16> @test18(i16 %x) {
; X64-LABEL: test18:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test18:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
  %res = insertelement <8 x i16> undef, i16 %x, i32 0
  ret <8 x i16>%res
}

define <16 x i16> @test19(i16 %x) {
; X64-LABEL: test19:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test19:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0
; X86-NEXT: retl
  %res = insertelement <16 x i16> undef, i16 %x, i32 0
  ret <16 x i16>%res
}

define <32 x i16> @test20(i16 %x) {
; X64-LABEL: test20:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test20:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0
; X86-NEXT: retl
  %res = insertelement <32 x i16> undef, i16 %x, i32 0
  ret <32 x i16>%res
}

@g8f16 = external global <8 x half>
@g8f16u = external global <8 x half>, align 8
@g16f16 = external global <16 x half>
@g16f16u = external global <16 x half>, align 8
@g32f16 = external global <32 x half>
@g32f16u = external global <32 x half>, align 8

define <32 x half> @load32f16(ptr %a) {
; X64-LABEL: load32f16:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %zmm0
; X64-NEXT: retq
;
; X86-LABEL: load32f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovaps (%eax), %zmm0
; X86-NEXT: retl
  %res = load <32 x half>, ptr %a
  ret <32 x half> %res
}

define <32 x half> @load32f16mask(ptr %a, <32 x half> %b, i32 %c) {
; X64-LABEL: load32f16mask:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: load32f16mask:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT: retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, ptr %a
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b
  ret <32 x half> %res
}

define <32 x half> @load32f16maskz(ptr %a, i32 %c) {
; X64-LABEL: load32f16maskz:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: load32f16maskz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, ptr %a
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer
  ret <32 x half> %res
}

define <32 x half> @loadu32f16(ptr %a) {
; X64-LABEL: loadu32f16:
; X64: # %bb.0:
; X64-NEXT: vmovups (%rdi), %zmm0
; X64-NEXT: retq
;
; X86-LABEL: loadu32f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovups (%eax), %zmm0
; X86-NEXT: retl
  %res = load <32 x half>, ptr %a, align 8
  ret <32 x half> %res
}

define <32 x half> @loadu32f16mask(ptr %a, <32 x half> %b, i32 %c) {
; X64-LABEL: loadu32f16mask:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: loadu32f16mask:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT: retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, ptr %a, align 8
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b
  ret <32 x half> %res
}

define <32 x half> @loadu32f16maskz(ptr %a, i32 %c) {
; X64-LABEL: loadu32f16maskz:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: loadu32f16maskz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
  %msk = bitcast i32 %c to <32 x i1>
  %res0 = load <32 x half>, ptr %a, align 8
  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer
  ret <32 x half> %res
}

define void @store32f16(<32 x half> %a) {
; X64-LABEL: store32f16:
; X64: # %bb.0:
; X64-NEXT: movq g32f16@GOTPCREL(%rip), %rax
; X64-NEXT: vmovaps %zmm0, (%rax)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: store32f16:
; X86: # %bb.0:
; X86-NEXT: vmovaps %zmm0, g32f16
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  store <32 x half> %a, ptr @g32f16
  ret void
}

define void @storeu32f16(<32 x half> %a) {
; X64-LABEL: storeu32f16:
; X64: # %bb.0:
; X64-NEXT: movq g32f16u@GOTPCREL(%rip), %rax
; X64-NEXT: vmovups %zmm0, (%rax)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: storeu32f16:
; X86: # %bb.0:
; X86-NEXT: vmovups %zmm0, g32f16u
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  store <32 x half> %a, ptr @g32f16u, align 8
  ret void
}

declare void @llvm.masked.store.v32f16.p0(<32 x half>, ptr, i32, <32 x i1>)
declare <32 x half> @llvm.masked.load.v32f16.p0(ptr, i32, <32 x i1>, <32 x half>)

define void @storeu32f16mask(<32 x i1> %mask, ptr %addr, <32 x half> %val) {
; X64-LABEL: storeu32f16mask:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %ymm0, %ymm0
; X64-NEXT: vpmovb2m %ymm0, %k1
; X64-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: storeu32f16mask:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %ymm0, %ymm0
; X86-NEXT: vpmovb2m %ymm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 %zmm1, (%eax) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  call void @llvm.masked.store.v32f16.p0(<32 x half> %val, ptr %addr, i32 4, <32 x i1>%mask)
  ret void
}

define <32 x half> @maskloadu32f16(ptr %addr, <32 x half> %val, <32 x i1> %mask) {
; X64-LABEL: maskloadu32f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %ymm1, %ymm1
; X64-NEXT: vpmovb2m %ymm1, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: maskloadu32f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %ymm1, %ymm1
; X86-NEXT: vpmovb2m %ymm1, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT: retl
  %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> %val)
  ret <32 x half> %res
}

define <32 x half> @maskuloadu32f16(ptr %addr, <32 x i1> %mask) {
; X64-LABEL: maskuloadu32f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %ymm0, %ymm0
; X64-NEXT: vpmovb2m %ymm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskuloadu32f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %ymm0, %ymm0
; X86-NEXT: vpmovb2m %ymm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
  %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> undef)
  ret <32 x half> %res
}

define <32 x half> @maskzloadu32f16(ptr %addr, <32 x i1> %mask) {
; X64-LABEL: maskzloadu32f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %ymm0, %ymm0
; X64-NEXT: vpmovb2m %ymm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskzloadu32f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %ymm0, %ymm0
; X86-NEXT: vpmovb2m %ymm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
  %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> zeroinitializer)
  ret <32 x half> %res
}

define <32 x half> @movrr32f16(<32 x half> %a, <32 x half> %b) {
; CHECK-LABEL: movrr32f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  ret <32 x half> %b
}

define <32 x half> @movrrk32f16(<32 x half> %a, <32 x half> %b, i32 %msk) {
; X64-LABEL: movrrk32f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: movrrk32f16:
; X86: # %bb.0:
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
  %mask = bitcast i32 %msk to <32 x i1>
  %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> %b
  ret <32 x half> %res
}

define <32 x half> @movrrkz32f16(<32 x half> %a, i32 %msk) {
; X64-LABEL: movrrkz32f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: movrrkz32f16: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} ; X86-NEXT: retl %mask = bitcast i32 %msk to <32 x i1> %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> zeroinitializer ret <32 x half> %res } define <16 x half> @load16f16(ptr %a) { ; X64-LABEL: load16f16: ; X64: # %bb.0: ; X64-NEXT: vmovaps (%rdi), %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: load16f16: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovaps (%eax), %ymm0 ; X86-NEXT: retl %res = load <16 x half>, ptr %a ret <16 x half> %res } define <16 x half> @load16f16mask(ptr %a, <16 x half> %b, i16 %c) { ; X64-LABEL: load16f16mask: ; X64: # %bb.0: ; X64-NEXT: kmovd %esi, %k1 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} ; X64-NEXT: retq ; ; X86-LABEL: load16f16mask: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} ; X86-NEXT: retl %msk = bitcast i16 %c to <16 x i1> %res0 = load <16 x half>, ptr %a %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b ret <16 x half> %res } define <16 x half> @load16f16maskz(ptr %a, i16 %c) { ; X64-LABEL: load16f16maskz: ; X64: # %bb.0: ; X64-NEXT: kmovd %esi, %k1 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: load16f16maskz: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} ; X86-NEXT: retl %msk = bitcast i16 %c to <16 x i1> %res0 = load <16 x half>, ptr %a %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer ret <16 x half> %res } define <16 x half> @loadu16f16(ptr %a) { ; X64-LABEL: loadu16f16: ; X64: # %bb.0: ; X64-NEXT: vmovups (%rdi), %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: loadu16f16: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovups (%eax), %ymm0 ; X86-NEXT: retl %res = load <16 x half>, ptr %a, align 8 ret <16 x half> %res } define <16 x half> @loadu16f16mask(ptr %a, <16 x half> %b, i16 %c) { ; X64-LABEL: loadu16f16mask: ; X64: # %bb.0: ; X64-NEXT: kmovd %esi, %k1 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} ; X64-NEXT: retq ; ; X86-LABEL: loadu16f16mask: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} ; X86-NEXT: retl %msk = bitcast i16 %c to <16 x i1> %res0 = load <16 x half>, ptr %a, align 8 %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b ret <16 x half> %res } define <16 x half> @loadu16f16maskz(ptr %a, i16 %c) { ; X64-LABEL: loadu16f16maskz: ; X64: # %bb.0: ; X64-NEXT: kmovd %esi, %k1 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: loadu16f16maskz: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} ; X86-NEXT: retl %msk = bitcast i16 %c to <16 x i1> %res0 = load <16 x half>, ptr %a, align 8 %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer ret <16 x half> %res } define void @store16f16(<16 x half> %a) { ; X64-LABEL: store16f16: ; X64: # %bb.0: ; X64-NEXT: movq g16f16@GOTPCREL(%rip), %rax ; X64-NEXT: vmovaps %ymm0, (%rax) ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; ; X86-LABEL: store16f16: ; X86: # %bb.0: ; X86-NEXT: vmovaps %ymm0, g16f16 ; X86-NEXT: vzeroupper ; X86-NEXT: retl store <16 x 
half> %a, ptr @g16f16 ret void } define void @storeu16f16(<16 x half> %a) { ; X64-LABEL: storeu16f16: ; X64: # %bb.0: ; X64-NEXT: movq g16f16u@GOTPCREL(%rip), %rax ; X64-NEXT: vmovups %ymm0, (%rax) ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; ; X86-LABEL: storeu16f16: ; X86: # %bb.0: ; X86-NEXT: vmovups %ymm0, g16f16u ; X86-NEXT: vzeroupper ; X86-NEXT: retl store <16 x half> %a, ptr @g16f16u, align 8 ret void } declare void @llvm.masked.store.v16f16.p0(<16 x half>, ptr, i32, <16 x i1>) declare <16 x half> @llvm.masked.load.v16f16.p0(ptr, i32, <16 x i1>, <16 x half>) define void @storeu16f16mask(<16 x i1> %mask, ptr %addr, <16 x half> %val) { ; X64-LABEL: storeu16f16mask: ; X64: # %bb.0: ; X64-NEXT: vpsllw $7, %xmm0, %xmm0 ; X64-NEXT: vpmovb2m %xmm0, %k1 ; X64-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1} ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; ; X86-LABEL: storeu16f16mask: ; X86: # %bb.0: ; X86-NEXT: vpsllw $7, %xmm0, %xmm0 ; X86-NEXT: vpmovb2m %xmm0, %k1 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovdqu16 %ymm1, (%eax) {%k1} ; X86-NEXT: vzeroupper ; X86-NEXT: retl call void @llvm.masked.store.v16f16.p0(<16 x half> %val, ptr %addr, i32 4, <16 x i1>%mask) ret void } define <16 x half> @maskloadu16f16(ptr %addr, <16 x half> %val, <16 x i1> %mask) { ; X64-LABEL: maskloadu16f16: ; X64: # %bb.0: ; X64-NEXT: vpsllw $7, %xmm1, %xmm1 ; X64-NEXT: vpmovb2m %xmm1, %k1 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} ; X64-NEXT: retq ; ; X86-LABEL: maskloadu16f16: ; X86: # %bb.0: ; X86-NEXT: vpsllw $7, %xmm1, %xmm1 ; X86-NEXT: vpmovb2m %xmm1, %k1 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} ; X86-NEXT: retl %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> %val) ret <16 x half> %res } define <16 x half> @maskuloadu16f16(ptr %addr, <16 x i1> %mask) { ; X64-LABEL: maskuloadu16f16: ; X64: # %bb.0: ; X64-NEXT: vpsllw $7, %xmm0, %xmm0 ; X64-NEXT: vpmovb2m %xmm0, %k1 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: maskuloadu16f16: ; X86: # %bb.0: ; X86-NEXT: vpsllw $7, %xmm0, %xmm0 ; X86-NEXT: vpmovb2m %xmm0, %k1 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} ; X86-NEXT: retl %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> undef) ret <16 x half> %res } define <16 x half> @maskzloadu16f16(ptr %addr, <16 x i1> %mask) { ; X64-LABEL: maskzloadu16f16: ; X64: # %bb.0: ; X64-NEXT: vpsllw $7, %xmm0, %xmm0 ; X64-NEXT: vpmovb2m %xmm0, %k1 ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: maskzloadu16f16: ; X86: # %bb.0: ; X86-NEXT: vpsllw $7, %xmm0, %xmm0 ; X86-NEXT: vpmovb2m %xmm0, %k1 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} ; X86-NEXT: retl %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> zeroinitializer) ret <16 x half> %res } define <16 x half> @movrr16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-LABEL: movrr16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} ret <16 x half> %b } define <16 x half> @movrrk16f16(<16 x half> %a, <16 x half> %b, i16 %msk) { ; X64-LABEL: movrrk16f16: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} ; X64-NEXT: retq ; ; X86-LABEL: movrrk16f16: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} ; X86-NEXT: retl %mask = bitcast i16 %msk to 
<16 x i1> %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> %b ret <16 x half> %res } define <16 x half> @movrrkz16f16(<16 x half> %a, i16 %msk) { ; X64-LABEL: movrrkz16f16: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: movrrkz16f16: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} ; X86-NEXT: retl %mask = bitcast i16 %msk to <16 x i1> %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> zeroinitializer ret <16 x half> %res } define <8 x half> @load8f16(ptr %a) { ; X64-LABEL: load8f16: ; X64: # %bb.0: ; X64-NEXT: movaps (%rdi), %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: load8f16: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movaps (%eax), %xmm0 ; X86-NEXT: retl %res = load <8 x half>, ptr %a ret <8 x half> %res } define <8 x half> @load8f16mask(ptr %a, <8 x half> %b, i8 %c) { ; X64-LABEL: load8f16mask: ; X64: # %bb.0: ; X64-NEXT: kmovd %esi, %k1 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} ; X64-NEXT: retq ; ; X86-LABEL: load8f16mask: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} ; X86-NEXT: retl %msk = bitcast i8 %c to <8 x i1> %res0 = load <8 x half>, ptr %a %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b ret <8 x half> %res } define <8 x half> @load8f16maskz(ptr %a, i8 %c) { ; X64-LABEL: load8f16maskz: ; X64: # %bb.0: ; X64-NEXT: kmovd %esi, %k1 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: load8f16maskz: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z} ; X86-NEXT: retl %msk = bitcast i8 %c to <8 x i1> %res0 = load <8 x half>, ptr %a %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer ret <8 x half> %res } define <8 x half> @loadu8f16(ptr %a) { ; X64-LABEL: loadu8f16: ; X64: # %bb.0: ; X64-NEXT: movups (%rdi), %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: loadu8f16: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movups (%eax), %xmm0 ; X86-NEXT: retl %res = load <8 x half>, ptr %a, align 8 ret <8 x half> %res } define <8 x half> @loadu8f16mask(ptr %a, <8 x half> %b, i8 %c) { ; X64-LABEL: loadu8f16mask: ; X64: # %bb.0: ; X64-NEXT: kmovd %esi, %k1 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} ; X64-NEXT: retq ; ; X86-LABEL: loadu8f16mask: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} ; X86-NEXT: retl %msk = bitcast i8 %c to <8 x i1> %res0 = load <8 x half>, ptr %a, align 8 %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b ret <8 x half> %res } define <8 x half> @loadu8f16maskz(ptr %a, i8 %c) { ; X64-LABEL: loadu8f16maskz: ; X64: # %bb.0: ; X64-NEXT: kmovd %esi, %k1 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: loadu8f16maskz: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z} ; X86-NEXT: retl %msk = bitcast i8 %c to <8 x i1> %res0 = load <8 x half>, ptr %a, align 8 %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer ret <8 x half> %res } define void @store8f16(<8 x half> %a) { ; X64-LABEL: store8f16: ; X64: # %bb.0: ; X64-NEXT: movq g8f16@GOTPCREL(%rip), %rax ; X64-NEXT: movaps %xmm0, (%rax) ; X64-NEXT: retq ; ; X86-LABEL: store8f16: ; 
X86: # %bb.0: ; X86-NEXT: movaps %xmm0, g8f16 ; X86-NEXT: retl store <8 x half> %a, ptr @g8f16 ret void } define void @storeu8f16(<8 x half> %a) { ; X64-LABEL: storeu8f16: ; X64: # %bb.0: ; X64-NEXT: movq g8f16u@GOTPCREL(%rip), %rax ; X64-NEXT: movups %xmm0, (%rax) ; X64-NEXT: retq ; ; X86-LABEL: storeu8f16: ; X86: # %bb.0: ; X86-NEXT: movups %xmm0, g8f16u ; X86-NEXT: retl store <8 x half> %a, ptr @g8f16u, align 8 ret void } declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32, <8 x i1>) declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32, <8 x i1>, <8 x half>) define void @storeu8f16mask(<8 x i1> %mask, ptr %addr, <8 x half> %val) { ; X64-LABEL: storeu8f16mask: ; X64: # %bb.0: ; X64-NEXT: vpsllw $15, %xmm0, %xmm0 ; X64-NEXT: vpmovw2m %xmm0, %k1 ; X64-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1} ; X64-NEXT: retq ; ; X86-LABEL: storeu8f16mask: ; X86: # %bb.0: ; X86-NEXT: vpsllw $15, %xmm0, %xmm0 ; X86-NEXT: vpmovw2m %xmm0, %k1 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovdqu16 %xmm1, (%eax) {%k1} ; X86-NEXT: retl call void @llvm.masked.store.v8f16.p0(<8 x half> %val, ptr %addr, i32 4, <8 x i1>%mask) ret void } define <8 x half> @maskloadu8f16(ptr %addr, <8 x half> %val, <8 x i1> %mask) { ; X64-LABEL: maskloadu8f16: ; X64: # %bb.0: ; X64-NEXT: vpsllw $15, %xmm1, %xmm1 ; X64-NEXT: vpmovw2m %xmm1, %k1 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} ; X64-NEXT: retq ; ; X86-LABEL: maskloadu8f16: ; X86: # %bb.0: ; X86-NEXT: vpsllw $15, %xmm1, %xmm1 ; X86-NEXT: vpmovw2m %xmm1, %k1 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} ; X86-NEXT: retl %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> %val) ret <8 x half> %res } define <8 x half> @maskuloadu8f16(ptr %addr, <8 x i1> %mask) { ; X64-LABEL: maskuloadu8f16: ; X64: # %bb.0: ; X64-NEXT: vpsllw $15, %xmm0, %xmm0 ; X64-NEXT: vpmovw2m %xmm0, %k1 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: maskuloadu8f16: ; X86: # %bb.0: ; X86-NEXT: vpsllw $15, %xmm0, %xmm0 ; X86-NEXT: vpmovw2m %xmm0, %k1 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z} ; X86-NEXT: retl %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> undef) ret <8 x half> %res } define <8 x half> @maskzloadu8f16(ptr %addr, <8 x i1> %mask) { ; X64-LABEL: maskzloadu8f16: ; X64: # %bb.0: ; X64-NEXT: vpsllw $15, %xmm0, %xmm0 ; X64-NEXT: vpmovw2m %xmm0, %k1 ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: maskzloadu8f16: ; X86: # %bb.0: ; X86-NEXT: vpsllw $15, %xmm0, %xmm0 ; X86-NEXT: vpmovw2m %xmm0, %k1 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z} ; X86-NEXT: retl %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> zeroinitializer) ret <8 x half> %res } define <8 x half> @movrr8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-LABEL: movrr8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} ret <8 x half> %b } define <8 x half> @movrrk8f16(<8 x half> %a, <8 x half> %b, i8 %msk) { ; X64-LABEL: movrrk8f16: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} ; X64-NEXT: retq ; ; X86-LABEL: movrrk8f16: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} ; X86-NEXT: retl %mask = bitcast i8 %msk to <8 x i1> %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> %b ret <8 x half> 
%res } define <8 x half> @movrrkz8f16(<8 x half> %a, i8 %msk) { ; X64-LABEL: movrrkz8f16: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: movrrkz8f16: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} ; X86-NEXT: retl %mask = bitcast i8 %msk to <8 x i1> %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> zeroinitializer ret <8 x half> %res } define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) { ; CHECK-LABEL: movsh: ; CHECK: # %bb.0: ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11] ; CHECK-NEXT: vmovsh %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vaddph %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res1 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5> %res2 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %res = fadd <8 x half> %res1, %res2 ret <8 x half> %res } define i16 @test_movw(half %x) { ; X64-LABEL: test_movw: ; X64: # %bb.0: ; X64-NEXT: vmovw %xmm0, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; ; X86-LABEL: test_movw: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl %res = bitcast half %x to i16 ret i16 %res } define half @test_movw2(i16 %x) { ; X64-LABEL: test_movw2: ; X64: # %bb.0: ; X64-NEXT: vmovw %edi, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_movw2: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: retl %res = bitcast i16 %x to half ret half %res } ; sext avoids having a truncate in front of the bitcast input due to calling ; convention or i16 op promotion. define half @test_movw3(i8 %x) { ; X64-LABEL: test_movw3: ; X64: # %bb.0: ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: vmovw %eax, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_movw3: ; X86: # %bb.0: ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovw %eax, %xmm0 ; X86-NEXT: retl %z = sext i8 %x to i16 %a = bitcast i16 %z to half ret half %a } define half @extract_f16_0(<8 x half> %x) { ; CHECK-LABEL: extract_f16_0: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x half> %x, i32 0 ret half %res } define half @extract_f16_1(<8 x half> %x) { ; CHECK-LABEL: extract_f16_1: ; CHECK: # %bb.0: ; CHECK-NEXT: vpsrld $16, %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x half> %x, i32 1 ret half %res } define half @extract_f16_2(<8 x half> %x) { ; CHECK-LABEL: extract_f16_2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x half> %x, i32 2 ret half %res } define half @extract_f16_3(<8 x half> %x) { ; CHECK-LABEL: extract_f16_3: ; CHECK: # %bb.0: ; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x half> %x, i32 3 ret half %res } define half @extract_f16_4(<8 x half> %x) { ; CHECK-LABEL: extract_f16_4: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x half> %x, i32 4 ret half %res } define half @extract_f16_5(<8 x half> %x) { ; CHECK-LABEL: extract_f16_5: ; CHECK: # %bb.0: ; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x half> %x, i32 5 ret half %res } define half @extract_f16_6(<8 x half> %x) { ; CHECK-LABEL: extract_f16_6: ; CHECK: 
# %bb.0: ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x half> %x, i32 6 ret half %res } define half @extract_f16_7(<8 x half> %x) { ; CHECK-LABEL: extract_f16_7: ; CHECK: # %bb.0: ; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x half> %x, i32 7 ret half %res } define i16 @extract_i16_0(<8 x i16> %x) { ; CHECK-LABEL: extract_i16_0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovw %xmm0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x i16> %x, i32 0 ret i16 %res } define i16 @extract_i16_1(<8 x i16> %x) { ; CHECK-LABEL: extract_i16_1: ; CHECK: # %bb.0: ; CHECK-NEXT: vpextrw $1, %xmm0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x i16> %x, i32 1 ret i16 %res } define i16 @extract_i16_2(<8 x i16> %x) { ; CHECK-LABEL: extract_i16_2: ; CHECK: # %bb.0: ; CHECK-NEXT: vpextrw $2, %xmm0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x i16> %x, i32 2 ret i16 %res } define i16 @extract_i16_3(<8 x i16> %x) { ; CHECK-LABEL: extract_i16_3: ; CHECK: # %bb.0: ; CHECK-NEXT: vpextrw $3, %xmm0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x i16> %x, i32 3 ret i16 %res } define i16 @extract_i16_4(<8 x i16> %x) { ; CHECK-LABEL: extract_i16_4: ; CHECK: # %bb.0: ; CHECK-NEXT: vpextrw $4, %xmm0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x i16> %x, i32 4 ret i16 %res } define i16 @extract_i16_5(<8 x i16> %x) { ; CHECK-LABEL: extract_i16_5: ; CHECK: # %bb.0: ; CHECK-NEXT: vpextrw $5, %xmm0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x i16> %x, i32 5 ret i16 %res } define i16 @extract_i16_6(<8 x i16> %x) { ; CHECK-LABEL: extract_i16_6: ; CHECK: # %bb.0: ; CHECK-NEXT: vpextrw $6, %xmm0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x i16> %x, i32 6 ret i16 %res } define i16 @extract_i16_7(<8 x i16> %x) { ; CHECK-LABEL: extract_i16_7: ; CHECK: # %bb.0: ; CHECK-NEXT: vpextrw $7, %xmm0, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x i16> %x, i32 7 ret i16 %res } define void @extract_store_f16_0(<8 x half> %x, ptr %y) { ; X64-LABEL: extract_store_f16_0: ; X64: # %bb.0: ; X64-NEXT: vmovsh %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_f16_0: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovsh %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x half> %x, i32 0 store half %res, ptr %y ret void } define void @extract_store_f16_1(<8 x half> %x, ptr %y) { ; X64-LABEL: extract_store_f16_1: ; X64: # %bb.0: ; X64-NEXT: vpsrld $16, %xmm0, %xmm0 ; X64-NEXT: vmovsh %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_f16_1: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpsrld $16, %xmm0, %xmm0 ; X86-NEXT: vmovsh %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x half> %x, i32 1 store half %res, ptr %y ret void } define void @extract_store_f16_2(<8 x half> %x, ptr %y) { ; X64-LABEL: extract_store_f16_2: ; X64: # %bb.0: ; X64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; 
X64-NEXT: vmovsh %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_f16_2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X86-NEXT: vmovsh %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x half> %x, i32 2 store half %res, ptr %y ret void } define void @extract_store_f16_3(<8 x half> %x, ptr %y) { ; X64-LABEL: extract_store_f16_3: ; X64: # %bb.0: ; X64-NEXT: vpsrlq $48, %xmm0, %xmm0 ; X64-NEXT: vmovsh %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_f16_3: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpsrlq $48, %xmm0, %xmm0 ; X86-NEXT: vmovsh %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x half> %x, i32 3 store half %res, ptr %y ret void } define void @extract_store_f16_4(<8 x half> %x, ptr %y) { ; X64-LABEL: extract_store_f16_4: ; X64: # %bb.0: ; X64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; X64-NEXT: vmovsh %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_f16_4: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; X86-NEXT: vmovsh %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x half> %x, i32 4 store half %res, ptr %y ret void } define void @extract_store_f16_5(<8 x half> %x, ptr %y) { ; X64-LABEL: extract_store_f16_5: ; X64: # %bb.0: ; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: vmovsh %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_f16_5: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vmovsh %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x half> %x, i32 5 store half %res, ptr %y ret void } define void @extract_store_f16_6(<8 x half> %x, ptr %y) { ; X64-LABEL: extract_store_f16_6: ; X64: # %bb.0: ; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; X64-NEXT: vmovsh %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_f16_6: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; X86-NEXT: vmovsh %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x half> %x, i32 6 store half %res, ptr %y ret void } define void @extract_store_f16_7(<8 x half> %x, ptr %y) { ; X64-LABEL: extract_store_f16_7: ; X64: # %bb.0: ; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: vmovsh %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_f16_7: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vmovsh %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x half> %x, i32 7 store half %res, ptr %y ret void } define void @extract_store_i16_0(<8 x i16> %x, ptr %y) { ; X64-LABEL: extract_store_i16_0: ; X64: # %bb.0: ; X64-NEXT: vpextrw $0, %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_i16_0: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpextrw $0, %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x i16> %x, i32 0 store i16 %res, ptr %y ret void } define void @extract_store_i16_1(<8 x i16> %x, ptr %y) { ; X64-LABEL: extract_store_i16_1: ; X64: # %bb.0: ; X64-NEXT: vpextrw $1, %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_i16_1: ; X86: # %bb.0: ; X86-NEXT: 
movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpextrw $1, %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x i16> %x, i32 1 store i16 %res, ptr %y ret void } define void @extract_store_i16_2(<8 x i16> %x, ptr %y) { ; X64-LABEL: extract_store_i16_2: ; X64: # %bb.0: ; X64-NEXT: vpextrw $2, %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_i16_2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpextrw $2, %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x i16> %x, i32 2 store i16 %res, ptr %y ret void } define void @extract_store_i16_3(<8 x i16> %x, ptr %y) { ; X64-LABEL: extract_store_i16_3: ; X64: # %bb.0: ; X64-NEXT: vpextrw $3, %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_i16_3: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpextrw $3, %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x i16> %x, i32 3 store i16 %res, ptr %y ret void } define void @extract_store_i16_4(<8 x i16> %x, ptr %y) { ; X64-LABEL: extract_store_i16_4: ; X64: # %bb.0: ; X64-NEXT: vpextrw $4, %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_i16_4: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpextrw $4, %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x i16> %x, i32 4 store i16 %res, ptr %y ret void } define void @extract_store_i16_5(<8 x i16> %x, ptr %y) { ; X64-LABEL: extract_store_i16_5: ; X64: # %bb.0: ; X64-NEXT: vpextrw $5, %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_i16_5: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpextrw $5, %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x i16> %x, i32 5 store i16 %res, ptr %y ret void } define void @extract_store_i16_6(<8 x i16> %x, ptr %y) { ; X64-LABEL: extract_store_i16_6: ; X64: # %bb.0: ; X64-NEXT: vpextrw $6, %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_i16_6: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpextrw $6, %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x i16> %x, i32 6 store i16 %res, ptr %y ret void } define void @extract_store_i16_7(<8 x i16> %x, ptr %y) { ; X64-LABEL: extract_store_i16_7: ; X64: # %bb.0: ; X64-NEXT: vpextrw $7, %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_i16_7: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpextrw $7, %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x i16> %x, i32 7 store i16 %res, ptr %y ret void } define i32 @extract_zext_i16_0(<8 x i16> %x) { ; CHECK-LABEL: extract_zext_i16_0: ; CHECK: # %bb.0: ; CHECK-NEXT: vpextrw $0, %xmm0, %eax ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x i16> %x, i32 0 %res2 = zext i16 %res to i32 ret i32 %res2 } define i32 @extract_zext_i16_1(<8 x i16> %x) { ; CHECK-LABEL: extract_zext_i16_1: ; CHECK: # %bb.0: ; CHECK-NEXT: vpextrw $1, %xmm0, %eax ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x i16> %x, i32 1 %res2 = zext i16 %res to i32 ret i32 %res2 } define <8 x half> @build_vector_xxxxuuuu(half %a0, half %a1, half %a2, half %a3) { ; X64-LABEL: build_vector_xxxxuuuu: ; X64: # %bb.0: ; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero ; X64-NEXT: retq ; ; X86-LABEL: build_vector_xxxxuuuu: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: 
vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; X86-NEXT: retl %a = insertelement <8 x half> undef, half %a0, i32 0 %b = insertelement <8 x half> %a, half %a1, i32 1 %c = insertelement <8 x half> %b, half %a2, i32 2 %d = insertelement <8 x half> %c, half %a3, i32 3 ret <8 x half> %d } define <8 x half> @build_vector_uuuuxxxx(half %a0, half %a1, half %a2, half %a3) { ; X64-LABEL: build_vector_uuuuxxxx: ; X64: # %bb.0: ; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-NEXT: vpbroadcastq %xmm0, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: build_vector_uuuuxxxx: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: vpbroadcastq %xmm0, %xmm0 ; X86-NEXT: retl %a = insertelement <8 x half> undef, half %a0, i32 4 %b = insertelement <8 x half> %a, half %a1, i32 5 %c = insertelement <8 x half> %b, half %a2, i32 6 %d = insertelement <8 x half> %c, half %a3, i32 7 ret <8 x half> %d } define <8 x half> @build_vector_xxxxxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) { ; X64-LABEL: build_vector_xxxxxxxx: ; X64: # %bb.0: ; X64-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; X64-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; X64-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] ; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; X64-NEXT: retq ; ; X86-LABEL: build_vector_xxxxxxxx: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm3 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X86-NEXT: retl %a = insertelement <8 x half> undef, half %a0, i32 0 %b = insertelement <8 x half> %a, half %a1, i32 1 %c = insertelement <8 x half> %b, half %a2, i32 2 %d = insertelement <8 x half> %c, half %a3, i32 3 %e = insertelement <8 x half> %d, half %a4, i32 4 %f = insertelement <8 x half> %e, half %a5, i32 5 %g = insertelement <8 x half> %f, half %a6, i32 6 %h = insertelement <8 x half> %g, half %a7, i32 7 ret <8 x half> %h } define <16 x half> @build_vector_xxxxuuuuuuuuxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) { ; X64-LABEL: build_vector_xxxxuuuuuuuuxxxx: ; X64: # %bb.0: ; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero ; X64-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; X64-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X64-NEXT: vpbroadcastq %xmm1, %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: build_vector_xxxxuuuuuuuuxxxx: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm3 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero ; X86-NEXT: vpbroadcastq %xmm0, %xmm0 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl %a = insertelement <16 x half> undef, half %a0, i32 0 %b = insertelement <16 x half> %a, half %a1, i32 1 %c = insertelement <16 x half> %b, half %a2, i32 2 %d = insertelement <16 x half> %c, half %a3, i32 3 %e = insertelement <16 x half> %d, half %a4, i32 12 %f = insertelement <16 x half> %e, half %a5, i32 13 %g = insertelement <16 x half> %f, half %a6, i32 14 %h = insertelement <16 x half> %g, half %a7, i32 15 ret <16 x half> %h } define <8 x half> @regression1(<8 x half> %a, <8 x half> %b) { ; CHECK-LABEL: regression1: ; CHECK: # %bb.0: ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5> ret <8 x half> %res } define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2, <4 x float> %3, ptr %4) { ; X64-LABEL: regression2: ; X64: # %bb.0: ; X64-NEXT: vmovw (%rsi), %xmm0 ; X64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X64-NEXT: vcvtdq2ps 
%xmm0, %xmm0 ; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; X64-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: regression2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovw (%eax), %xmm0 ; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; X86-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 ; X86-NEXT: retl %6 = load i8, ptr %4, align 1 %7 = getelementptr i8, ptr %4, i64 1 %8 = addrspacecast ptr %7 to ptr addrspace(4) %9 = load i8, ptr addrspace(4) %8, align 1 %10 = insertelement <2 x i8> poison, i8 %6, i32 0 %11 = insertelement <2 x i8> %10, i8 %9, i32 1 %12 = uitofp <2 x i8> %11 to <2 x float> %13 = shufflevector <2 x float> %12, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> %14 = shufflevector <4 x float> %13, <4 x float> <float poison, float poison, float 0.000000e+00, float 2.550000e+02>, <4 x i32> <i32 0, i32 1, i32 6, i32 7> %15 = fmul contract <4 x float> %14, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000> ret <4 x float> %15 } ; Make sure load/stores of v4f16 are handled well on 32-bit targets where ; default widening legalization can't use i64. define void @load_store_v4f16(ptr %x, ptr %y, ptr %z) { ; X64-LABEL: load_store_v4f16: ; X64: # %bb.0: ; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: vaddph %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovlps %xmm0, (%rdx) ; X64-NEXT: retq ; ; X86-LABEL: load_store_v4f16: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: vaddph %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovlps %xmm0, (%eax) ; X86-NEXT: retl %a = load <4 x half>, ptr %x %b = load <4 x half>, ptr %y %c = fadd <4 x half> %a, %b store <4 x half> %c, ptr %z ret void } define <8 x half> @test21(half %a, half %b, half %c) nounwind { ; X64-LABEL: test21: ; X64: # %bb.0: ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-NEXT: vmovsh %xmm2, %xmm3, %xmm2 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpbroadcastw %xmm1, %xmm1 ; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; X64-NEXT: retq ; ; X86-LABEL: test21: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-NEXT: vpbroadcastw %xmm1, %xmm1 ; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; X86-NEXT: retl %1 = insertelement <8 x half> <half poison, half poison, half poison, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000>, half %a, i32 0 %2 = insertelement <8 x half> %1, half %b, i32 1 %3 = insertelement <8 x half> %2, half %c, i32 2 ret <8 x half> %3 } define <16 x i16> @test22(ptr %mem) nounwind { ; X64-LABEL: test22: ; X64: # %bb.0: ; X64-NEXT: movzwl 0, 
%eax ; X64-NEXT: andw (%rdi), %ax ; X64-NEXT: vmovw %eax, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: test22: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl 0, %ecx ; X86-NEXT: andw (%eax), %cx ; X86-NEXT: vmovw %ecx, %xmm0 ; X86-NEXT: retl %1 = load i16, ptr null, align 2 %2 = load i16, ptr %mem, align 2 %3 = and i16 %1, %2 %4 = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %3, i32 0 ret <16 x i16> %4 } define void @pr52560(i8 %0, <2 x i16> %1, ptr %c) nounwind { ; X64-LABEL: pr52560: ; X64: # %bb.0: # %entry ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: vmovw %eax, %xmm1 ; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-NEXT: vpcmpgtw %xmm2, %xmm1, %k1 ; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} ; X64-NEXT: vmovw %xmm0, %eax ; X64-NEXT: testw %ax, %ax ; X64-NEXT: je .LBB121_2 ; X64-NEXT: # %bb.1: # %for.body.preheader ; X64-NEXT: movb $0, (%rsi) ; X64-NEXT: .LBB121_2: # %for.end ; X64-NEXT: retq ; ; X86-LABEL: pr52560: ; X86: # %bb.0: # %entry ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovw %eax, %xmm1 ; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-NEXT: vpcmpgtw %xmm2, %xmm1, %k1 ; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} ; X86-NEXT: vmovw %xmm0, %eax ; X86-NEXT: testw %ax, %ax ; X86-NEXT: je .LBB121_2 ; X86-NEXT: # %bb.1: # %for.body.preheader ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movb $0, (%eax) ; X86-NEXT: .LBB121_2: # %for.end ; X86-NEXT: retl entry: %conv = sext i8 %0 to i16 %2 = insertelement <2 x i16> <i16 poison, i16 0>, i16 %conv, i32 0 %3 = icmp sgt <2 x i16> %2, zeroinitializer %4 = select <2 x i1> %3, <2 x i16> %1, <2 x i16> <i16 0, i16 poison> %5 = extractelement <2 x i16> %4, i32 0 %tobool.not14 = icmp eq i16 %5, 0 br i1 %tobool.not14, label %for.end, label %for.body.preheader for.body.preheader: ; preds = %entry store i8 0, ptr %c, align 1 br label %for.end for.end: ; preds = %for.body.preheader, %entry ret void } define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width"="256" "prefer-vector-width"="256" nounwind { ; X64-LABEL: pr52561: ; X64: # %bb.0: ; X64-NEXT: vpbroadcastd {{.*#+}} ymm4 = [112,112,112,112,112,112,112,112] ; X64-NEXT: vpaddd %ymm4, %ymm2, %ymm2 ; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; X64-NEXT: vpaddd %ymm4, %ymm3, %ymm2 ; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-NEXT: vmovsh %xmm0, %xmm2, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: pr52561: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-32, %esp ; X86-NEXT: subl $32, %esp ; X86-NEXT: vpaddd 8(%ebp), %ymm1, %ymm1 ; X86-NEXT: vpbroadcastd {{.*#+}} ymm3 = [112,112,112,112,112,112,112,112] ; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; X86-NEXT: vpaddd %ymm3, %ymm1, %ymm1 ; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 ; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-NEXT: vmovsh %xmm0, %xmm2, %xmm0 ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp ; X86-NEXT: retl %1 = add <16 x i32> %a, <i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112> %2 = add <16 x i32> %1, %b %3 = and <16 x i32> %2, <i32 65535, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 65535> ret <16 
x i32> %3 }
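
; Maintenance note (not part of the test itself): the CHECK/X64/X86 assertions
; above are autogenerated, as stated in the header. After a codegen change they
; are normally refreshed with utils/update_llc_test_checks.py rather than edited
; by hand; the paths below are illustrative placeholders, not real build paths:
;   llvm/utils/update_llc_test_checks.py --llc-binary=<build>/bin/llc <path-to-this-test>.ll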