; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx,+mmx | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx,+mmx | FileCheck %s --check-prefix=X64

define <4 x i64> @A(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: A:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: A:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i64, ptr %ptr, align 8
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
  ret <4 x i64> %vecinit6.i
}

define <4 x i64> @A2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: A2:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    pushl %esi
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl (%ecx), %edx
; X86-NEXT:    movl 4(%ecx), %esi
; X86-NEXT:    vbroadcastsd (%ecx), %ymm0
; X86-NEXT:    movl %edx, (%eax)
; X86-NEXT:    movl %esi, 4(%eax)
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: A2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    movq %rax, (%rsi)
; X64-NEXT:    vmovq %rax, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %q = load i64, ptr %ptr, align 8
  store i64 %q, ptr %ptr2, align 8 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
  ret <4 x i64> %vecinit6.i
}

define <8 x i32> @B(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: B:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: B:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i32, ptr %ptr, align 4
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
  ret <8 x i32> %vecinit6.i
}

define <8 x i32> @B2(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: B2:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: B2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i32, ptr %ptr, align 4
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
  %vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
  %vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
  %vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
  %vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
  ret <8 x i32> %vecinit14.i
}

define <8 x i32> @B3(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: B3:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl (%ecx), %ecx
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    vmovd %ecx, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: B3:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    movl %eax, (%rsi)
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %q = load i32, ptr %ptr, align 4
  store i32 %q, ptr %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
  %vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
  %vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
  %vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
  %vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
  ret <8 x i32> %vecinit14.i
}

define <4 x double> @C(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: C:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: C:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load double, ptr %ptr, align 8
  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
  ret <4 x double> %vecinit6.i
}

define <4 x double> @C2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: C2:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vbroadcastsd (%ecx), %ymm0
; X86-NEXT:    vmovlps %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: C2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    vmovlps %xmm0, (%rsi)
; X64-NEXT:    retq
entry:
  %q = load double, ptr %ptr, align 8
  store double %q, ptr %ptr2, align 8 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
  ret <4 x double> %vecinit6.i
}

define <8 x float> @D(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: D:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: D:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load float, ptr %ptr, align 4
  %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
  ret <8 x float> %vecinit6.i
}

define <8 x float> @D2(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: D2:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: D2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load float, ptr %ptr, align 4
  %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
  %vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
  %vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
  %vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
  %vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
  ret <8 x float> %vecinit14.i
}

define <8 x float> @D3(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: D3:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vbroadcastss (%ecx), %ymm0
; X86-NEXT:    vmovss %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: D3:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    vmovss %xmm0, (%rsi)
; X64-NEXT:    retq
entry:
  %q = load float, ptr %ptr, align 4
  store float %q, ptr %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
  %vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
  %vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
  %vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
  %vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
  ret <8 x float> %vecinit14.i
}

;;;; 128-bit versions

define <4 x float> @e(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: e:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: e:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %q = load float, ptr %ptr, align 4
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  ret <4 x float> %vecinit6.i
}

define <4 x float> @e2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: e2:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vbroadcastss (%ecx), %xmm0
; X86-NEXT:    vmovss %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: e2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    vmovss %xmm0, (%rsi)
; X64-NEXT:    retq
entry:
  %q = load float, ptr %ptr, align 4
  store float %q, ptr %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  ret <4 x float> %vecinit6.i
}

; Don't broadcast constants on pre-AVX2 hardware.
define <4 x float> @_e2(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: _e2:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X86-NEXT:    retl
;
; X64-LABEL: _e2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X64-NEXT:    retq
entry:
  %vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float 0xbf80000000000000, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float 0xbf80000000000000, i32 3
  ret <4 x float> %vecinit6.i
}

define <4 x i32> @F(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: F:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: F:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %q = load i32, ptr %ptr, align 4
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
  ret <4 x i32> %vecinit6.i
}

define <4 x i32> @F2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: F2:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl (%ecx), %ecx
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    vmovd %ecx, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT:    retl
;
; X64-LABEL: F2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    movl %eax, (%rsi)
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
entry:
  %q = load i32, ptr %ptr, align 4
  store i32 %q, ptr %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
  ret <4 x i32> %vecinit6.i
}

; FIXME: Pointer adjusted broadcasts

define <4 x i32> @load_splat_4i32_4i32_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4i32_4i32_1111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_4i32_4i32_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X64-NEXT:    retq
entry:
  %ld = load <4 x i32>, ptr %ptr
  %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %ret
}

define <8 x i32> @load_splat_8i32_4i32_33333333(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_8i32_4i32_33333333:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss 12(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_8i32_4i32_33333333:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 12(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x i32>, ptr %ptr
  %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x i32> %ret
}

define <8 x i32> @load_splat_8i32_8i32_55555555(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_8i32_8i32_55555555:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss 20(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_8i32_8i32_55555555:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 20(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <8 x i32>, ptr %ptr
  %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %ret
}

define <4 x float> @load_splat_4f32_4f32_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4f32_4f32_1111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss 4(%eax), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_4f32_4f32_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 4(%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x float>, ptr %ptr
  %ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x float> %ret
}

define <8 x float> @load_splat_8f32_4f32_33333333(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_8f32_4f32_33333333:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss 12(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_8f32_4f32_33333333:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 12(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x float>, ptr %ptr
  %ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x float> %ret
}

define <8 x float> @load_splat_8f32_8f32_55555555(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_8f32_8f32_55555555:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss 20(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_8f32_8f32_55555555:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 20(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <8 x float>, ptr %ptr
  %ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x float> %ret
}

define <2 x i64> @load_splat_2i64_2i64_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_2i64_2i64_1111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_2i64_2i64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X64-NEXT:    retq
entry:
  %ld = load <2 x i64>, ptr %ptr
  %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %ret
}

define <4 x i64> @load_splat_4i64_2i64_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4i64_2i64_1111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd 8(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_4i64_2i64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <2 x i64>, ptr %ptr
  %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i64> %ret
}

define <4 x i64> @load_splat_4i64_4i64_2222(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4i64_4i64_2222:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd 16(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_4i64_4i64_2222:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x i64>, ptr %ptr
  %ret = shufflevector <4 x i64> %ld, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i64> %ret
}

define <2 x double> @load_splat_2f64_2f64_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_2f64_2f64_1111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_2f64_2f64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
entry:
  %ld = load <2 x double>, ptr %ptr
  %ret = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %ret
}

define <4 x double> @load_splat_4f64_2f64_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4f64_2f64_1111:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd 8(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_4f64_2f64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <2 x double>, ptr %ptr
  %ret = shufflevector <2 x double> %ld, <2 x double> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x double> %ret
}

define <4 x double> @load_splat_4f64_4f64_2222(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4f64_4f64_2222:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd 16(%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: load_splat_4f64_4f64_2222:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x double>, ptr %ptr
  %ret = shufflevector <4 x double> %ld, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  ret <4 x double> %ret
}

; Unsupported vbroadcasts

define <2 x i64> @G(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: G:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: G:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
entry:
  %q = load i64, ptr %ptr, align 8
  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
  ret <2 x i64> %vecinit2.i
}

define <2 x i64> @G2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: G2:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    pushl %esi
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl (%ecx), %edx
; X86-NEXT:    movl 4(%ecx), %esi
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    movl %edx, (%eax)
; X86-NEXT:    movl %esi, 4(%eax)
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: G2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    movq %rax, (%rsi)
; X64-NEXT:    vmovq %rax, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT:    retq
entry:
  %q = load i64, ptr %ptr, align 8
  store i64 %q, ptr %ptr2, align 8 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
  ret <2 x i64> %vecinit2.i
}

define <4 x i32> @H(<4 x i32> %a) {
; X86-LABEL: H:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NEXT:    retl
;
; X64-LABEL: H:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X64-NEXT:    retq
entry:
  %x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %x
}

define <2 x double> @I(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: I:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: I:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
entry:
  %q = load double, ptr %ptr, align 4
  %vecinit.i = insertelement <2 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
  ret <2 x double> %vecinit2.i
}

define <2 x double> @I2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: I2:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    vmovlps %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: I2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    vmovlps %xmm0, (%rsi)
; X64-NEXT:    retq
entry:
  %q = load double, ptr %ptr, align 4
  store double %q, ptr %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <2 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
  ret <2 x double> %vecinit2.i
}

define <4 x float> @_RR(ptr %ptr, ptr %k) nounwind uwtable readnone ssp {
; X86-LABEL: _RR:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vbroadcastss (%ecx), %xmm0
; X86-NEXT:    movl (%eax), %eax
; X86-NEXT:    movl %eax, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: _RR:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    movl (%rsi), %eax
; X64-NEXT:    movl %eax, (%rax)
; X64-NEXT:    retq
entry:
  %q = load float, ptr %ptr, align 4
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  ; force a chain
  %j = load i32, ptr %k, align 4
  store i32 %j, ptr undef
  ret <4 x float> %vecinit6.i
}

define <4 x float> @_RR2(ptr %ptr, ptr %k) nounwind uwtable readnone ssp {
; X86-LABEL: _RR2:
; X86:       ## %bb.0: ## %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: _RR2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %q = load float, ptr %ptr, align 4
  %v = insertelement <4 x float> undef, float %q, i32 0
  %t = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %t
}

; These tests check that a vbroadcast instruction is used when we have a splat
; formed from a concat_vectors (via the shufflevector) of two BUILD_VECTORs
; (via the insertelements).

define <8 x float> @splat_concat1(ptr %p) {
; X86-LABEL: splat_concat1:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: splat_concat1:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load float, ptr %p, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %3 = insertelement <4 x float> %2, float %1, i32 1
  %4 = insertelement <4 x float> %3, float %1, i32 2
  %5 = insertelement <4 x float> %4, float %1, i32 3
  %6 = shufflevector <4 x float> %5, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %6
}

define <8 x float> @splat_concat2(ptr %p) {
; X86-LABEL: splat_concat2:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: splat_concat2:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load float, ptr %p, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %3 = insertelement <4 x float> %2, float %1, i32 1
  %4 = insertelement <4 x float> %3, float %1, i32 2
  %5 = insertelement <4 x float> %4, float %1, i32 3
  %6 = insertelement <4 x float> undef, float %1, i32 0
  %7 = insertelement <4 x float> %6, float %1, i32 1
  %8 = insertelement <4 x float> %7, float %1, i32 2
  %9 = insertelement <4 x float> %8, float %1, i32 3
  %10 = shufflevector <4 x float> %5, <4 x float> %9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %10
}

define <4 x double> @splat_concat3(ptr %p) {
; X86-LABEL: splat_concat3:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: splat_concat3:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, ptr %p, align 8
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = insertelement <2 x double> %2, double %1, i32 1
  %4 = shufflevector <2 x double> %3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %4
}

define <4 x double> @splat_concat4(ptr %p) {
; X86-LABEL: splat_concat4:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: splat_concat4:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, ptr %p, align 8
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = insertelement <2 x double> %2, double %1, i32 1
  %4 = insertelement <2 x double> undef, double %1, i32 0
  %5 = insertelement <2 x double> %2, double %1, i32 1
  %6 = shufflevector <2 x double> %3, <2 x double> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %6
}

; PR34041
define <4 x double> @broadcast_shuffle_1000(ptr %p) {
; X86-LABEL: broadcast_shuffle_1000:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_shuffle_1000:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, ptr %p
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  ret <4 x double> %3
}

define <4 x double> @broadcast_shuffle1032(ptr %p) {
; X86-LABEL: broadcast_shuffle1032:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_shuffle1032:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, ptr %p
  %2 = insertelement <2 x double> undef, double %1, i32 1
  %3 = insertelement <2 x double> undef, double %1, i32 0
  %4 = shufflevector <2 x double> %2, <2 x double> %3, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x double> %4
}

define void @broadcast_v16i32(ptr %a, ptr %b) {
; X86-LABEL: broadcast_v16i32:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vbroadcastss (%ecx), %ymm0
; X86-NEXT:    vmovups %ymm0, 32(%eax)
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v16i32:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    vmovups %ymm0, 32(%rsi)
; X64-NEXT:    vmovups %ymm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %1 = load i32, ptr %a, align 4
  %2 = insertelement <8 x i32> undef, i32 %1, i32 0
  %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> zeroinitializer
  %4 = shufflevector <8 x i32> undef, <8 x i32> %3, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i32> %4, ptr %b, align 4
  ret void
}

;
; Broadcast scale factor for xyz vector - slp will have vectorized xy.
;
define double @broadcast_scale_xyz(ptr nocapture readonly, ptr nocapture readonly) nounwind {
; X86-LABEL: broadcast_scale_xyz:
; X86:       ## %bb.0:
; X86-NEXT:    subl $12, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    vmulpd (%eax), %xmm0, %xmm1
; X86-NEXT:    vmulsd 16(%eax), %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; X86-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    addl $12, %esp
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_scale_xyz:
; X64:       ## %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    vmulpd (%rsi), %xmm0, %xmm1
; X64-NEXT:    vmulsd 16(%rsi), %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; X64-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %3 = load <2 x double>, ptr %1, align 8
  %4 = getelementptr inbounds double, ptr %1, i64 2
  %5 = load double, ptr %4, align 8
  %6 = load double, ptr %0, align 8
  %7 = insertelement <2 x double> undef, double %6, i32 0
  %8 = shufflevector <2 x double> %7, <2 x double> undef, <2 x i32> zeroinitializer
  %9 = fmul <2 x double> %3, %8
  %10 = fmul double %5, %6
  %11 = extractelement <2 x double> %9, i32 0
  %12 = extractelement <2 x double> %9, i32 1
  %13 = fadd double %11, %12
  %14 = fadd double %10, %13
  ret double %14
}

;
; When VBROADCAST replaces an existing load, ensure it still respects lifetime dependencies.
;
define float @broadcast_lifetime() nounwind {
; X86-LABEL: broadcast_lifetime:
; X86:       ## %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $40, %esp
; X86-NEXT:    leal {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl %esi, (%esp)
; X86-NEXT:    calll _gfunc
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT:    movl %esi, (%esp)
; X86-NEXT:    calll _gfunc
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vsubss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    flds {{[0-9]+}}(%esp)
; X86-NEXT:    addl $40, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_lifetime:
; X64:       ## %bb.0:
; X64-NEXT:    subq $40, %rsp
; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT:    callq _gfunc
; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT:    callq _gfunc
; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload
; X64-NEXT:    addq $40, %rsp
; X64-NEXT:    retq
  %1 = alloca <4 x float>, align 16
  %2 = alloca <4 x float>, align 16
  call void @llvm.lifetime.start.p0(i64 16, ptr %1)
  call void @gfunc(ptr %1)
  %3 = load <4 x float>, ptr %1, align 16
  call void @llvm.lifetime.end.p0(i64 16, ptr %1)
  call void @llvm.lifetime.start.p0(i64 16, ptr %2)
  call void @gfunc(ptr %2)
  %4 = load <4 x float>, ptr %2, align 16
  call void @llvm.lifetime.end.p0(i64 16, ptr %2)
  %5 = extractelement <4 x float> %3, i32 1
  %6 = extractelement <4 x float> %4, i32 1
  %7 = fsub float %6, %5
  ret float %7
}

define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind {
; X86-LABEL: broadcast_x86_mmx:
; X86:       ## %bb.0: ## %bb
; X86-NEXT:    subl $12, %esp
; X86-NEXT:    movq %mm0, (%esp)
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    addl $12, %esp
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_x86_mmx:
; X64:       ## %bb.0: ## %bb
; X64-NEXT:    movdq2q %xmm0, %mm0
; X64-NEXT:    movq2dq %mm0, %xmm0
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT:    retq
bb:
  %tmp1 = bitcast x86_mmx %tmp to i64
  %tmp2 = insertelement <2 x i64> undef, i64 %tmp1, i32 0
  %tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16>
  %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i16> %tmp4
}

declare void @gfunc(ptr)
declare void @llvm.lifetime.start.p0(i64, ptr)
declare void @llvm.lifetime.end.p0(i64, ptr)