; Compiler projects using llvm
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL32,AVX2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL32,AVX512
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL32,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX-64
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL64,AVX2-64
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL64,AVX512F-64
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL64,AVX512F-64,AVX512BW-64

;===-----------------------------------------------------------------------------===
;    This test checks the ability to recognize a cross-element pattern of
;    constants and to perform the load by broadcasting a smaller constant
;    vector.
;    For example:
;    <i32 0, i32 1, i32 0, i32 1> => broadcast of the constant vector <i32 0, i32 1>
;===-----------------------------------------------------------------------------===

; Adds a <16 x i8> constant made of the repeating byte pair (0, 1) to %a, then
; ANDs the sum with that same constant.
; The byte pair forms one 16-bit unit (0x0100 = 256). The checks for the AVX2
; and AVX512 configurations expect that unit to be splatted with vpbroadcastw;
; the checks for the plain AVX configurations expect a full 128-bit vmovdqa
; constant load instead.
define <16 x i8> @f16xi8_i16(<16 x i8> %a) {
; AVX-LABEL: f16xi8_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f16xi8_i16:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
; ALL32-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f16xi8_i16:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f16xi8_i16:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
; ALL64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    retq
  %res1 = add <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
  %res2 = and <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
  ret <16 x i8> %res2
}


; Adds a <16 x i8> constant made of the repeating bytes (0, 1, 2, 3) to %a,
; then ANDs the sum with that same constant.
; The four bytes form one 32-bit unit (0x03020100 = 50462976). The plain AVX
; checks expect it to be splatted with vbroadcastss; the AVX2 and AVX512
; checks expect the integer form vpbroadcastd.
define <16 x i8> @f16xi8_i32(<16 x i8> %a) {
; AVX-LABEL: f16xi8_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f16xi8_i32:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; ALL32-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f16xi8_i32:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; AVX-64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f16xi8_i32:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; ALL64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    retq
  %res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
  %res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
  ret <16 x i8> %res2
}


; Adds a <16 x i8> constant made of the bytes 0..7 repeated twice to %a, then
; ANDs the sum with that same constant.
; The eight bytes form one 64-bit unit (0x0706050403020100 =
; 506097522914230528). The plain AVX checks expect it to be duplicated with
; vmovddup; the AVX2 and AVX512 checks expect vpbroadcastq.
define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
; AVX-LABEL: f16xi8_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
; AVX-NEXT:    # xmm1 = mem[0,0]
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f16xi8_i64:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
; ALL32-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f16xi8_i64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
; AVX-64-NEXT:    # xmm1 = mem[0,0]
; AVX-64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f16xi8_i64:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
; ALL64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    retq
  %res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
  %res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
  ret <16 x i8> %res2
}


; Adds a <32 x i8> constant made of the repeating byte pair (0, 1) to %a, then
; ANDs the sum with that same constant.
; The AVX2 and AVX512 checks expect the 16-bit unit (256) to be splatted into
; a ymm register with vpbroadcastw and used for both operations.
; The plain AVX checks expect the byte add to be done on the two 128-bit
; halves (vextractf128 / vpaddb / vinsertf128) with an xmm constant, and the
; AND to take its 256-bit mask directly from a constant-pool memory operand.
define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f32xi8_i16:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; ALL32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f32xi8_i16:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f32xi8_i16:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; ALL64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    retq
  %res1 = add <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
  %res2 = and <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
  ret <32 x i8> %res2
}


; Adds a <32 x i8> constant made of the repeating bytes (0, 1, 2, 3) to %a,
; then ANDs the sum with that same constant.
; The AVX2 and AVX512 checks expect the 32-bit unit (0x03020100 = 50462976) to
; be splatted into a ymm register with vpbroadcastd.
; The plain AVX checks expect an xmm vbroadcastss feeding two 128-bit byte
; adds, followed by an AND whose 256-bit mask comes from a constant-pool
; memory operand.
define <32 x i8> @f32xi8_i32(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [50462976,50462976,50462976,50462976]
; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f32xi8_i32:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; ALL32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f32xi8_i32:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [50462976,50462976,50462976,50462976]
; AVX-64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f32xi8_i32:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; ALL64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    retq
  %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
  %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
  ret <32 x i8> %res2
}


; Adds a <32 x i8> constant made of the bytes 0..7 repeated four times to %a,
; then ANDs the sum with that same constant.
; The AVX2 and AVX512 checks expect the 64-bit unit (0x0706050403020100 =
; 506097522914230528) to be splatted into a ymm register with vpbroadcastq.
; The plain AVX checks expect an xmm vmovddup feeding two 128-bit byte adds,
; followed by an AND whose 256-bit mask comes from a constant-pool memory
; operand.
define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [506097522914230528,506097522914230528]
; AVX-NEXT:    # xmm2 = mem[0,0]
; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f32xi8_i64:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; ALL32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f32xi8_i64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = [506097522914230528,506097522914230528]
; AVX-64-NEXT:    # xmm2 = mem[0,0]
; AVX-64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f32xi8_i64:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; ALL64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    retq
  %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
  %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
  ret <32 x i8> %res2
}


; Adds a <32 x i8> constant made of the bytes 0..15 repeated twice to %a, then
; ANDs the sum with that same constant.
; The repeating unit is a full 128-bit lane. The AVX2 and AVX512 checks expect
; it to be replicated into a ymm register with vbroadcasti128.
; The plain AVX checks expect a 128-bit vmovdqa constant feeding two 128-bit
; byte adds, followed by an AND whose 256-bit mask comes from a constant-pool
; memory operand.
define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i128:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f32xi8_i128:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f32xi8_i128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f32xi8_i128:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    retq
  %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a
  %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1
  ret <32 x i8> %res2
}


define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
; AVX-LABEL: f64xi8_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f64xi8_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retl
;
; AVX512BW-LABEL: f64xi8_i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retl
;
; AVX-64-LABEL: f64xi8_i16:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f64xi8_i16:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; AVX2-64-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512BW-64-LABEL: f64xi8_i16:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; AVX512BW-64-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    retq
  %res1 = add <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a
  %res2 = and <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1
  ret <64 x i8> %res2
}


; Adds a <64 x i8> constant made of the repeating bytes (0, 1, 2, 3) to %a,
; then ANDs the sum with that same constant.
; Expected lowering per configuration, as pinned by the checks below.
;   - plain AVX - one ymm vbroadcastss of the 32-bit unit (0x03020100 =
;     50462976); its low xmm half feeds the four 128-bit byte adds and the
;     full ymm register feeds the two 256-bit ANDs.
;   - AVX2 - one ymm vpbroadcastd, reused for both 256-bit halves.
;   - AVX512BW - one zmm vpbroadcastd, with a single add and a single AND.
; NOTE(review) - the name omits the 'x' used by the sibling f64xi8_* functions.
; It is left unchanged here because it is the symbol the label checks match.
define <64 x i8> @f64i8_i32(<64 x i8> %a) {
; AVX-LABEL: f64i8_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f64i8_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retl
;
; AVX512BW-LABEL: f64i8_i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retl
;
; AVX-64-LABEL: f64i8_i32:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastss {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-64-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-64-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f64i8_i32:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX2-64-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512BW-64-LABEL: f64i8_i32:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX512BW-64-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    retq
  %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a
  %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1
  ret <64 x i8> %res2
}


; Adds a <64 x i8> constant made of the bytes 0..7 repeated eight times to %a,
; then ANDs the sum with that same constant.
; Expected lowering per configuration, as pinned by the checks below.
;   - plain AVX - one ymm vbroadcastsd of the 64-bit unit (0x0706050403020100
;     = 506097522914230528); its low xmm half feeds the four 128-bit byte adds
;     and the full ymm register feeds the two 256-bit ANDs.
;   - AVX2 - one ymm vpbroadcastq, reused for both 256-bit halves.
;   - AVX512BW - one zmm vpbroadcastq, with a single add and a single AND.
define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
; AVX-LABEL: f64xi8_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f64xi8_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retl
;
; AVX512BW-LABEL: f64xi8_i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retl
;
; AVX-64-LABEL: f64xi8_i64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-64-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-64-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f64xi8_i64:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; AVX2-64-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512BW-64-LABEL: f64xi8_i64:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; AVX512BW-64-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    retq
  %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a
  %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1
  ret <64 x i8> %res2
}


; Adds a <64 x i8> constant made of the bytes 0..15 repeated four times to %a,
; then ANDs the sum with that same constant. The repeating unit is a full
; 128-bit lane.
; Expected lowering per configuration, as pinned by the checks below.
;   - plain AVX - one ymm vbroadcastf128; its low xmm half feeds the four
;     128-bit byte adds and the full ymm register feeds the two 256-bit ANDs.
;   - AVX2 - one ymm vbroadcasti128, reused for both 256-bit halves.
;   - AVX512BW - one zmm vbroadcasti32x4, with a single add and a single AND.
define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
; AVX-LABEL: f64xi8_i128:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f64xi8_i128:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retl
;
; AVX512BW-LABEL: f64xi8_i128:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retl
;
; AVX-64-LABEL: f64xi8_i128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-64-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-64-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f64xi8_i128:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-64-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512BW-64-LABEL: f64xi8_i128:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-64-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    retq
  %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a
  %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1
  ret <64 x i8> %res2
}


; Adds a <64 x i8> constant made of the bytes 0..31 repeated twice to %a, then
; ANDs the sum with that same constant. The repeating unit is 256 bits wide.
; Expected lowering per configuration, as pinned by the checks below.
;   - plain AVX - two separate xmm vmovdqa constants (bytes 16..31 for the
;     upper 128-bit halves, bytes 0..15 for the lower ones) feed the byte
;     adds, and a full ymm vmovaps constant feeds the two 256-bit ANDs.
;   - AVX2 - a plain ymm vmovdqa load of the 32-byte unit, with no broadcast
;     instruction, reused for both 256-bit halves.
;   - AVX512BW - one zmm vbroadcasti64x4, with a single add and a single AND.
define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
; AVX-LABEL: f64xi8_i256:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f64xi8_i256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retl
;
; AVX512BW-LABEL: f64xi8_i256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retl
;
; AVX-64-LABEL: f64xi8_i256:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-64-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f64xi8_i256:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX2-64-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512BW-64-LABEL: f64xi8_i256:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX512BW-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-64-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    retq
  %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %a
  %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %res1
  ret <64 x i8> %res2
}


; <8 x i16> constant built by repeating the pair <i16 0, i16 1>, i.e. a 32-bit
; pattern. The same constant is an operand of both the add and the and below.
; Viewed as one 32-bit element the pair has the value 65536 (0x00010000, with
; element 0 in the low half). Expected code materializes the constant once in
; xmm1 with a 32-bit element broadcast (vbroadcastss with +avx only,
; vpbroadcastd with +avx2 or +avx512f) and reuses it for vpaddw and vpand.
define <8 x i16> @f8xi16_i32(<8 x i16> %a) {
; AVX-LABEL: f8xi16_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [65536,65536,65536,65536]
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f8xi16_i32:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
; ALL32-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f8xi16_i32:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [65536,65536,65536,65536]
; AVX-64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f8xi16_i32:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
; ALL64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    retq
  %res1 = add <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
  %res2 = and <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
  ret <8 x i16> %res2
}


; <8 x i16> constant built by repeating <i16 0, i16 1, i16 2, i16 3>, i.e. a
; 64-bit pattern, used by both the add and the and. Viewed as one 64-bit
; element the group has the value 844433520132096 (0x0003000200010000).
; Expected code broadcasts that 64-bit element into xmm1 (vmovddup with +avx
; only, vpbroadcastq with +avx2 or +avx512f) and reuses it for vpaddw and vpand.
define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
; AVX-LABEL: f8xi16_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [844433520132096,844433520132096]
; AVX-NEXT:    # xmm1 = mem[0,0]
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f8xi16_i64:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [844433520132096,844433520132096]
; ALL32-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f8xi16_i64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = [844433520132096,844433520132096]
; AVX-64-NEXT:    # xmm1 = mem[0,0]
; AVX-64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f8xi16_i64:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [844433520132096,844433520132096]
; ALL64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    retq
  %res1 = add <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
  %res2 = and <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
  ret <8 x i16> %res2
}


; <16 x i16> (256-bit) constant built by repeating the 32-bit pair
; <i16 0, i16 1>, used by both the add and the and.
; With +avx only, the checks expect the add to be split into two 128-bit vpaddw
; (vextractf128 / vinsertf128 around them) sharing an xmm constant produced by
; vbroadcastss, while the and is a single 256-bit vandps that takes its constant
; directly from a constant-pool memory operand (the LCPI reference).
; With +avx2 or +avx512f, one vpbroadcastd into ymm1 feeds both vpaddw and vpand.
define <16 x i16> @f16xi16_i32(<16 x i16> %a) {
; AVX-LABEL: f16xi16_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [65536,65536,65536,65536]
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f16xi16_i32:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
; ALL32-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f16xi16_i32:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [65536,65536,65536,65536]
; AVX-64-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f16xi16_i32:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
; ALL64-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    retq
  %res1 = add <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
  %res2 = and <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
  ret <16 x i16> %res2
}


; <16 x i16> (256-bit) constant built by repeating the 64-bit group
; <i16 0, i16 1, i16 2, i16 3>, used by both the add and the and.
; With +avx only, the checks expect the add to be split into two 128-bit vpaddw
; sharing an xmm constant produced by vmovddup, and the and to be a 256-bit
; vandps with a constant-pool memory operand.
; With +avx2 or +avx512f, one vpbroadcastq into ymm1 feeds both vpaddw and vpand.
define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
; AVX-LABEL: f16xi16_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [844433520132096,844433520132096]
; AVX-NEXT:    # xmm2 = mem[0,0]
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f16xi16_i64:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [844433520132096,844433520132096,844433520132096,844433520132096]
; ALL32-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f16xi16_i64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = [844433520132096,844433520132096]
; AVX-64-NEXT:    # xmm2 = mem[0,0]
; AVX-64-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f16xi16_i64:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [844433520132096,844433520132096,844433520132096,844433520132096]
; ALL64-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    retq
  %res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
  %res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
  ret <16 x i16> %res2
}


; <16 x i16> (256-bit) constant built by repeating the eight values i16 0..7,
; i.e. a 128-bit pattern, used by both the add and the and.
; With +avx only, the 128-bit pattern is loaded whole into xmm2 with vmovdqa and
; shared by the two 128-bit vpaddw halves, and the and is a 256-bit vandps with
; a constant-pool memory operand.
; With +avx2 or +avx512f, vbroadcasti128 replicates the 128-bit pattern into
; ymm1, which feeds both vpaddw and vpand.
define <16 x i16> @f16xi16_i128(<16 x i16> %a) {
; AVX-LABEL: f16xi16_i128:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f16xi16_i128:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL32-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f16xi16_i128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
; AVX-64-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f16xi16_i128:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL64-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    retq
  %res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a
  %res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1
  ret <16 x i16> %res2
}


; <32 x i16> (512-bit) constant built by repeating the 32-bit pair
; <i16 0, i16 1>, used by both the add and the and. The value arrives in two
; ymm registers unless 512-bit byte/word ops are available.
; With +avx only, vbroadcastss fills ymm2 once. Its low half (xmm2) is the
; addend for the four 128-bit vpaddw and the full ymm2 is the mask for the two
; 256-bit vandps.
; With +avx2, vpbroadcastd fills ymm2, shared by two vpaddw and two vpand.
; With +avx512f,+avx512bw, vpbroadcastd fills zmm1 for one vpaddw and one vpandq.
; NOTE(review) no check lines here use a prefix belonging to the avx512f-only
; RUN lines (those without +avx512bw), so that configuration looks unverified
; for this function - confirm whether that is intended.
define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
; AVX-LABEL: f32xi16_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f32xi16_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retl
;
; AVX512BW-LABEL: f32xi16_i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536]
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retl
;
; AVX-64-LABEL: f32xi16_i32:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastss {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-64-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-64-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f32xi16_i32:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
; AVX2-64-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512BW-64-LABEL: f32xi16_i32:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536]
; AVX512BW-64-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    retq
  %res1 = add <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
  %res2 = and <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
  ret <32 x i16> %res2
}


; <32 x i16> (512-bit) constant built by repeating the 64-bit group
; <i16 0, i16 1, i16 2, i16 3>, used by both the add and the and.
; With +avx only, vbroadcastsd fills ymm2 once. Its low half (xmm2) is the
; addend for the four 128-bit vpaddw and the full ymm2 is the mask for the two
; 256-bit vandps.
; With +avx2, vpbroadcastq fills ymm2, shared by two vpaddw and two vpand.
; With +avx512f,+avx512bw, vpbroadcastq fills zmm1 for one vpaddw and one vpandq.
; NOTE(review) no check lines here use a prefix belonging to the avx512f-only
; RUN lines (those without +avx512bw), so that configuration looks unverified
; for this function - confirm whether that is intended.
define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
; AVX-LABEL: f32xi16_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096]
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f32xi16_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096]
; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retl
;
; AVX512BW-LABEL: f32xi16_i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096]
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retl
;
; AVX-64-LABEL: f32xi16_i64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096]
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-64-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-64-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f32xi16_i64:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096]
; AVX2-64-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512BW-64-LABEL: f32xi16_i64:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096]
; AVX512BW-64-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    retq
  %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
  %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
  ret <32 x i16> %res2
}


; <32 x i16> (512-bit) constant built by repeating the eight values i16 0..7,
; i.e. a 128-bit pattern, used by both the add and the and.
; With +avx only, vbroadcastf128 replicates the pattern into ymm2. Its low half
; (xmm2) is the addend for the four 128-bit vpaddw and the full ymm2 is the
; mask for the two 256-bit vandps.
; With +avx2, vbroadcasti128 fills ymm2, shared by two vpaddw and two vpand.
; With +avx512f,+avx512bw, vbroadcasti32x4 fills zmm1 for one vpaddw and one
; vpandq.
; NOTE(review) no check lines here use a prefix belonging to the avx512f-only
; RUN lines (those without +avx512bw), so that configuration looks unverified
; for this function - confirm whether that is intended.
define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
; AVX-LABEL: f32xi16_i128:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f32xi16_i128:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retl
;
; AVX512BW-LABEL: f32xi16_i128:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retl
;
; AVX-64-LABEL: f32xi16_i128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-64-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-64-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f32xi16_i128:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-64-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512BW-64-LABEL: f32xi16_i128:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX512BW-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-64-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    retq
  %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a
  %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1
  ret <32 x i16> %res2
}


; <32 x i16> (512-bit) constant built by repeating the sixteen values
; i16 0..15, i.e. a 256-bit pattern, used by both the add and the and.
; With +avx only, no broadcast is expected. The add uses two separate xmm
; constants (values 8..15 for the upper 128-bit halves, values 0..7 for the
; lower ones) and the and uses a full 256-bit constant loaded with vmovaps.
; With +avx2, the pattern already fills a ymm register, so a plain vmovdqa into
; ymm2 is shared by two vpaddw and two vpand.
; With +avx512f,+avx512bw, vbroadcasti64x4 replicates the 256-bit pattern into
; zmm1 for one vpaddw and one vpandq.
; NOTE(review) no check lines here use a prefix belonging to the avx512f-only
; RUN lines (those without +avx512bw), so that configuration looks unverified
; for this function - confirm whether that is intended.
define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
; AVX-LABEL: f32xi16_i256:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15]
; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7]
; AVX-NEXT:    vpaddw %xmm4, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpaddw %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f32xi16_i256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retl
;
; AVX512BW-LABEL: f32xi16_i256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retl
;
; AVX-64-LABEL: f32xi16_i256:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15]
; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7]
; AVX-64-NEXT:    vpaddw %xmm4, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vpaddw %xmm4, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f32xi16_i256:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-64-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512BW-64-LABEL: f32xi16_i256:
; AVX512BW-64:       # %bb.0:
; AVX512BW-64-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-64-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT:    retq
  %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
  %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %res1
  ret <32 x i16> %res2
}


; <4 x i32> constant built by repeating the pair <i32 0, i32 1>, i.e. a 64-bit
; pattern, used by both the add and the and. Viewed as one 64-bit element the
; pair has the value 4294967296 (0x0000000100000000). Expected code broadcasts
; that element into xmm1 (vmovddup with +avx only, vpbroadcastq with +avx2 or
; +avx512f) and reuses it for vpaddd and vpand.
define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
; AVX-LABEL: f4xi32_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [4294967296,4294967296]
; AVX-NEXT:    # xmm1 = mem[0,0]
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f4xi32_i64:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4294967296,4294967296]
; ALL32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f4xi32_i64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = [4294967296,4294967296]
; AVX-64-NEXT:    # xmm1 = mem[0,0]
; AVX-64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f4xi32_i64:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4294967296,4294967296]
; ALL64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    retq
  %res1 = add <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %a
  %res2 = and <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %res1
  ret <4 x i32> %res2
}


; <8 x i32> (256-bit) constant built by repeating the 64-bit pair
; <i32 0, i32 1>, used by both the add and the and.
; With +avx only, the checks expect the add to be split into two 128-bit vpaddd
; sharing an xmm constant produced by vmovddup, and the and to be a 256-bit
; vandps with a constant-pool memory operand (the LCPI reference).
; With +avx2 or +avx512f, one vpbroadcastq into ymm1 feeds both vpaddd and vpand.
define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
; AVX-LABEL: f8xi32_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [4294967296,4294967296]
; AVX-NEXT:    # xmm2 = mem[0,0]
; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f8xi32_i64:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967296,4294967296,4294967296,4294967296]
; ALL32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f8xi32_i64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = [4294967296,4294967296]
; AVX-64-NEXT:    # xmm2 = mem[0,0]
; AVX-64-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f8xi32_i64:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4294967296,4294967296,4294967296,4294967296]
; ALL64-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    retq
  %res1 = add <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
  %res2 = and <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
  ret <8 x i32> %res2
}


; <8 x i32> (256-bit) constant built by repeating <i32 0, i32 1, i32 2, i32 3>,
; i.e. a 128-bit pattern, used by both the add and the and.
; With +avx only, the 128-bit pattern is loaded whole into xmm2 with vmovdqa and
; shared by the two 128-bit vpaddd halves, and the and is a 256-bit vandps with
; a constant-pool memory operand.
; With +avx2 or +avx512f, vbroadcasti128 replicates the 128-bit pattern into
; ymm1, which feeds both vpaddd and vpand.
define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
; AVX-LABEL: f8xi32_i128:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3]
; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f8xi32_i128:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3]
; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f8xi32_i128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3]
; AVX-64-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f8xi32_i128:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3]
; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL64-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    retq
  %res1 = add <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
  %res2 = and <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
  ret <8 x i32> %res2
}


; <16 x i32> (512-bit) constant built by repeating the 64-bit pair
; <i32 0, i32 1>, used by both the add and the and.
; With +avx only, vbroadcastsd fills ymm2 once. Its low half (xmm2) is the
; addend for the four 128-bit vpaddd and the full ymm2 is the mask for the two
; 256-bit vandps.
; With +avx2, vpbroadcastq fills ymm2, shared by two vpaddd and two vpand.
; With +avx512f (with or without +avx512bw), vpbroadcastq fills zmm1 for one
; vpaddd and one vpandd.
define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
; AVX-LABEL: f16xi32_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f16xi32_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retl
;
; AVX512-LABEL: f16xi32_i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296]
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retl
;
; AVX-64-LABEL: f16xi32_i64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-64-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-64-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f16xi32_i64:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
; AVX2-64-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512F-64-LABEL: f16xi32_i64:
; AVX512F-64:       # %bb.0:
; AVX512F-64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296]
; AVX512F-64-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    vpandd %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    retq
  %res1 = add <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
  %res2 = and <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
  ret <16 x i32> %res2
}


; <16 x i32> (512-bit) constant built by repeating
; <i32 0, i32 1, i32 2, i32 3>, i.e. a 128-bit pattern, used by both the add
; and the and.
; With +avx only, vbroadcastf128 replicates the pattern into ymm2. Its low half
; (xmm2) is the addend for the four 128-bit vpaddd and the full ymm2 is the
; mask for the two 256-bit vandps.
; With +avx2, vbroadcasti128 fills ymm2, shared by two vpaddd and two vpand.
; With +avx512f (with or without +avx512bw), vbroadcasti32x4 fills zmm1 for one
; vpaddd and one vpandd.
define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
; AVX-LABEL: f16xi32_i128:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f16xi32_i128:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retl
;
; AVX512-LABEL: f16xi32_i128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retl
;
; AVX-64-LABEL: f16xi32_i128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-64-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-64-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
; AVX-64-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f16xi32_i128:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-64-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512F-64-LABEL: f16xi32_i128:
; AVX512F-64:       # %bb.0:
; AVX512F-64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    vpandd %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    retq
  %res1 = add <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
  %res2 = and <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
  ret <16 x i32> %res2
}


; <4 x i64> (256-bit) constant built by repeating the pair <i64 0, i64 1>,
; i.e. a 128-bit pattern, used by both the add and the and.
; With +avx only, the 128-bit pattern is loaded whole into xmm2 with vmovdqa and
; shared by the two 128-bit vpaddq halves, and the and is a 256-bit vandps with
; a constant-pool memory operand.
; With +avx2 or +avx512f, vbroadcasti128 replicates the 128-bit pattern into
; ymm1, which feeds both vpaddq and vpand.
; The asm comment prints the same constant at different element granularity
; depending on the run (four or eight 32-bit pieces on i686, sixteen bytes or
; four 64-bit values on x86_64), so the bracketed lists differ between triples
; even though the bits are the same - presumably a printing difference only.
define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
; AVX-LABEL: f4xi64_i128:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,1,0]
; AVX-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f4xi64_i128:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL32-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f4xi64_i128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
; AVX-64-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
; AVX-64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f4xi64_i128:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,1,0,1]
; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    retq
  %res1 = add <4 x i64> <i64 0, i64 1, i64 0, i64 1>, %a
  %res2 = and <4 x i64> <i64 0, i64 1, i64 0, i64 1>, %res1
  ret <4 x i64> %res2
}


; <8 x i64> constant that repeats the 128-bit pair <i64 0, i64 1>.
; The AVX512F runs are expected to broadcast the pair into a zmm register with
; vbroadcasti32x4. The AVX2 runs are expected to broadcast it into a ymm
; register with vbroadcasti128 and reuse that register for both ymm halves of
; the argument.
define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
; AVX-LABEL: f8xi64_i128:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT:    vpaddq %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX-NEXT:    vpaddq %xmm2, %xmm3, %xmm3
; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f8xi64_i128:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retl
;
; AVX512-LABEL: f8xi64_i128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0]
; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retl
;
; AVX-64-LABEL: f8xi64_i128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
; AVX-64-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-64-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f8xi64_i128:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,0,1]
; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-64-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512F-64-LABEL: f8xi64_i128:
; AVX512F-64:       # %bb.0:
; AVX512F-64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1]
; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    retq
  %res1 = add <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %a
  %res2 = and <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %res1
  ret <8 x i64> %res2
}


; <8 x i64> constant that repeats the 256-bit group <i64 0, i64 1, i64 2, i64 3>.
; Only the AVX512F runs are expected to use a broadcast (vbroadcasti64x4 into
; zmm). The AVX and AVX2 runs are expected to use plain vmovdqa/vmovaps
; constant loads, since the repeating group already fills a whole ymm register.
define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
; AVX-LABEL: f8xi64_i256:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,0,3,0]
; AVX-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,0,1,0]
; AVX-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0]
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f8xi64_i256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0]
; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retl
;
; AVX512-LABEL: f8xi64_i256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,0,0,1,0,2,0,3,0]
; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    retl
;
; AVX-64-LABEL: f8xi64_i256:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3]
; AVX-64-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
; AVX-64-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-64-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; AVX-64-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3]
; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f8xi64_i256:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3]
; AVX2-64-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512F-64-LABEL: f8xi64_i256:
; AVX512F-64:       # %bb.0:
; AVX512F-64-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    retq
  %res1 = add <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %a
  %res2 = and <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %res1
  ret <8 x i64> %res2
}


; <4 x float> constant that repeats the 64-bit pair <float 2.0, float 1.0>.
; Every configuration is expected to splat the pair with vmovddup and reuse the
; register for both the fadd and the fdiv. The checks print the pair as the
; 64-bit integer 4575657222482165760 (0x3F80000040000000), i.e. 1.0f in the high
; half and 2.0f in the low half.
define <4 x float> @f4xf32_f64(<4 x float> %a) {
; AVX-LABEL: f4xf32_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760]
; AVX-NEXT:    # xmm1 = mem[0,0]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f4xf32_f64:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760]
; ALL32-NEXT:    # xmm1 = mem[0,0]
; ALL32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f4xf32_f64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760]
; AVX-64-NEXT:    # xmm1 = mem[0,0]
; AVX-64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f4xf32_f64:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760]
; ALL64-NEXT:    # xmm1 = mem[0,0]
; ALL64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; ALL64-NEXT:    retq
  %res1 = fadd <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %a
  %res2 = fdiv <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %res1
  ret <4 x float> %res2
}


; <8 x float> constant that repeats the 64-bit pair <float 2.0, float 1.0>.
; Every configuration is expected to splat the pair across a ymm register with
; vbroadcastsd and reuse it for both the fadd and the fdiv. The checks print
; the pair as the 64-bit integer 4575657222482165760 (0x3F80000040000000).
define <8 x float> @f8xf32_f64(<8 x float> %a) {
; AVX-LABEL: f8xf32_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f8xf32_f64:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; ALL32-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f8xf32_f64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; AVX-64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f8xf32_f64:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; ALL64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; ALL64-NEXT:    retq
  %res1 = fadd <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
  %res2 = fdiv <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
  ret <8 x float> %res2
}


; <8 x float> constant that repeats the 128-bit group
; <float 4.0, float 1.0, float 2.0, float 3.0>.
; Every configuration is expected to load the group once with vbroadcastf128
; into a ymm register and reuse it for both the fadd and the fdiv.
define <8 x float> @f8xf32_f128(<8 x float> %a) {
; AVX-LABEL: f8xf32_f128:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f8xf32_f128:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL32-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f8xf32_f128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX-64-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX-64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f8xf32_f128:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; ALL64-NEXT:    retq
  %res1 = fadd <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
  %res2 = fdiv <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
  ret <8 x float> %res2
}


; <16 x float> constant that repeats the 64-bit pair <float 2.0, float 1.0>.
; The AVX512F runs are expected to splat the pair across a zmm register with
; vbroadcastsd. The AVX and AVX2 runs are expected to splat it across a ymm
; register and reuse that register for both ymm halves of the argument.
; The checks print the pair as the 64-bit integer 4575657222482165760.
define <16 x float> @f16xf32_f64(<16 x float> %a) {
; AVX-LABEL: f16xf32_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f16xf32_f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retl
;
; AVX512-LABEL: f16xf32_f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retl
;
; AVX-64-LABEL: f16xf32_f64:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; AVX-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f16xf32_f64:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; AVX2-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX2-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512F-64-LABEL: f16xf32_f64:
; AVX512F-64:       # %bb.0:
; AVX512F-64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; AVX512F-64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512F-64-NEXT:    retq
  %res1 = fadd <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
  %res2 = fdiv <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
  ret <16 x float> %res2
}


; <16 x float> constant that repeats the 128-bit group
; <float 4.0, float 1.0, float 2.0, float 3.0>.
; The AVX512F runs are expected to broadcast the group into a zmm register with
; vbroadcastf32x4. The AVX and AVX2 runs are expected to use vbroadcastf128
; into a ymm register that is reused for both ymm halves of the argument.
define <16 x float> @f16xf32_f128(<16 x float> %a) {
; AVX-LABEL: f16xf32_f128:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f16xf32_f128:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retl
;
; AVX512-LABEL: f16xf32_f128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retl
;
; AVX-64-LABEL: f16xf32_f128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f16xf32_f128:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX2-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512F-64-LABEL: f16xf32_f128:
; AVX512F-64:       # %bb.0:
; AVX512F-64-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512F-64-NEXT:    retq
  %res1 = fadd <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
  %res2 = fdiv <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
  ret <16 x float> %res2
}


; <16 x float> constant that repeats the 256-bit group
; <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>.
; Only the AVX512F runs are expected to use a broadcast (vbroadcastf64x4 into
; zmm). The AVX and AVX2 runs are expected to load the group with a plain
; vmovaps, since it already fills a whole ymm register.
define <16 x float> @f16xf32_f256(<16 x float> %a) {
; AVX-LABEL: f16xf32_f256:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f16xf32_f256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retl
;
; AVX512-LABEL: f16xf32_f256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retl
;
; AVX-64-LABEL: f16xf32_f256:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
; AVX-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f16xf32_f256:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
; AVX2-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX2-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512F-64-LABEL: f16xf32_f256:
; AVX512F-64:       # %bb.0:
; AVX512F-64-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512F-64-NEXT:    retq
  %res1 = fadd <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %a
  %res2 = fdiv <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %res1
  ret <16 x float> %res2
}


; <4 x double> constant that repeats the 128-bit pair <double 2.0, double 1.0>.
; Every configuration is expected to load the pair once with vbroadcastf128
; into a ymm register and reuse it for both the fadd and the fdiv.
define <4 x double> @f4xf64_f128(<4 x double> %a) {
; AVX-LABEL: f4xf64_f128:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f4xf64_f128:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ALL32-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f4xf64_f128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX-64-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX-64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-64-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f4xf64_f128:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
; ALL64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ALL64-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
; ALL64-NEXT:    retq
  %res1 = fadd <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %a
  %res2 = fdiv <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %res1
  ret <4 x double> %res2
}


; <8 x double> constant that repeats the 128-bit pair <double 2.0, double 1.0>.
; The AVX512F runs are expected to broadcast the pair into a zmm register with
; vbroadcastf32x4. The AVX and AVX2 runs are expected to use vbroadcastf128
; into a ymm register that is reused for both ymm halves of the argument.
define <8 x double> @f8xf64_f128(<8 x double> %a) {
; AVX-LABEL: f8xf64_f128:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f8xf64_f128:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retl
;
; AVX512-LABEL: f8xf64_f128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retl
;
; AVX-64-LABEL: f8xf64_f128:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX-64-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f8xf64_f128:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX2-64-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512F-64-LABEL: f8xf64_f128:
; AVX512F-64:       # %bb.0:
; AVX512F-64-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
; AVX512F-64-NEXT:    retq
  %res1 = fadd <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %a
  %res2 = fdiv <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %res1
  ret <8 x double> %res2
}


; AVX512:       .LCPI37
; AVX512-NEXT:  .quad	0x4010000000000000      # double 4
; AVX512-NEXT:  .quad	0x3ff0000000000000      # double 1
; AVX512-NEXT:  .quad	0x4000000000000000      # double 2
; AVX512-NEXT:  .quad	0x4008000000000000      # double 3
; AVX512-NOT:   .quad

; <8 x double> constant that repeats the 256-bit group
; <double 4.0, double 1.0, double 2.0, double 3.0>.
; Only the AVX512F runs are expected to use a broadcast (vbroadcastf64x4 into
; zmm). The AVX and AVX2 runs are expected to load the group with a plain
; vmovapd, since it already fills a whole ymm register.
define <8 x double> @f8xf64_f256(<8 x double> %a) {
; AVX-LABEL: f8xf64_f256:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX-NEXT:    retl
;
; AVX2-LABEL: f8xf64_f256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retl
;
; AVX512-LABEL: f8xf64_f256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retl
;
; AVX-64-LABEL: f8xf64_f256:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX-64-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX-64-NEXT:    retq
;
; AVX2-64-LABEL: f8xf64_f256:
; AVX2-64:       # %bb.0:
; AVX2-64-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX2-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
; AVX2-64-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
; AVX2-64-NEXT:    retq
;
; AVX512F-64-LABEL: f8xf64_f256:
; AVX512F-64:       # %bb.0:
; AVX512F-64-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
; AVX512F-64-NEXT:    retq
  %res1 = fadd <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %a
  %res2 = fdiv <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %res1
  ret <8 x double> %res2
}


; <8 x i16> constant that repeats the 32-bit pair <i16 0, i16 -70>.
; Viewed as one 32-bit element the pair is 4290379776 (0xFFBA0000), which is a
; NaN bit pattern when read as an IEEE-754 single (all-ones exponent, non-zero
; mantissa). The plain AVX runs are expected to splat it with the float-domain
; vbroadcastss; AVX2 and later use the integer vpbroadcastd. Both forms must
; print the same integer value.
; NOTE(review): presumably this guards against the payload being altered when
; the integer constant is routed through a float broadcast - confirm against
; the commit that introduced the test.
define <8 x i16> @f8xi16_i32_NaN(<8 x i16> %a) {
; AVX-LABEL: f8xi16_i32_NaN:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retl
;
; ALL32-LABEL: f8xi16_i32_NaN:
; ALL32:       # %bb.0:
; ALL32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
; ALL32-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT:    retl
;
; AVX-64-LABEL: f8xi16_i32_NaN:
; AVX-64:       # %bb.0:
; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
; AVX-64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT:    retq
;
; ALL64-LABEL: f8xi16_i32_NaN:
; ALL64:       # %bb.0:
; ALL64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
; ALL64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ALL64-NEXT:    retq
  %res1 = add <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %a
  %res2 = and <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %res1
  ret <8 x i16> %res2
}