#include <x86intrin.h>
/*
 * Masked add of the 64-bit lanes of x and y (vpaddq), with the write mask
 * handed to the instruction through the "Yk" mask-register constraint.
 *
 * msk:  8-bit write mask, one bit per 64-bit lane.
 * x, y: 512-bit vector operands.
 * Returns the vector register written by the instruction.
 *
 * NOTE(review): the template uses merge-masking (no {z}) and the output is
 * write-only ("=x"), so lanes whose mask bit is clear hold whatever the
 * allocated register contained before. Presumably fine for a constraint
 * test; confirm before relying on those lanes.
 */
__m512i mask_Yk_i8(char msk, __m512i x, __m512i y)
{
    __m512i sum;

    /* Operand order is kept as: output, mask, x, y. */
    asm ("vpaddq\t%[rhs], %[lhs], %[out] %{%[mask]%}"
         : [out] "=x" (sum)
         : [mask] "Yk" (msk), [lhs] "x" (x), [rhs] "x" (y));

    return sum;
}
/*
 * Masked add of the 32-bit lanes of x and y (vpaddd), with the write mask
 * handed to the instruction through the "Yk" mask-register constraint.
 *
 * msk:  16-bit write mask, one bit per 32-bit lane.
 * x, y: 512-bit vector operands.
 * Returns the vector register written by the instruction.
 *
 * NOTE(review): the template uses merge-masking (no {z}) and the output is
 * write-only ("=x"), so lanes whose mask bit is clear hold whatever the
 * allocated register contained before. Presumably fine for a constraint
 * test; confirm before relying on those lanes.
 */
__m512i mask_Yk_i16(short msk, __m512i x, __m512i y)
{
    __m512i sum;

    /* Operand order is kept as: output, mask, x, y. */
    asm ("vpaddd\t%[rhs], %[lhs], %[out] %{%[mask]%}"
         : [out] "=x" (sum)
         : [mask] "Yk" (msk), [lhs] "x" (x), [rhs] "x" (y));

    return sum;
}
/*
 * Masked add of the 16-bit lanes of x and y (vpaddw), with the write mask
 * handed to the instruction through the "Yk" mask-register constraint.
 *
 * msk:  32-bit write mask, one bit per 16-bit lane.
 * x, y: 512-bit vector operands.
 * Returns the vector register written by the instruction.
 *
 * NOTE(review): the template uses merge-masking (no {z}) and the output is
 * write-only ("=x"), so lanes whose mask bit is clear hold whatever the
 * allocated register contained before. Presumably fine for a constraint
 * test; confirm before relying on those lanes.
 */
__m512i mask_Yk_i32(int msk, __m512i x, __m512i y)
{
    __m512i sum;

    /* Operand order is kept as: output, mask, x, y. */
    asm ("vpaddw\t%[rhs], %[lhs], %[out] %{%[mask]%}"
         : [out] "=x" (sum)
         : [mask] "Yk" (msk), [lhs] "x" (x), [rhs] "x" (y));

    return sum;
}
/*
 * Masked add of the 8-bit lanes of x and y (vpaddb), with the write mask
 * handed to the instruction through the "Yk" mask-register constraint.
 *
 * msk:  64-bit write mask, one bit per 8-bit lane.
 * x, y: 512-bit vector operands.
 * Returns the vector register written by the instruction.
 *
 * NOTE(review): the template uses merge-masking (no {z}) and the output is
 * write-only ("=x"), so lanes whose mask bit is clear hold whatever the
 * allocated register contained before. Presumably fine for a constraint
 * test; confirm before relying on those lanes.
 */
__m512i mask_Yk_i64(long long msk, __m512i x, __m512i y)
{
    __m512i sum;

    /* Operand order is kept as: output, mask, x, y. */
    asm ("vpaddb\t%[rhs], %[lhs], %[out] %{%[mask]%}"
         : [out] "=x" (sum)
         : [mask] "Yk" (msk), [lhs] "x" (x), [rhs] "x" (y));

    return sum;
}
/*
 * Bitwise AND of two 8-bit masks, carried out in mask registers (kandb).
 * Both inputs and the output use the "k" constraint.
 *
 * msk_src1, msk_src2: the masks to combine.
 * Returns the value the instruction leaves in the destination mask
 * register, i.e. msk_src1 & msk_src2.
 */
char k_wise_op_i8(char msk_src1, char msk_src2)
{
    char anded;

    /* Operand order is kept as: output, src1, src2. */
    asm ("kandb\t%[b], %[a], %[res]"
         : [res] "=k" (anded)
         : [a] "k" (msk_src1), [b] "k" (msk_src2));

    return anded;
}
/*
 * Bitwise AND of two 16-bit masks, carried out in mask registers (kandw).
 * Both inputs and the output use the "k" constraint.
 *
 * msk_src1, msk_src2: the masks to combine.
 * Returns the value the instruction leaves in the destination mask
 * register, i.e. msk_src1 & msk_src2.
 */
short k_wise_op_i16(short msk_src1, short msk_src2)
{
    short anded;

    /* Operand order is kept as: output, src1, src2. */
    asm ("kandw\t%[b], %[a], %[res]"
         : [res] "=k" (anded)
         : [a] "k" (msk_src1), [b] "k" (msk_src2));

    return anded;
}
/*
 * Bitwise AND of two 32-bit masks, carried out in mask registers (kandd).
 * Both inputs and the output use the "k" constraint.
 *
 * msk_src1, msk_src2: the masks to combine.
 * Returns the value the instruction leaves in the destination mask
 * register, i.e. msk_src1 & msk_src2.
 */
int k_wise_op_i32(int msk_src1, int msk_src2)
{
    int anded;

    /* Operand order is kept as: output, src1, src2. */
    asm ("kandd\t%[b], %[a], %[res]"
         : [res] "=k" (anded)
         : [a] "k" (msk_src1), [b] "k" (msk_src2));

    return anded;
}
/*
 * Bitwise AND of two 64-bit masks, carried out in mask registers (kandq).
 * Both inputs and the output use the "k" constraint.
 *
 * msk_src1, msk_src2: the masks to combine.
 * Returns the value the instruction leaves in the destination mask
 * register, i.e. msk_src1 & msk_src2.
 */
long long k_wise_op_i64(long long msk_src1, long long msk_src2)
{
    long long anded;

    /* Operand order is kept as: output, src1, src2. */
    asm ("kandq\t%[b], %[a], %[res]"
         : [res] "=k" (anded)
         : [a] "k" (msk_src1), [b] "k" (msk_src2));

    return anded;
}