/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized on multiple targets.  */
#ifndef NO_WARN_X86_INTRINSICS
#error \
"Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_
#if defined(__ppc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <altivec.h>
#include <xmmintrin.h>
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i_u
__attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef double __m128d_u
__attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
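
/* _MM_SHUFFLE2(x, y) builds the two-bit selector for _mm_shuffle_pd:
   bit 0 (y) picks result element [0] from __A[y], and bit 1 (x) picks
   result element [1] from __B[x].  For example, _MM_SHUFFLE2(1, 0) is
   2 and selects {__A[0], __B[1]}.  */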
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd(double __F) {
return __extension__(__m128d){__F, 0.0};
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd(double __F) {
return __extension__(__m128d){__F, __F};
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1(double __F) {
return _mm_set1_pd(__F);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd(double __W, double __X) {
return __extension__(__m128d){__X, __W};
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd(double __W, double __X) {
return __extension__(__m128d){__W, __X};
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_pd(void) {
  /* Intentional self-initialization: returns an undefined value
     without reading memory and without triggering an uninitialized
     use warning in the caller.  */
  __m128d __Y = __Y;
return __Y;
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd(void) {
return (__m128d)vec_splats(0);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd(__m128d __A, __m128d __B) {
__v2df __result = (__v2df)__A;
__result[0] = ((__v2df)__B)[0];
return (__m128d)__result;
}
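
/* _mm_load_pd requires a 16-byte aligned address (vec_ld simply
   ignores the low four address bits), while _mm_loadu_pd accepts any
   alignment via vec_vsx_ld.  */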
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd(double const *__P) {
return ((__m128d)vec_ld(0, (__v16qu *)__P));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd(double const *__P) {
return (vec_vsx_ld(0, __P));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd(double const *__P) {
return (vec_splats(*__P));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd(double const *__P) {
return _mm_set_sd(*__P);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1(double const *__P) {
return _mm_load1_pd(__P);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd(double const *__P) {
__v2df __tmp = _mm_load_pd(__P);
return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd(double *__P, __m128d __A) {
vec_st((__v16qu)__A, 0, (__v16qu *)__P);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd(double *__P, __m128d __A) {
*(__m128d_u *)__P = __A;
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd(double *__P, __m128d __A) {
*__P = ((__v2df)__A)[0];
}
extern __inline double
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64(__m128d __A) {
return ((__v2df)__A)[0];
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd(double *__P, __m128d __A) {
_mm_store_sd(__P, __A);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd(double *__P, __m128d __A) {
*__P = ((__v2df)__A)[1];
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd(double *__P, __m128d __A) {
_mm_store_pd(__P, vec_splat(__A, 0));
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1(double *__P, __m128d __A) {
_mm_store1_pd(__P, __A);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd(double *__P, __m128d __A) {
_mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64(__m128i __A) {
return ((__v2di)__A)[0];
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x(__m128i __A) {
return ((__v2di)__A)[0];
}
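
/* Packed and scalar arithmetic.  As on x86, the _sd forms operate on
   element [0] only and pass element [1] of the first operand through
   unchanged.  */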
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd(__m128d __A, __m128d __B) {
return (__m128d)((__v2df)__A + (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd(__m128d __A, __m128d __B) {
__A[0] = __A[0] + __B[0];
return (__A);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd(__m128d __A, __m128d __B) {
return (__m128d)((__v2df)__A - (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd(__m128d __A, __m128d __B) {
__A[0] = __A[0] - __B[0];
return (__A);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd(__m128d __A, __m128d __B) {
return (__m128d)((__v2df)__A * (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd(__m128d __A, __m128d __B) {
__A[0] = __A[0] * __B[0];
return (__A);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd(__m128d __A, __m128d __B) {
return (__m128d)((__v2df)__A / (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd(__m128d __A, __m128d __B) {
__A[0] = __A[0] / __B[0];
return (__A);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd(__m128d __A) {
return (vec_sqrt(__A));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd(__m128d __A, __m128d __B) {
__v2df __c;
__c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd(__m128d __A, __m128d __B) {
return (vec_min(__A, __B));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_sd(__m128d __A, __m128d __B) {
__v2df __a, __b, __c;
__a = vec_splats(__A[0]);
__b = vec_splats(__B[0]);
__c = vec_min(__a, __b);
return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pd(__m128d __A, __m128d __B) {
return (vec_max(__A, __B));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_sd(__m128d __A, __m128d __B) {
__v2df __a, __b, __c;
__a = vec_splats(__A[0]);
__b = vec_splats(__B[0]);
__c = vec_max(__a, __b);
return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
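
/* Packed compares return a per-element mask of all 1s (true) or all
   0s (false), so the result combines directly with the logical ops
   defined further below, e.g. a branchless select of the larger
   element:
     __m128d __m = _mm_cmpgt_pd(__x, __y);
     __m128d __r = _mm_or_pd(_mm_and_pd(__m, __x),
                             _mm_andnot_pd(__m, __y));  */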
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pd(__m128d __A, __m128d __B) {
return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_pd(__m128d __A, __m128d __B) {
return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_pd(__m128d __A, __m128d __B) {
return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pd(__m128d __A, __m128d __B) {
return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_pd(__m128d __A, __m128d __B) {
return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_pd(__m128d __A, __m128d __B) {
__v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
return ((__m128d)vec_nor(__temp, __temp));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_pd(__m128d __A, __m128d __B) {
return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_pd(__m128d __A, __m128d __B) {
return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_pd(__m128d __A, __m128d __B) {
return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_pd(__m128d __A, __m128d __B) {
return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}
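
/* x == x is false only when x is a NaN, so the ordered/unordered
   predicates below are built from element-wise self-comparisons.  */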
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd(__m128d __A, __m128d __B) {
__v2du __c, __d;
__c = (__v2du)vec_cmpeq(__A, __A);
__d = (__v2du)vec_cmpeq(__B, __B);
return ((__m128d)vec_and(__c, __d));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd(__m128d __A, __m128d __B) {
#if _ARCH_PWR8
__v2du __c, __d;
__c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
__d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
__c = vec_nor(__c, __c);
return ((__m128d)vec_orc(__c, __d));
#else
__v2du __c, __d;
__c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
__d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
__c = vec_nor(__c, __c);
__d = vec_nor(__d, __d);
return ((__m128d)vec_or(__c, __d));
#endif
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd(__m128d __A, __m128d __B) {
__v2df __a, __b, __c;
__a = vec_splats(__A[0]);
__b = vec_splats(__B[0]);
__c = (__v2df)vec_cmpeq(__a, __b);
return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_sd(__m128d __A, __m128d __B) {
__v2df __a, __b, __c;
__a = vec_splats(__A[0]);
__b = vec_splats(__B[0]);
__c = (__v2df)vec_cmplt(__a, __b);
return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_sd(__m128d __A, __m128d __B) {
__v2df __a, __b, __c;
__a = vec_splats(__A[0]);
__b = vec_splats(__B[0]);
__c = (__v2df)vec_cmple(__a, __b);
return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_sd(__m128d __A, __m128d __B) {
__v2df __a, __b, __c;
__a = vec_splats(__A[0]);
__b = vec_splats(__B[0]);
__c = (__v2df)vec_cmpgt(__a, __b);
return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_sd(__m128d __A, __m128d __B) {
__v2df __a, __b, __c;
__a = vec_splats(__A[0]);
__b = vec_splats(__B[0]);
__c = (__v2df)vec_cmpge(__a, __b);
return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_sd(__m128d __A, __m128d __B) {
__v2df __a, __b, __c;
__a = vec_splats(__A[0]);
__b = vec_splats(__B[0]);
__c = (__v2df)vec_cmpeq(__a, __b);
__c = vec_nor(__c, __c);
return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_sd(__m128d __A, __m128d __B) {
__v2df __a, __b, __c;
__a = vec_splats(__A[0]);
__b = vec_splats(__B[0]);
__c = (__v2df)vec_cmpge(__a, __b);
return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_sd(__m128d __A, __m128d __B) {
__v2df __a, __b, __c;
__a = vec_splats(__A[0]);
__b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpgt(__a, __b);
return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_sd(__m128d __A, __m128d __B) {
__v2df __a, __b, __c;
__a = vec_splats(__A[0]);
__b = vec_splats(__B[0]);
__c = (__v2df)vec_cmple(__a, __b);
return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_sd(__m128d __A, __m128d __B) {
__v2df __a, __b, __c;
__a = vec_splats(__A[0]);
__b = vec_splats(__B[0]);
__c = (__v2df)vec_cmplt(__a, __b);
return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_sd(__m128d __A, __m128d __B) {
__v2df __r;
__r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_sd(__m128d __A, __m128d __B) {
__v2df __r;
  __r = (__v2df)_mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
return (__m128d)_mm_setr_pd(__r[0], __A[1]);
}
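
/* Note that the _mm_comi* and _mm_ucomi* forms below compile to the
   same scalar C comparisons; the x86 difference in quiet-NaN
   signaling between COMISD and UCOMISD is not modeled here.  */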
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd(__m128d __A, __m128d __B) {
return (__A[0] == __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd(__m128d __A, __m128d __B) {
return (__A[0] < __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd(__m128d __A, __m128d __B) {
return (__A[0] <= __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd(__m128d __A, __m128d __B) {
return (__A[0] > __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd(__m128d __A, __m128d __B) {
return (__A[0] >= __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd(__m128d __A, __m128d __B) {
return (__A[0] != __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd(__m128d __A, __m128d __B) {
return (__A[0] == __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd(__m128d __A, __m128d __B) {
return (__A[0] < __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd(__m128d __A, __m128d __B) {
return (__A[0] <= __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd(__m128d __A, __m128d __B) {
return (__A[0] > __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd(__m128d __A, __m128d __B) {
return (__A[0] >= __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd(__m128d __A, __m128d __B) {
return (__A[0] != __B[0]);
}
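
/* Integer sets.  As on x86, the _mm_set_* forms take arguments from
   the most significant element down, while the _mm_setr_* forms take
   them in memory (reversed) order.  */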
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64x(long long __q1, long long __q0) {
return __extension__(__m128i)(__v2di){__q0, __q1};
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64(__m64 __q1, __m64 __q0) {
return _mm_set_epi64x((long long)__q1, (long long)__q0);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
short __q2, short __q1, short __q0) {
return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
__q4, __q5, __q6, __q7};
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
char __q10, char __q09, char __q08, char __q07, char __q06,
char __q05, char __q04, char __q03, char __q02, char __q01,
char __q00) {
return __extension__(__m128i)(__v16qi){
__q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
__q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64x(long long __A) {
return _mm_set_epi64x(__A, __A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64(__m64 __A) {
return _mm_set_epi64(__A, __A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi32(int __A) {
return _mm_set_epi32(__A, __A, __A, __A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi16(short __A) {
return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi8(char __A) {
return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
__A, __A, __A, __A, __A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi64(__m64 __q0, __m64 __q1) {
return _mm_set_epi64(__q1, __q0);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
return _mm_set_epi32(__q3, __q2, __q1, __q0);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
short __q5, short __q6, short __q7) {
return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
char __q05, char __q06, char __q07, char __q08, char __q09,
char __q10, char __q11, char __q12, char __q13, char __q14,
char __q15) {
return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
__q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128(__m128i const *__P) {
return *__P;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si128(__m128i_u const *__P) {
return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_epi64(__m128i_u const *__P) {
return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128(__m128i *__P, __m128i __B) {
vec_st((__v16qu)__B, 0, (__v16qu *)__P);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si128(__m128i_u *__P, __m128i __B) {
*__P = __B;
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64(__m128i_u *__P, __m128i __B) {
*(long long *)__P = ((__v2di)__B)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64(__m128i_u __B) {
return (__m64)((__v2di)__B)[0];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64(__m64 __A) {
return _mm_set_epi64((__m64)0LL, __A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_epi64(__m128i __A) {
return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_si128(void) {
  /* Intentional self-initialization; see _mm_undefined_pd above.  */
  __m128i __Y = __Y;
return __Y;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si128(void) {
return __extension__(__m128i)(__v4si){0, 0, 0, 0};
}
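
/* Conversions.  Where no single vector intrinsic matches, inline asm
   issues the VSX convert instruction directly, and the doubleword
   results are then merged/packed into the low word elements the x86
   API expects.  */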
#ifdef _ARCH_PWR8
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_pd(__m128i __A) {
__v2di __val;
__val = (__v2di)vec_unpackh((__v4si)__A);
return (__m128d)vec_ctf(__val, 0);
}
#endif
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_ps(__m128i __A) {
return ((__m128)vec_ctf((__v4si)__A, 0));
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi32(__m128d __A) {
__v2df __rounded = vec_rint(__A);
__v4si __result, __temp;
const __v4si __vzero = {0, 0, 0, 0};
__asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);
#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
__temp = vec_mergeo(__temp, __temp);
#else
__temp = vec_mergee(__temp, __temp);
#endif
__result = (__v4si)vec_vpkudum((__vector long long)__temp,
(__vector long long)__vzero);
#else
{
const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
__result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
}
#endif
return (__m128i)__result;
}
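
/* Rounding follows the current mode; under the default
   round-to-nearest-even, _mm_cvtpd_epi32(_mm_set_pd(2.5, 1.5))
   yields {2, 2, 0, 0}.  */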
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_pi32(__m128d __A) {
__m128i __result = _mm_cvtpd_epi32(__A);
return (__m64)__result[0];
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_ps(__m128d __A) {
__v4sf __result;
__v4si __temp;
const __v4si __vzero = {0, 0, 0, 0};
__asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
__temp = vec_mergeo(__temp, __temp);
#else
__temp = vec_mergee(__temp, __temp);
#endif
__result = (__v4sf)vec_vpkudum((__vector long long)__temp,
(__vector long long)__vzero);
#else
{
const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
__result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
}
#endif
return ((__m128)__result);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi32(__m128d __A) {
__v4si __result;
__v4si __temp;
const __v4si __vzero = {0, 0, 0, 0};
__asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
__temp = vec_mergeo(__temp, __temp);
#else
__temp = vec_mergee(__temp, __temp);
#endif
__result = (__v4si)vec_vpkudum((__vector long long)__temp,
(__vector long long)__vzero);
#else
{
const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
__result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
}
#endif
return ((__m128i)__result);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_pi32(__m128d __A) {
__m128i __result = _mm_cvttpd_epi32(__A);
return (__m64)__result[0];
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32(__m128i __A) {
return ((__v4si)__A)[0];
}
#ifdef _ARCH_PWR8
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_pd(__m64 __A) {
__v4si __temp;
__v2di __tmp2;
__v2df __result;
__temp = (__v4si)vec_splats(__A);
__tmp2 = (__v2di)vec_unpackl(__temp);
__result = vec_ctf((__vector signed long long)__tmp2, 0);
return (__m128d)__result;
}
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epi32(__m128 __A) {
__v4sf __rounded;
__v4si __result;
__rounded = vec_rint((__v4sf)__A);
__result = vec_cts(__rounded, 0);
return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epi32(__m128 __A) {
__v4si __result;
__result = vec_cts((__v4sf)__A, 0);
return (__m128i)__result;
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd(__m128 __A) {
#ifdef vec_doubleh
return (__m128d)vec_doubleh((__v4sf)__A);
#else
__v4sf __a = (__v4sf)__A;
__v4sf __temp;
__v2df __result;
#ifdef __LITTLE_ENDIAN__
__temp = __builtin_vsx_xxsldwi(__a, __a, 3);
__temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
#else
__temp = vec_vmrghw(__a, __a);
#endif
__asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
return (__m128d)__result;
#endif
}
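
/* _mm_cvtsd_si32/_mm_cvtsd_si64 below round via vec_rint (current
   rounding mode); the _mm_cvtt* forms instead rely on the C
   double-to-integer conversion, which always truncates toward
   zero.  */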
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si32(__m128d __A) {
__v2df __rounded = vec_rint((__v2df)__A);
int __result = ((__v2df)__rounded)[0];
return __result;
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64(__m128d __A) {
__v2df __rounded = vec_rint((__v2df)__A);
long long __result = ((__v2df)__rounded)[0];
return __result;
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64x(__m128d __A) {
return _mm_cvtsd_si64((__v2df)__A);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si32(__m128d __A) {
int __result = ((__v2df)__A)[0];
return __result;
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64(__m128d __A) {
long long __result = ((__v2df)__A)[0];
return __result;
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64x(__m128d __A) {
return _mm_cvttsd_si64(__A);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_ss(__m128 __A, __m128d __B) {
__v4sf __result = (__v4sf)__A;
#ifdef __LITTLE_ENDIAN__
__v4sf __temp_s;
__v2df __temp_b = vec_splat((__v2df)__B, 0);
__result = __builtin_vsx_xxsldwi(__result, __result, 3);
__asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
__result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
#else
__result[0] = ((__v2df)__B)[0];
#endif
return (__m128)__result;
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_sd(__m128d __A, int __B) {
__v2df __result = (__v2df)__A;
double __db = __B;
__result[0] = __db;
return (__m128d)__result;
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_sd(__m128d __A, long long __B) {
__v2df __result = (__v2df)__A;
double __db = __B;
__result[0] = __db;
return (__m128d)__result;
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_sd(__m128d __A, long long __B) {
return _mm_cvtsi64_sd(__A, __B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_sd(__m128d __A, __m128 __B) {
#ifdef __LITTLE_ENDIAN__
__v4sf __temp = vec_splat((__v4sf)__B, 0);
__v2df __res;
__asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
return (__m128d)vec_mergel(__res, (__v2df)__A);
#else
__v2df __res = (__v2df)__A;
__res[0] = ((__v4sf)__B)[0];
return (__m128d)__res;
#endif
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
__vector double __result;
const int __litmsk = __mask & 0x3;
if (__litmsk == 0)
__result = vec_mergeh(__A, __B);
#if __GNUC__ < 6
else if (__litmsk == 1)
__result = vec_xxpermdi(__B, __A, 2);
else if (__litmsk == 2)
__result = vec_xxpermdi(__B, __A, 1);
#else
else if (__litmsk == 1)
__result = vec_xxpermdi(__A, __B, 2);
else if (__litmsk == 2)
__result = vec_xxpermdi(__A, __B, 1);
#endif
else
__result = vec_mergel(__A, __B);
return __result;
}
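
/* For example, _mm_shuffle_pd(__A, __B, _MM_SHUFFLE2(0, 1)) returns
   {__A[1], __B[0]}.  */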
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pd(__m128d __A, __m128d __B) {
return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd(__m128d __A, __m128d __B) {
return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd(__m128d __A, double const *__B) {
__v2df __result = (__v2df)__A;
__result[1] = *__B;
return (__m128d)__result;
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd(__m128d __A, double const *__B) {
__v2df __result = (__v2df)__A;
__result[0] = *__B;
return (__m128d)__result;
}
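
/* _mm_movemask_pd gathers the two sign bits into the low bits of a
   GPR.  Without POWER10's vec_extractm, vbpermq does the gather: the
   permute control selects source bits 0 and 64 (big-endian bit
   numbering), which are the sign bits of the two doubles.  */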
#ifdef _ARCH_PWR8
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd(__m128d __A) {
#ifdef _ARCH_PWR10
return vec_extractm((__v2du)__A);
#else
__vector unsigned long long __result;
static const __vector unsigned int __perm_mask = {
#ifdef __LITTLE_ENDIAN__
0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
};
__result = ((__vector unsigned long long)vec_vbpermq(
(__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
#ifdef __LITTLE_ENDIAN__
return __result[1];
#else
return __result[0];
#endif
#endif
}
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16(__m128i __A, __m128i __B) {
return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi32(__m128i __A, __m128i __B) {
return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi16(__m128i __A, __m128i __B) {
return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi8(__m128i __A, __m128i __B) {
return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi16(__m128i __A, __m128i __B) {
return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi32(__m128i __A, __m128i __B) {
return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi64(__m128i __A, __m128i __B) {
return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi8(__m128i __A, __m128i __B) {
return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi16(__m128i __A, __m128i __B) {
return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi32(__m128i __A, __m128i __B) {
return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64(__m128i __A, __m128i __B) {
return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8(__m128i __A, __m128i __B) {
return (__m128i)((__v16qu)__A + (__v16qu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16(__m128i __A, __m128i __B) {
return (__m128i)((__v8hu)__A + (__v8hu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32(__m128i __A, __m128i __B) {
return (__m128i)((__v4su)__A + (__v4su)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64(__m128i __A, __m128i __B) {
return (__m128i)((__v2du)__A + (__v2du)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8(__m128i __A, __m128i __B) {
return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16(__m128i __A, __m128i __B) {
return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu8(__m128i __A, __m128i __B) {
return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16(__m128i __A, __m128i __B) {
return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8(__m128i __A, __m128i __B) {
return (__m128i)((__v16qu)__A - (__v16qu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16(__m128i __A, __m128i __B) {
return (__m128i)((__v8hu)__A - (__v8hu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32(__m128i __A, __m128i __B) {
return (__m128i)((__v4su)__A - (__v4su)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64(__m128i __A, __m128i __B) {
return (__m128i)((__v2du)__A - (__v2du)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8(__m128i __A, __m128i __B) {
return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16(__m128i __A, __m128i __B) {
return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu8(__m128i __A, __m128i __B) {
return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu16(__m128i __A, __m128i __B) {
return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_epi16(__m128i __A, __m128i __B) {
__vector signed int __zero = {0, 0, 0, 0};
return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16(__m128i __A, __m128i __B) {
__vector signed int __w0, __w1;
__vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
};
__w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
__w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
return (__m128i)vec_perm(__w0, __w1, __xform1);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16(__m128i __A, __m128i __B) {
return (__m128i)((__v8hi)__A * (__v8hi)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32(__m64 __A, __m64 __B) {
unsigned int __a = __A;
unsigned int __b = __B;
return ((__m64)__a * (__m64)__b);
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32(__m128i __A, __m128i __B) {
#if __GNUC__ < 8
__v2du __result;
#ifdef __LITTLE_ENDIAN__
__asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
#else
__asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
#endif
return (__m128i)__result;
#else
return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
#endif
}
#endif
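
/* Shifts by an immediate count.  Matching x86 semantics, counts
   outside the element width produce all zeros (or, for the
   arithmetic right shifts, a splat of the sign bit), whereas the
   underlying AltiVec shifts would reduce the count modulo the
   element width.  */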
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16(__m128i __A, int __B) {
__v8hu __lshift;
__v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
if (__B >= 0 && __B < 16) {
if (__builtin_constant_p(__B))
__lshift = (__v8hu)vec_splat_s16(__B);
else
__lshift = vec_splats((unsigned short)__B);
__result = vec_sl((__v8hi)__A, __lshift);
}
return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32(__m128i __A, int __B) {
__v4su __lshift;
__v4si __result = {0, 0, 0, 0};
if (__B >= 0 && __B < 32) {
if (__builtin_constant_p(__B) && __B < 16)
__lshift = (__v4su)vec_splat_s32(__B);
else
__lshift = vec_splats((unsigned int)__B);
__result = vec_sl((__v4si)__A, __lshift);
}
return (__m128i)__result;
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi64(__m128i __A, int __B) {
__v2du __lshift;
__v2di __result = {0, 0};
if (__B >= 0 && __B < 64) {
if (__builtin_constant_p(__B) && __B < 16)
__lshift = (__v2du)vec_splat_s32(__B);
else
__lshift = (__v2du)vec_splats((unsigned int)__B);
__result = vec_sl((__v2di)__A, __lshift);
}
return (__m128i)__result;
}
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi16(__m128i __A, int __B) {
__v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
__v8hi __result;
if (__B < 16) {
if (__builtin_constant_p(__B))
__rshift = (__v8hu)vec_splat_s16(__B);
else
__rshift = vec_splats((unsigned short)__B);
}
__result = vec_sra((__v8hi)__A, __rshift);
return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi32(__m128i __A, int __B) {
__v4su __rshift = {31, 31, 31, 31};
__v4si __result;
if (__B < 32) {
if (__builtin_constant_p(__B)) {
if (__B < 16)
__rshift = (__v4su)vec_splat_s32(__B);
else
__rshift = (__v4su)vec_splats((unsigned int)__B);
} else
__rshift = vec_splats((unsigned int)__B);
}
__result = vec_sra((__v4si)__A, __rshift);
return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bslli_si128(__m128i __A, const int __N) {
__v16qu __result;
const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
if (__N < 16)
__result = vec_sld((__v16qu)__A, __zeros, __N);
else
__result = __zeros;
return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bsrli_si128(__m128i __A, const int __N) {
__v16qu __result;
const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
if (__N < 16)
#ifdef __LITTLE_ENDIAN__
if (__builtin_constant_p(__N))
__result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
else
#endif
{
__v16qu __shift = vec_splats((unsigned char)(__N * 8));
#ifdef __LITTLE_ENDIAN__
__result = vec_sro((__v16qu)__A, __shift);
#else
__result = vec_slo((__v16qu)__A, __shift);
#endif
}
else
__result = __zeros;
return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128(__m128i __A, const int __N) {
return _mm_bsrli_si128(__A, __N);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128(__m128i __A, const int _imm5) {
__v16qu __result;
const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
if (_imm5 < 16)
#ifdef __LITTLE_ENDIAN__
__result = vec_sld((__v16qu)__A, __zeros, _imm5);
#else
__result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
#endif
else
__result = __zeros;
return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16(__m128i __A, int __B) {
__v8hu __rshift;
__v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
if (__B < 16) {
if (__builtin_constant_p(__B))
__rshift = (__v8hu)vec_splat_s16(__B);
else
__rshift = vec_splats((unsigned short)__B);
__result = vec_sr((__v8hi)__A, __rshift);
}
return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi32(__m128i __A, int __B) {
__v4su __rshift;
__v4si __result = {0, 0, 0, 0};
if (__B < 32) {
if (__builtin_constant_p(__B)) {
if (__B < 16)
__rshift = (__v4su)vec_splat_s32(__B);
else
__rshift = (__v4su)vec_splats((unsigned int)__B);
} else
__rshift = vec_splats((unsigned int)__B);
__result = vec_sr((__v4si)__A, __rshift);
}
return (__m128i)__result;
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi64(__m128i __A, int __B) {
__v2du __rshift;
__v2di __result = {0, 0};
if (__B < 64) {
if (__builtin_constant_p(__B)) {
if (__B < 16)
__rshift = (__v2du)vec_splat_s32(__B);
else
__rshift = (__v2du)vec_splats((unsigned long long)__B);
} else
__rshift = (__v2du)vec_splats((unsigned int)__B);
__result = vec_sr((__v2di)__A, __rshift);
}
return (__m128i)__result;
}
#endif
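
/* Shifts by a count taken from the low doubleword of __B; the
   vec_splat index differs by endianness.  Out-of-range counts are
   zeroed through __shmask (logical shifts) or clamped with vec_min
   (arithmetic shifts), again matching x86.  */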
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi16(__m128i __A, __m128i __B) {
__v8hu __lshift;
__vector __bool short __shmask;
const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
__v8hu __result;
#ifdef __LITTLE_ENDIAN__
__lshift = vec_splat((__v8hu)__B, 0);
#else
__lshift = vec_splat((__v8hu)__B, 3);
#endif
__shmask = vec_cmple(__lshift, __shmax);
__result = vec_sl((__v8hu)__A, __lshift);
__result = vec_sel((__v8hu)__shmask, __result, __shmask);
return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi32(__m128i __A, __m128i __B) {
__v4su __lshift;
__vector __bool int __shmask;
const __v4su __shmax = {32, 32, 32, 32};
__v4su __result;
#ifdef __LITTLE_ENDIAN__
__lshift = vec_splat((__v4su)__B, 0);
#else
__lshift = vec_splat((__v4su)__B, 1);
#endif
__shmask = vec_cmplt(__lshift, __shmax);
__result = vec_sl((__v4su)__A, __lshift);
__result = vec_sel((__v4su)__shmask, __result, __shmask);
return (__m128i)__result;
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64(__m128i __A, __m128i __B) {
__v2du __lshift;
__vector __bool long long __shmask;
const __v2du __shmax = {64, 64};
__v2du __result;
__lshift = vec_splat((__v2du)__B, 0);
__shmask = vec_cmplt(__lshift, __shmax);
__result = vec_sl((__v2du)__A, __lshift);
__result = vec_sel((__v2du)__shmask, __result, __shmask);
return (__m128i)__result;
}
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi16(__m128i __A, __m128i __B) {
const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
__v8hu __rshift;
__v8hi __result;
#ifdef __LITTLE_ENDIAN__
__rshift = vec_splat((__v8hu)__B, 0);
#else
__rshift = vec_splat((__v8hu)__B, 3);
#endif
__rshift = vec_min(__rshift, __rshmax);
__result = vec_sra((__v8hi)__A, __rshift);
return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_epi32(__m128i __A, __m128i __B) {
const __v4su __rshmax = {31, 31, 31, 31};
__v4su __rshift;
__v4si __result;
#ifdef __LITTLE_ENDIAN__
__rshift = vec_splat((__v4su)__B, 0);
#else
__rshift = vec_splat((__v4su)__B, 1);
#endif
__rshift = vec_min(__rshift, __rshmax);
__result = vec_sra((__v4si)__A, __rshift);
return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi16(__m128i __A, __m128i __B) {
__v8hu __rshift;
__vector __bool short __shmask;
const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
__v8hu __result;
#ifdef __LITTLE_ENDIAN__
__rshift = vec_splat((__v8hu)__B, 0);
#else
__rshift = vec_splat((__v8hu)__B, 3);
#endif
__shmask = vec_cmple(__rshift, __shmax);
__result = vec_sr((__v8hu)__A, __rshift);
__result = vec_sel((__v8hu)__shmask, __result, __shmask);
return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi32(__m128i __A, __m128i __B) {
__v4su __rshift;
__vector __bool int __shmask;
const __v4su __shmax = {32, 32, 32, 32};
__v4su __result;
#ifdef __LITTLE_ENDIAN__
__rshift = vec_splat((__v4su)__B, 0);
#else
__rshift = vec_splat((__v4su)__B, 1);
#endif
__shmask = vec_cmplt(__rshift, __shmax);
__result = vec_sr((__v4su)__A, __rshift);
__result = vec_sel((__v4su)__shmask, __result, __shmask);
return (__m128i)__result;
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64(__m128i __A, __m128i __B) {
__v2du __rshift;
__vector __bool long long __shmask;
const __v2du __shmax = {64, 64};
__v2du __result;
__rshift = vec_splat((__v2du)__B, 0);
__shmask = vec_cmplt(__rshift, __shmax);
__result = vec_sr((__v2du)__A, __rshift);
__result = vec_sel((__v2du)__shmask, __result, __shmask);
return (__m128i)__result;
}
#endif
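
/* Bitwise logic.  Note that andnot complements its FIRST operand:
   _mm_andnot_pd(__A, __B) computes (~__A) & __B.  */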
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd(__m128d __A, __m128d __B) {
return (vec_and((__v2df)__A, (__v2df)__B));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd(__m128d __A, __m128d __B) {
return (vec_andc((__v2df)__B, (__v2df)__A));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd(__m128d __A, __m128d __B) {
return (vec_or((__v2df)__A, (__v2df)__B));
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd(__m128d __A, __m128d __B) {
return (vec_xor((__v2df)__A, (__v2df)__B));
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128(__m128i __A, __m128i __B) {
return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128(__m128i __A, __m128i __B) {
return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128(__m128i __A, __m128i __B) {
return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128(__m128i __A, __m128i __B) {
return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8(__m128i __A, __m128i __B) {
return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16(__m128i __A, __m128i __B) {
return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32(__m128i __A, __m128i __B) {
return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8(__m128i __A, __m128i __B) {
return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16(__m128i __A, __m128i __B) {
return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32(__m128i __A, __m128i __B) {
return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8(__m128i __A, __m128i __B) {
return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16(__m128i __A, __m128i __B) {
return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32(__m128i __A, __m128i __B) {
return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16(__m128i const __A, int const __N) {
return (unsigned short)((__v8hi)__A)[__N & 7];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
__v8hi __result = (__v8hi)__A;
__result[(__N & 7)] = __D;
return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16(__m128i __A, __m128i __B) {
return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8(__m128i __A, __m128i __B) {
return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16(__m128i __A, __m128i __B) {
return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8(__m128i __A, __m128i __B) {
return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
}
#ifdef _ARCH_PWR8
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8(__m128i __A) {
#ifdef _ARCH_PWR10
return vec_extractm((__v16qu)__A);
#else
__vector unsigned long long __result;
static const __vector unsigned char __perm_mask = {
0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
__result = ((__vector unsigned long long)vec_vbpermq(
(__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
#ifdef __LITTLE_ENDIAN__
return __result[1];
#else
return __result[0];
#endif
#endif
}
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16(__m128i __A, __m128i __B) {
__v4su __w0, __w1;
__v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
};
__w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
__w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
return (__m128i)vec_perm(__w0, __w1, __xform1);
}
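
/* The 16-bit shuffles build a vec_perm control vector at run time: a
   table lookup fills one doubleword of __pmask with byte-pair
   selectors while its constant other half passes the unshuffled
   doubleword of __A through unchanged.  */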
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16(__m128i __A, const int __mask) {
unsigned long __element_selector_98 = __mask & 0x03;
unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
};
__v2du __pmask =
#ifdef __LITTLE_ENDIAN__
{0x1716151413121110UL, 0UL};
#else
{0x1011121314151617UL, 0UL};
#endif
__m64_union __t;
__v2du __a, __r;
__t.as_short[0] = __permute_selectors[__element_selector_98];
__t.as_short[1] = __permute_selectors[__element_selector_BA];
__t.as_short[2] = __permute_selectors[__element_selector_DC];
__t.as_short[3] = __permute_selectors[__element_selector_FE];
__pmask[1] = __t.as_m64;
__a = (__v2du)__A;
__r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
return (__m128i)__r;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16(__m128i __A, const int __mask) {
unsigned long __element_selector_10 = __mask & 0x03;
unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
0x0100, 0x0302, 0x0504, 0x0706
#else
0x0001, 0x0203, 0x0405, 0x0607
#endif
};
__v2du __pmask =
#ifdef __LITTLE_ENDIAN__
{0UL, 0x1f1e1d1c1b1a1918UL};
#else
{0UL, 0x18191a1b1c1d1e1fUL};
#endif
__m64_union __t;
__v2du __a, __r;
__t.as_short[0] = __permute_selectors[__element_selector_10];
__t.as_short[1] = __permute_selectors[__element_selector_32];
__t.as_short[2] = __permute_selectors[__element_selector_54];
__t.as_short[3] = __permute_selectors[__element_selector_76];
__pmask[0] = __t.as_m64;
__a = (__v2du)__A;
__r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
return (__m128i)__r;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32(__m128i __A, const int __mask) {
unsigned long __element_selector_10 = __mask & 0x03;
unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
static const unsigned int __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
};
__v4su __t;
__t[0] = __permute_selectors[__element_selector_10];
__t[1] = __permute_selectors[__element_selector_32];
__t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
__t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
(__vector unsigned char)__t);
}
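
/* For example, _mm_shuffle_epi32(__A, _MM_SHUFFLE(0, 1, 2, 3))
   reverses the four word elements (_MM_SHUFFLE is provided by the
   xmmintrin.h wrapper included above).  */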
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
__v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
__v16qu __mask, __tmp;
__m128i_u *__p = (__m128i_u *)__C;
__tmp = (__v16qu)_mm_loadu_si128(__p);
__mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
__tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
_mm_storeu_si128(__p, (__m128i)__tmp);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8(__m128i __A, __m128i __B) {
return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16(__m128i __A, __m128i __B) {
return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
}
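
/* _mm_sad_epu8: absolute byte differences (vec_absd on POWER9,
   otherwise max - min), summed first into four word partial sums and
   then into two, leaving the results in elements [0] and [2] as
   PSADBW does.  */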
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8(__m128i __A, __m128i __B) {
__v16qu __a, __b;
__v16qu __vabsdiff;
__v4si __vsum;
const __v4su __zero = {0, 0, 0, 0};
__v4si __result;
__a = (__v16qu)__A;
__b = (__v16qu)__B;
#ifndef _ARCH_PWR9
__v16qu __vmin = vec_min(__a, __b);
__v16qu __vmax = vec_max(__a, __b);
__vabsdiff = vec_sub(__vmax, __vmin);
#else
__vabsdiff = vec_absd(__a, __b);
#endif
__vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
#ifdef __LITTLE_ENDIAN__
__asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
#else
__result = vec_sum2s(__vsum, (__vector signed int)__zero);
__result = vec_sld(__result, __result, 6);
#endif
return (__m128i)__result;
}
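
/* POWER has no non-temporal stores; the stream forms below hint with
   dcbtstt (data cache block touch for store, transient) and then
   perform a normal store.  */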
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32(int *__A, int __B) {
__asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
*__A = __B;
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64(long long int *__A, long long int __B) {
__asm__(" dcbtstt 0,%0" : : "b"(__A) : "memory");
*__A = __B;
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128(__m128i *__A, __m128i __B) {
__asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
*__A = __B;
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd(double *__A, __m128d __B) {
__asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
*(__m128d *)__A = __B;
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush(void const *__A) {
__asm__("dcbf 0,%0" : : "b"(__A) : "memory");
}
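
/* The x86 fences are approximated with atomic fences: a release
   fence (lwsync) for _mm_lfence and a full sequentially consistent
   fence (hwsync) for _mm_mfence.  */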
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence(void) {
__atomic_thread_fence(__ATOMIC_RELEASE);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence(void) {
__atomic_thread_fence(__ATOMIC_SEQ_CST);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128(int __A) {
return _mm_set_epi32(0, 0, 0, __A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128(long long __A) {
return __extension__(__m128i)(__v2di){__A, 0LL};
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128(long long __A) {
return __extension__(__m128i)(__v2di){__A, 0LL};
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A) {
return (__m128)__A;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A) {
return (__m128i)__A;
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A) {
return (__m128d)__A;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A) {
return (__m128i)__A;
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A) {
return (__m128)__A;
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A) {
return (__m128d)__A;
}
#else
#include_next <emmintrin.h>
#endif /* defined(__ppc64__) && (Linux, FreeBSD, or AIX) */

#endif /* EMMINTRIN_H_ */