#ifndef NO_WARN_X86_INTRINSICS
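/* This header is intended to help porting x86_64 code that uses Intel
   SSE intrinsics to powerpc64/powerpc64le. Each intrinsic below is
   re-implemented in terms of VMX/VSX operations and may not be the most
   efficient mapping on POWER; new code should prefer GCC/Clang vector
   extensions or native AltiVec intrinsics. Define NO_WARN_X86_INTRINSICS
   to acknowledge this and suppress the error below.  */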
#error \
"Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
#ifndef XMMINTRIN_H_
#define XMMINTRIN_H_
#if defined(__ppc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#define _MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
#include <altivec.h>
#if defined(__STRICT_ANSI__) && \
(defined(__cplusplus) || \
(defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L))
#undef vector
#undef pixel
#undef bool
#endif
#include <mmintrin.h>
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif
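/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */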
typedef vector float __m128 __attribute__((__may_alias__));
typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
typedef vector float __v4sf;
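/* Return a vector with undefined contents. The self-assignment below is a
   common idiom that keeps compilers from warning about use of an
   uninitialized variable.  */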
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_ps(void) {
__m128 __Y = __Y;
return __Y;
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps(void) {
return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f};
}
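/* Load four SPFP values from P. _mm_load_ps requires a 16-byte aligned
   address, _mm_loadu_ps does not, and _mm_loadr_ps loads the four aligned
   elements in reverse order.  */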
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps(float const *__P) {
return ((__m128)vec_ld(0, (__v4sf *)__P));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps(float const *__P) {
return (vec_vsx_ld(0, __P));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps(float const *__P) {
__v4sf __tmp;
__m128 __result;
static const __vector unsigned char __permute_vector = {
0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
__tmp = vec_ld(0, (__v4sf *)__P);
__result = (__m128)vec_perm(__tmp, __tmp, __permute_vector);
return __result;
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps(float __F) {
return __extension__(__m128)(__v4sf){__F, __F, __F, __F};
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1(float __F) {
return _mm_set1_ps(__F);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) {
return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z};
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps(float __Z, float __Y, float __X, float __W) {
return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W};
}
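/* Store four SPFP values to P. _mm_store_ps requires a 16-byte aligned
   address, _mm_storeu_ps does not, _mm_storer_ps stores in reverse order,
   and _mm_store1_ps replicates element 0 into all four slots.  */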
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps(float *__P, __m128 __A) {
vec_st((__v4sf)__A, 0, (__v4sf *)__P);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps(float *__P, __m128 __A) {
*(__m128_u *)__P = __A;
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps(float *__P, __m128 __A) {
__v4sf __tmp;
static const __vector unsigned char __permute_vector = {
0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
__tmp = (__m128)vec_perm(__A, __A, __permute_vector);
_mm_store_ps(__P, __tmp);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps(float *__P, __m128 __A) {
__v4sf __va = vec_splat((__v4sf)__A, 0);
_mm_store_ps(__P, __va);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1(float *__P, __m128 __A) {
_mm_store1_ps(__P, __A);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss(float __F) {
return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f};
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss(__m128 __A, __m128 __B) {
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
return (vec_sel((__v4sf)__A, (__v4sf)__B, __mask));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss(float const *__P) {
return _mm_set_ss(*__P);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss(float *__P, __m128 __A) {
*__P = ((__v4sf)__A)[0];
}
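/* Perform the respective operation on the lower SPFP values of A and B;
   the upper three SPFP values are passed through from A. PowerISA VSX has
   no partial (scalar-only) vector operations, so on POWER7 and later the
   lower element is splatted across a temporary vector first, ensuring the
   unused upper elements cannot raise spurious exceptions.  */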
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss(__m128 __A, __m128 __B) {
#ifdef _ARCH_PWR7
__m128 __a, __b, __c;
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__a = vec_splat(__A, 0);
__b = vec_splat(__B, 0);
__c = __a + __b;
return (vec_sel(__A, __c, __mask));
#else
__A[0] = __A[0] + __B[0];
return (__A);
#endif
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss(__m128 __A, __m128 __B) {
#ifdef _ARCH_PWR7
__m128 __a, __b, __c;
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__a = vec_splat(__A, 0);
__b = vec_splat(__B, 0);
__c = __a - __b;
return (vec_sel(__A, __c, __mask));
#else
__A[0] = __A[0] - __B[0];
return (__A);
#endif
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss(__m128 __A, __m128 __B) {
#ifdef _ARCH_PWR7
__m128 __a, __b, __c;
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__a = vec_splat(__A, 0);
__b = vec_splat(__B, 0);
__c = __a * __b;
return (vec_sel(__A, __c, __mask));
#else
__A[0] = __A[0] * __B[0];
return (__A);
#endif
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss(__m128 __A, __m128 __B) {
#ifdef _ARCH_PWR7
__m128 __a, __b, __c;
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__a = vec_splat(__A, 0);
__b = vec_splat(__B, 0);
__c = __a / __b;
return (vec_sel(__A, __c, __mask));
#else
__A[0] = __A[0] / __B[0];
return (__A);
#endif
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss(__m128 __A) {
__m128 __a, __c;
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__a = vec_splat(__A, 0);
__c = vec_sqrt(__a);
return (vec_sel(__A, __c, __mask));
}
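/* Perform the respective operation on all four SPFP values in A and B.  */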
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps(__m128 __A, __m128 __B) {
return (__m128)((__v4sf)__A + (__v4sf)__B);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps(__m128 __A, __m128 __B) {
return (__m128)((__v4sf)__A - (__v4sf)__B);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps(__m128 __A, __m128 __B) {
return (__m128)((__v4sf)__A * (__v4sf)__B);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps(__m128 __A, __m128 __B) {
return (__m128)((__v4sf)__A / (__v4sf)__B);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps(__m128 __A) {
return (vec_sqrt((__v4sf)__A));
}
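/* vec_re and vec_rsqrte, like the x86 rcpps/rsqrtps instructions they
   stand in for, return low-precision reciprocal and reciprocal square
   root estimates, not correctly rounded results.  */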
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps(__m128 __A) {
return (vec_re((__v4sf)__A));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps(__m128 __A) {
return (vec_rsqrte(__A));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss(__m128 __A) {
__m128 __a, __c;
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__a = vec_splat(__A, 0);
__c = _mm_rcp_ps(__a);
return (vec_sel(__A, __c, __mask));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss(__m128 __A) {
__m128 __a, __c;
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__a = vec_splat(__A, 0);
__c = vec_rsqrte(__a);
return (vec_sel(__A, __c, __mask));
}
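/* Scalar (element 0, with the upper elements passed through from A) and
   packed minimum and maximum.  */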
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss(__m128 __A, __m128 __B) {
__v4sf __a, __b, __c;
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__a = vec_splat((__v4sf)__A, 0);
__b = vec_splat((__v4sf)__B, 0);
__c = vec_min(__a, __b);
return (vec_sel((__v4sf)__A, __c, __mask));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss(__m128 __A, __m128 __B) {
__v4sf __a, __b, __c;
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__a = vec_splat(__A, 0);
__b = vec_splat(__B, 0);
__c = vec_max(__a, __b);
return (vec_sel((__v4sf)__A, __c, __mask));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps(__m128 __A, __m128 __B) {
__vector __bool int __m = vec_cmpgt((__v4sf)__B, (__v4sf)__A);
return vec_sel(__B, __A, __m);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps(__m128 __A, __m128 __B) {
__vector __bool int __m = vec_cmpgt((__v4sf)__A, (__v4sf)__B);
return vec_sel(__B, __A, __m);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps(__m128 __A, __m128 __B) {
return ((__m128)vec_and((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps(__m128 __A, __m128 __B) {
return ((__m128)vec_andc((__v4sf)__B, (__v4sf)__A));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps(__m128 __A, __m128 __B) {
return ((__m128)vec_or((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps(__m128 __A, __m128 __B) {
return ((__m128)vec_xor((__v4sf)__A, (__v4sf)__B));
}
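/* Perform the respective comparison on all four SPFP values of A and B,
   producing per-element masks of all ones (true) or all zeros (false).  */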
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps(__m128 __A, __m128 __B) {
return ((__m128)vec_cmpeq((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps(__m128 __A, __m128 __B) {
return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps(__m128 __A, __m128 __B) {
return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps(__m128 __A, __m128 __B) {
return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps(__m128 __A, __m128 __B) {
return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps(__m128 __A, __m128 __B) {
__v4sf __temp = (__v4sf)vec_cmpeq((__v4sf)__A, (__v4sf)__B);
return ((__m128)vec_nor(__temp, __temp));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps(__m128 __A, __m128 __B) {
return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps(__m128 __A, __m128 __B) {
return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps(__m128 __A, __m128 __B) {
return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps(__m128 __A, __m128 __B) {
return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
}
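/* Ordered/unordered tests: a float is detected as a NaN by comparing the
   absolute value of its bit pattern against the exponent mask
   0x7f800000.  */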
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps(__m128 __A, __m128 __B) {
__vector unsigned int __a, __b;
__vector unsigned int __c, __d;
static const __vector unsigned int __float_exp_mask = {
0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
__a = (__vector unsigned int)vec_abs((__v4sf)__A);
__b = (__vector unsigned int)vec_abs((__v4sf)__B);
__c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
__d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
return ((__m128)vec_and(__c, __d));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps(__m128 __A, __m128 __B) {
__vector unsigned int __a, __b;
__vector unsigned int __c, __d;
static const __vector unsigned int __float_exp_mask = {
0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
__a = (__vector unsigned int)vec_abs((__v4sf)__A);
__b = (__vector unsigned int)vec_abs((__v4sf)__B);
__c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
__d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
return ((__m128)vec_or(__c, __d));
}
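/* Scalar forms of the comparisons above: compare the lower SPFP values of
   A and B and place the mask in element 0; the upper three elements are
   passed through from A.  */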
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss(__m128 __A, __m128 __B) {
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__v4sf __a, __b, __c;
__a = vec_splat((__v4sf)__A, 0);
__b = vec_splat((__v4sf)__B, 0);
__c = (__v4sf)vec_cmpeq(__a, __b);
return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss(__m128 __A, __m128 __B) {
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__v4sf __a, __b, __c;
__a = vec_splat((__v4sf)__A, 0);
__b = vec_splat((__v4sf)__B, 0);
__c = (__v4sf)vec_cmplt(__a, __b);
return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss(__m128 __A, __m128 __B) {
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__v4sf __a, __b, __c;
__a = vec_splat((__v4sf)__A, 0);
__b = vec_splat((__v4sf)__B, 0);
__c = (__v4sf)vec_cmple(__a, __b);
return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss(__m128 __A, __m128 __B) {
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__v4sf __a, __b, __c;
__a = vec_splat((__v4sf)__A, 0);
__b = vec_splat((__v4sf)__B, 0);
__c = (__v4sf)vec_cmpgt(__a, __b);
return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss(__m128 __A, __m128 __B) {
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__v4sf __a, __b, __c;
__a = vec_splat((__v4sf)__A, 0);
__b = vec_splat((__v4sf)__B, 0);
__c = (__v4sf)vec_cmpge(__a, __b);
return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss(__m128 __A, __m128 __B) {
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__v4sf __a, __b, __c;
__a = vec_splat((__v4sf)__A, 0);
__b = vec_splat((__v4sf)__B, 0);
__c = (__v4sf)vec_cmpeq(__a, __b);
__c = vec_nor(__c, __c);
return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss(__m128 __A, __m128 __B) {
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__v4sf __a, __b, __c;
__a = vec_splat((__v4sf)__A, 0);
__b = vec_splat((__v4sf)__B, 0);
__c = (__v4sf)vec_cmpge(__a, __b);
return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss(__m128 __A, __m128 __B) {
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__v4sf __a, __b, __c;
__a = vec_splat((__v4sf)__A, 0);
__b = vec_splat((__v4sf)__B, 0);
__c = (__v4sf)vec_cmpgt(__a, __b);
return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss(__m128 __A, __m128 __B) {
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__v4sf __a, __b, __c;
__a = vec_splat((__v4sf)__A, 0);
__b = vec_splat((__v4sf)__B, 0);
__c = (__v4sf)vec_cmple(__a, __b);
return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss(__m128 __A, __m128 __B) {
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__v4sf __a, __b, __c;
__a = vec_splat((__v4sf)__A, 0);
__b = vec_splat((__v4sf)__B, 0);
__c = (__v4sf)vec_cmplt(__a, __b);
return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss(__m128 __A, __m128 __B) {
__vector unsigned int __a, __b;
__vector unsigned int __c, __d;
static const __vector unsigned int __float_exp_mask = {
0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__a = (__vector unsigned int)vec_abs((__v4sf)__A);
__b = (__vector unsigned int)vec_abs((__v4sf)__B);
__c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
__d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
__c = vec_and(__c, __d);
return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss(__m128 __A, __m128 __B) {
__vector unsigned int __a, __b;
__vector unsigned int __c, __d;
static const __vector unsigned int __float_exp_mask = {
0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
__a = (__vector unsigned int)vec_abs((__v4sf)__A);
__b = (__vector unsigned int)vec_abs((__v4sf)__B);
__c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
__d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
__c = vec_or(__c, __d);
return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
}
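/* Compare the lower SPFP values of A and B and return 1 if true, 0 if
   false. On this port the _mm_comi* and _mm_ucomi* variants are
   identical: both compile to plain C scalar comparisons of element 0.  */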
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss(__m128 __A, __m128 __B) {
return (__A[0] == __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss(__m128 __A, __m128 __B) {
return (__A[0] < __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss(__m128 __A, __m128 __B) {
return (__A[0] <= __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss(__m128 __A, __m128 __B) {
return (__A[0] > __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss(__m128 __A, __m128 __B) {
return (__A[0] >= __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss(__m128 __A, __m128 __B) {
return (__A[0] != __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss(__m128 __A, __m128 __B) {
return (__A[0] == __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss(__m128 __A, __m128 __B) {
return (__A[0] < __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss(__m128 __A, __m128 __B) {
return (__A[0] <= __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss(__m128 __A, __m128 __B) {
return (__A[0] > __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss(__m128 __A, __m128 __B) {
return (__A[0] >= __B[0]);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss(__m128 __A, __m128 __B) {
return (__A[0] != __B[0]);
}
extern __inline float
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32(__m128 __A) {
return ((__v4sf)__A)[0];
}
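/* Convert the lower SPFP value to a 32-bit integer according to the
   current rounding mode (via xscvspdp/fctiw on POWER8, __builtin_rint
   otherwise).  */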
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32(__m128 __A) {
int __res;
#ifdef _ARCH_PWR8
double __dtmp;
__asm__(
#ifdef __LITTLE_ENDIAN__
"xxsldwi %x0,%x0,%x0,3;\n"
#endif
"xscvspdp %x2,%x0;\n"
"fctiw %2,%2;\n"
"mfvsrd %1,%x2;\n"
: "+wa"(__A), "=r"(__res), "=f"(__dtmp)
:);
#else
__res = __builtin_rint(__A[0]);
#endif
return __res;
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si(__m128 __A) {
return _mm_cvtss_si32(__A);
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64(__m128 __A) {
long long __res;
#if defined(_ARCH_PWR8) && defined(__powerpc64__)
double __dtmp;
__asm__(
#ifdef __LITTLE_ENDIAN__
"xxsldwi %x0,%x0,%x0,3;\n"
#endif
"xscvspdp %x2,%x0;\n"
"fctid %2,%2;\n"
"mfvsrd %1,%x2;\n"
: "+wa"(__A), "=r"(__res), "=f"(__dtmp)
:);
#else
__res = __builtin_llrint(__A[0]);
#endif
return __res;
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x(__m128 __A) {
return _mm_cvtss_si64((__v4sf)__A);
}
enum _mm_hint {
_MM_HINT_ET0 = 7,
_MM_HINT_ET1 = 6,
_MM_HINT_T0 = 3,
_MM_HINT_T1 = 2,
_MM_HINT_T2 = 1,
_MM_HINT_NTA = 0
};
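/* Load one cache line of data from address P closer to the processor.
   The PowerPC implementation ignores the hint parameter.  */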
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch(const void *__P, enum _mm_hint __I) {
__builtin_prefetch(__P);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32(__m128 __A) {
__v4sf __temp, __rounded;
__vector unsigned long long __result;
__temp = (__v4sf)vec_splat((__vector long long)__A, 0);
__rounded = vec_rint(__temp);
__result = (__vector unsigned long long)vec_cts(__rounded, 0);
return (__m64)((__vector long long)__result)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi(__m128 __A) {
return _mm_cvtps_pi32(__A);
}
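/* Truncate the lower SPFP value to a 32-bit integer; the C float-to-int
   conversion truncates toward zero, matching cvtt semantics.  */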
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32(__m128 __A) {
float __temp = __A[0];
return __temp;
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si(__m128 __A) {
return _mm_cvttss_si32(__A);
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64(__m128 __A) {
float __temp = __A[0];
return __temp;
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x(__m128 __A) {
float __temp = __A[0];
return __temp;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32(__m128 __A) {
__v4sf __temp;
__vector unsigned long long __result;
__temp = (__v4sf)vec_splat((__vector long long)__A, 0);
__result = (__vector unsigned long long)vec_cts(__temp, 0);
return (__m64)((__vector long long)__result)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi(__m128 __A) {
return _mm_cvttps_pi32(__A);
}
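/* Convert the integer B to SPFP and insert it as the lower element of A,
   passing the upper three elements through.  */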
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss(__m128 __A, int __B) {
float __temp = __B;
__A[0] = __temp;
return __A;
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss(__m128 __A, int __B) {
return _mm_cvtsi32_ss(__A, __B);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss(__m128 __A, long long __B) {
float __temp = __B;
__A[0] = __temp;
return __A;
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss(__m128 __A, long long __B) {
return _mm_cvtsi64_ss(__A, __B);
}
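/* Convert packed integers (8-, 16-, or 32-bit, signed or unsigned) from
   __m64 operands to SPFP values, widening as needed before vec_ctf.  */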
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps(__m128 __A, __m64 __B) {
__vector signed int __vm1;
__vector float __vf1;
__vm1 = (__vector signed int)(__vector unsigned long long){__B, __B};
__vf1 = (__vector float)vec_ctf(__vm1, 0);
return ((__m128)(__vector unsigned long long){
((__vector unsigned long long)__vf1)[0],
((__vector unsigned long long)__A)[1]});
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps(__m128 __A, __m64 __B) {
return _mm_cvtpi32_ps(__A, __B);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps(__m64 __A) {
__vector signed short __vs8;
__vector signed int __vi4;
__vector float __vf1;
__vs8 = (__vector signed short)(__vector unsigned long long){__A, __A};
__vi4 = vec_vupklsh(__vs8);
__vf1 = (__vector float)vec_ctf(__vi4, 0);
return (__m128)__vf1;
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps(__m64 __A) {
const __vector unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0};
__vector unsigned short __vs8;
__vector unsigned int __vi4;
__vector float __vf1;
__vs8 = (__vector unsigned short)(__vector unsigned long long){__A, __A};
#ifdef __LITTLE_ENDIAN__
__vi4 = (__vector unsigned int)vec_mergel(__vs8, __zero);
#else
__vi4 = (__vector unsigned int)vec_mergel(__zero, __vs8);
#endif
__vf1 = (__vector float)vec_ctf(__vi4, 0);
return (__m128)__vf1;
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps(__m64 __A) {
__vector signed char __vc16;
__vector signed short __vs8;
__vector signed int __vi4;
__vector float __vf1;
__vc16 = (__vector signed char)(__vector unsigned long long){__A, __A};
__vs8 = vec_vupkhsb(__vc16);
__vi4 = vec_vupkhsh(__vs8);
__vf1 = (__vector float)vec_ctf(__vi4, 0);
return (__m128)__vf1;
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps(__m64 __A) {
const __vector unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0};
__vector unsigned char __vc16;
__vector unsigned short __vs8;
__vector unsigned int __vi4;
__vector float __vf1;
__vc16 = (__vector unsigned char)(__vector unsigned long long){__A, __A};
#ifdef __LITTLE_ENDIAN__
__vs8 = (__vector unsigned short)vec_mergel(__vc16, __zero);
__vi4 =
(__vector unsigned int)vec_mergeh(__vs8, (__vector unsigned short)__zero);
#else
__vs8 = (__vector unsigned short)vec_mergel(__zero, __vc16);
__vi4 =
(__vector unsigned int)vec_mergeh((__vector unsigned short)__zero, __vs8);
#endif
__vf1 = (__vector float)vec_ctf(__vi4, 0);
return (__m128)__vf1;
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B) {
__vector signed int __vi4;
__vector float __vf4;
__vi4 = (__vector signed int)(__vector unsigned long long){__A, __B};
__vf4 = (__vector float)vec_ctf(__vi4, 0);
return (__m128)__vf4;
}
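/* Round the SPFP values of A to nearest, convert to 32-bit integers, and
   pack them down to 16-bit (pi16) or 8-bit (pi8) elements. Note that
   vec_pack truncates rather than saturates.  */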
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16(__m128 __A) {
__v4sf __rounded;
__vector signed int __temp;
__vector unsigned long long __result;
__rounded = vec_rint(__A);
__temp = vec_cts(__rounded, 0);
__result = (__vector unsigned long long)vec_pack(__temp, __temp);
return (__m64)((__vector long long)__result)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8(__m128 __A) {
__v4sf __rounded;
__vector signed int __tmp_i;
static const __vector signed int __zero = {0, 0, 0, 0};
__vector signed short __tmp_s;
__vector signed char __res_v;
__rounded = vec_rint(__A);
__tmp_i = vec_cts(__rounded, 0);
__tmp_s = vec_pack(__tmp_i, __zero);
__res_v = vec_pack(__tmp_s, __tmp_s);
return (__m64)((__vector long long)__res_v)[0];
}
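/* Select four specific SPFP values from A and B based on MASK: the two
   low result elements come from A and the two high elements from B, each
   chosen by a 2-bit selector. The selectors are expanded into a vec_perm
   control vector; the 0x10101010 bias redirects the upper two lanes into
   the second source operand.  */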
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) {
unsigned long __element_selector_10 = __mask & 0x03;
unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
static const unsigned int __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
};
__vector unsigned int __t;
__t[0] = __permute_selectors[__element_selector_10];
__t[1] = __permute_selectors[__element_selector_32];
__t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
__t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector unsigned char)__t);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps(__m128 __A, __m128 __B) {
return (__m128)vec_vmrglw((__v4sf)__A, (__v4sf)__B);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps(__m128 __A, __m128 __B) {
return (__m128)vec_vmrghw((__v4sf)__A, (__v4sf)__B);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi(__m128 __A, __m64 const *__P) {
__vector unsigned long long __a = (__vector unsigned long long)__A;
__vector unsigned long long __p = vec_splats(*__P);
__a[1] = __p[1];
return (__m128)__a;
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi(__m64 *__P, __m128 __A) {
__vector unsigned long long __a = (__vector unsigned long long)__A;
*__P = __a[1];
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps(__m128 __A, __m128 __B) {
return (__m128)vec_mergel((__vector unsigned long long)__B,
(__vector unsigned long long)__A);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps(__m128 __A, __m128 __B) {
return (__m128)vec_mergeh((__vector unsigned long long)__A,
(__vector unsigned long long)__B);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi(__m128 __A, __m64 const *__P) {
__vector unsigned long long __a = (__vector unsigned long long)__A;
__vector unsigned long long __p = vec_splats(*__P);
__a[0] = __p[0];
return (__m128)__a;
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi(__m64 *__P, __m128 __A) {
__vector unsigned long long __a = (__vector unsigned long long)__A;
*__P = __a[0];
}
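/* Create a 4-bit mask from the most significant bits of the four SPFP
   values, using vec_extractm on POWER10 and a vec_vbpermq bit-gather on
   POWER8/9.  */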
#ifdef _ARCH_PWR8
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps(__m128 __A) {
#ifdef _ARCH_PWR10
return vec_extractm((__vector unsigned int)__A);
#else
__vector unsigned long long __result;
static const __vector unsigned int __perm_mask = {
#ifdef __LITTLE_ENDIAN__
0x00204060, 0x80808080, 0x80808080, 0x80808080
#else
0x80808080, 0x80808080, 0x80808080, 0x00204060
#endif
};
__result = ((__vector unsigned long long)vec_vbpermq(
(__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
#ifdef __LITTLE_ENDIAN__
return __result[1];
#else
return __result[0];
#endif
#endif
}
#endif
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps(float const *__P) {
return _mm_set1_ps(*__P);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1(float const *__P) {
return _mm_load1_ps(__P);
}
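/* Extract (zero-extended) or insert one of the four 16-bit fields of an
   __m64, selected by the low two bits of N.  */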
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16(__m64 const __A, int const __N) {
unsigned int __shiftr = __N & 3;
#ifdef __BIG_ENDIAN__
__shiftr = 3 - __shiftr;
#endif
return ((__A >> (__shiftr * 16)) & 0xffff);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw(__m64 const __A, int const __N) {
return _mm_extract_pi16(__A, __N);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16(__m64 const __A, int const __D, int const __N) {
const int __shiftl = (__N & 3) * 16;
const __m64 __shiftD = (const __m64)__D << __shiftl;
const __m64 __mask = 0xffffUL << __shiftl;
__m64 __result = (__A & (~__mask)) | (__shiftD & __mask);
return __result;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw(__m64 const __A, int const __D, int const __N) {
return _mm_insert_pi16(__A, __D, __N);
}
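/* Element-wise signed 16-bit and unsigned 8-bit minimum and maximum. On
   POWER8 the __m64 operands are splatted into vector registers for
   vec_cmpgt/vec_cmplt and vec_sel; otherwise the elements are compared
   one at a time through __m64_union.  */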
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16(__m64 __A, __m64 __B) {
#if _ARCH_PWR8
__vector signed short __a, __b, __r;
__vector __bool short __c;
__a = (__vector signed short)vec_splats(__A);
__b = (__vector signed short)vec_splats(__B);
__c = (__vector __bool short)vec_cmpgt(__a, __b);
__r = vec_sel(__b, __a, __c);
return (__m64)((__vector long long)__r)[0];
#else
__m64_union __m1, __m2, __res;
__m1.as_m64 = __A;
__m2.as_m64 = __B;
__res.as_short[0] = (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0]
: __m2.as_short[0];
__res.as_short[1] = (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1]
: __m2.as_short[1];
__res.as_short[2] = (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2]
: __m2.as_short[2];
__res.as_short[3] = (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3]
: __m2.as_short[3];
return (__m64)__res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw(__m64 __A, __m64 __B) {
return _mm_max_pi16(__A, __B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8(__m64 __A, __m64 __B) {
#if _ARCH_PWR8
__vector unsigned char __a, __b, __r;
__vector __bool char __c;
__a = (__vector unsigned char)vec_splats(__A);
__b = (__vector unsigned char)vec_splats(__B);
__c = (__vector __bool char)vec_cmpgt(__a, __b);
__r = vec_sel(__b, __a, __c);
return (__m64)((__vector long long)__r)[0];
#else
__m64_union __m1, __m2, __res;
long __i;
__m1.as_m64 = __A;
__m2.as_m64 = __B;
for (__i = 0; __i < 8; __i++)
__res.as_char[__i] =
((unsigned char)__m1.as_char[__i] > (unsigned char)__m2.as_char[__i])
? __m1.as_char[__i]
: __m2.as_char[__i];
return (__m64)__res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub(__m64 __A, __m64 __B) {
return _mm_max_pu8(__A, __B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16(__m64 __A, __m64 __B) {
#if _ARCH_PWR8
__vector signed short __a, __b, __r;
__vector __bool short __c;
__a = (__vector signed short)vec_splats(__A);
__b = (__vector signed short)vec_splats(__B);
__c = (__vector __bool short)vec_cmplt(__a, __b);
__r = vec_sel(__b, __a, __c);
return (__m64)((__vector long long)__r)[0];
#else
__m64_union __m1, __m2, __res;
__m1.as_m64 = __A;
__m2.as_m64 = __B;
__res.as_short[0] = (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0]
: __m2.as_short[0];
__res.as_short[1] = (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1]
: __m2.as_short[1];
__res.as_short[2] = (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2]
: __m2.as_short[2];
__res.as_short[3] = (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3]
: __m2.as_short[3];
return (__m64)__res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw(__m64 __A, __m64 __B) {
return _mm_min_pi16(__A, __B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8(__m64 __A, __m64 __B) {
#if _ARCH_PWR8
__vector unsigned char __a, __b, __r;
__vector __bool char __c;
__a = (__vector unsigned char)vec_splats(__A);
__b = (__vector unsigned char)vec_splats(__B);
__c = (__vector __bool char)vec_cmplt(__a, __b);
__r = vec_sel(__b, __a, __c);
return (__m64)((__vector long long)__r)[0];
#else
__m64_union __m1, __m2, __res;
long __i;
__m1.as_m64 = __A;
__m2.as_m64 = __B;
for (__i = 0; __i < 8; __i++)
__res.as_char[__i] =
((unsigned char)__m1.as_char[__i] < (unsigned char)__m2.as_char[__i])
? __m1.as_char[__i]
: __m2.as_char[__i];
return (__m64)__res.as_m64;
#endif
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub(__m64 __A, __m64 __B) {
return _mm_min_pu8(__A, __B);
}
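/* Create an 8-bit mask from the most significant bits of the 8-bit
   elements in A, gathered with the bpermd bit-permute instruction.  */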
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8(__m64 __A) {
#ifdef __powerpc64__
unsigned long long __p =
#ifdef __LITTLE_ENDIAN__
0x0008101820283038UL;
#else
0x3830282018100800UL;
#endif
return __builtin_bpermd(__p, __A);
#else
#ifdef __LITTLE_ENDIAN__
unsigned int __mask = 0x20283038UL;
unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf;
unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
#else
unsigned int __mask = 0x38302820UL;
unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf;
#endif
return (__r2 << 4) | __r1;
#endif
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb(__m64 __A) {
return _mm_movemask_pi8(__A);
}
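/* Multiply the unsigned 16-bit elements of A and B and keep the high 16
   bits of each 32-bit product, merging the even and odd products back
   together with vec_perm.  */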
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16(__m64 __A, __m64 __B) {
__vector unsigned short __a, __b;
__vector unsigned short __c;
__vector unsigned int __w0, __w1;
__vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
};
__a = (__vector unsigned short)vec_splats(__A);
__b = (__vector unsigned short)vec_splats(__B);
__w0 = vec_vmuleuh(__a, __b);
__w1 = vec_vmulouh(__a, __b);
__c = (__vector unsigned short)vec_perm(__w0, __w1, __xform1);
return (__m64)((__vector long long)__c)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw(__m64 __A, __m64 __B) {
return _mm_mulhi_pu16(__A, __B);
}
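/* Select four 16-bit fields from A, each chosen by a 2-bit field of N;
   the per-element byte selectors are assembled into a vec_perm control
   vector.  */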
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16(__m64 __A, int const __N) {
unsigned long __element_selector_10 = __N & 0x03;
unsigned long __element_selector_32 = (__N >> 2) & 0x03;
unsigned long __element_selector_54 = (__N >> 4) & 0x03;
unsigned long __element_selector_76 = (__N >> 6) & 0x03;
static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
0x0607, 0x0405, 0x0203, 0x0001
#endif
};
__m64_union __t;
__vector unsigned long long __a, __p, __r;
#ifdef __LITTLE_ENDIAN__
__t.as_short[0] = __permute_selectors[__element_selector_10];
__t.as_short[1] = __permute_selectors[__element_selector_32];
__t.as_short[2] = __permute_selectors[__element_selector_54];
__t.as_short[3] = __permute_selectors[__element_selector_76];
#else
__t.as_short[3] = __permute_selectors[__element_selector_10];
__t.as_short[2] = __permute_selectors[__element_selector_32];
__t.as_short[1] = __permute_selectors[__element_selector_54];
__t.as_short[0] = __permute_selectors[__element_selector_76];
#endif
__p = vec_splats(__t.as_m64);
__a = vec_splats(__A);
__r = vec_perm(__a, __a, (__vector unsigned char)__p);
return (__m64)((__vector long long)__r)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw(__m64 __A, int const __N) {
return _mm_shuffle_pi16(__A, __N);
}
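/* Conditionally store byte elements of A into P: a byte is written only
   where the corresponding byte of N has its most significant bit set.
   This is implemented as a read-modify-write of the 8-byte destination,
   not as a non-temporal store.  */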
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) {
__m64 __hibit = 0x8080808080808080UL;
__m64 __mask, __tmp;
__m64 *__p = (__m64 *)__P;
__tmp = *__p;
__mask = _mm_cmpeq_pi8((__N & __hibit), __hibit);
__tmp = (__tmp & (~__mask)) | (__A & __mask);
*__p = __tmp;
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq(__m64 __A, __m64 __N, char *__P) {
_mm_maskmove_si64(__A, __N, __P);
}
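/* Compute the element-wise rounded averages of the unsigned 8-bit (pu8)
   and 16-bit (pu16) elements of A and B.  */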
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8(__m64 __A, __m64 __B) {
__vector unsigned char __a, __b, __c;
__a = (__vector unsigned char)vec_splats(__A);
__b = (__vector unsigned char)vec_splats(__B);
__c = vec_avg(__a, __b);
return (__m64)((__vector long long)__c)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb(__m64 __A, __m64 __B) {
return _mm_avg_pu8(__A, __B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16(__m64 __A, __m64 __B) {
__vector unsigned short __a, __b, __c;
__a = (__vector unsigned short)vec_splats(__A);
__b = (__vector unsigned short)vec_splats(__B);
__c = vec_avg(__a, __b);
return (__m64)((__vector long long)__c)[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw(__m64 __A, __m64 __B) {
return _mm_avg_pu16(__A, __B);
}
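/* Compute the sum of the absolute differences of the unsigned 8-bit
   elements in A and B: the absolute differences are formed as max-min,
   then reduced across the vector with vec_sum4s and vec_sums; the total
   is returned in the lower 16 bits of the result.  */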
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8(__m64 __A, __m64 __B) {
__vector unsigned char __a, __b;
__vector unsigned char __vmin, __vmax, __vabsdiff;
__vector signed int __vsum;
const __vector unsigned int __zero = {0, 0, 0, 0};
__m64_union __result = {0};
__a = (__vector unsigned char)(__vector unsigned long long){0UL, __A};
__b = (__vector unsigned char)(__vector unsigned long long){0UL, __B};
__vmin = vec_min(__a, __b);
__vmax = vec_max(__a, __b);
__vabsdiff = vec_sub(__vmax, __vmin);
__vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
__vsum = vec_sums(__vsum, (__vector signed int)__zero);
__result.as_short[0] = __vsum[3];
return __result.as_m64;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw(__m64 __A, __m64 __B) {
return _mm_sad_pu8(__A, __B);
}
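/* Store A to address P, hinting with dcbtstt that the cache line is
   transient so it is evicted from the cache quickly.  */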
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi(__m64 *__P, __m64 __A) {
__asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory");
*__P = __A;
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps(float *__P, __m128 __A) {
__asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory");
_mm_store_ps(__P, __A);
}
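/* Guarantee that every preceding store is globally visible before any
   subsequent store; a release fence (lwsync) is sufficient on POWER.  */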
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence(void) {
__atomic_thread_fence(__ATOMIC_RELEASE);
}
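/* Delay the next instruction by an implementation-specific amount, as a
   spin-loop hint. On POWER8 this briefly lowers the SMT thread priority
   ("or 31,31,31") around a sync sequence and then restores the saved
   Program Priority Register; older processors fall back to a full
   fence.  */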
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause(void) {
#if _ARCH_PWR8
unsigned long __PPR;
__asm__ volatile(" mfppr %0;"
" or 31,31,31;"
" isync;"
" lwsync;"
" isync;"
" mtppr %0;"
: "=r"(__PPR)
:
: "memory");
#else
__atomic_thread_fence(__ATOMIC_SEQ_CST);
#endif
}
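/* Transpose the 4x4 matrix composed of row[0-3]. Typical usage:
     __m128 r0, r1, r2, r3;
     ... fill the rows ...
     _MM_TRANSPOSE4_PS(r0, r1, r2, r3);  */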
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
__v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
__v4sf __t0 = vec_vmrghw(__r0, __r1); \
__v4sf __t1 = vec_vmrghw(__r2, __r3); \
__v4sf __t2 = vec_vmrglw(__r0, __r1); \
__v4sf __t3 = vec_vmrglw(__r2, __r3); \
(row0) = (__v4sf)vec_mergeh((__vector long long)__t0, \
(__vector long long)__t1); \
(row1) = (__v4sf)vec_mergel((__vector long long)__t0, \
(__vector long long)__t1); \
(row2) = (__v4sf)vec_mergeh((__vector long long)__t2, \
(__vector long long)__t3); \
(row3) = (__v4sf)vec_mergel((__vector long long)__t2, \
(__vector long long)__t3); \
} while (0)
#else
#include_next <xmmintrin.h>
#endif
#endif