Diffstat (limited to 'third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h')
-rw-r--r-- | third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h | 135
1 file changed, 116 insertions(+), 19 deletions(-)
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
index c7574eef5..267441b02 100644
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
@@ -13,7 +13,8 @@
 #define _V64_INTRINSICS_H
 
 #include <arm_neon.h>
-#include "./v64_intrinsics_arm.h"
+
+#include "aom_dsp/simd/v64_intrinsics_arm.h"
 #include "aom_ports/arm.h"
 
 #ifdef AOM_INCOMPATIBLE_GCC
@@ -121,20 +122,34 @@ SIMD_INLINE v64 v64_dup_32(uint32_t x) {
 }
 
 SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
-  int64x2_t r = vpaddlq_s32(vpaddlq_s16(
+  int16x8_t t =
       vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
-                vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))))));
+                vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))));
+#if defined(__aarch64__)
+  return vaddlvq_s16(t);
+#else
+  int64x2_t r = vpaddlq_s32(vpaddlq_s16(t));
   return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
+#endif
 }
 
 SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vaddlvq_s32(
+      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+#else
   int64x2_t r =
       vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
   return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
+#endif
 }
 
 SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
+#if defined(__aarch64__)
+  return vaddlv_u8(vreinterpret_u8_s64(x));
+#else
   return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
+#endif
 }
 
 SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
@@ -145,34 +160,40 @@ typedef uint16x8_t sad64_internal;
 
 SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }
 
-/* Implementation dependent return value. Result must be finalised with
-   v64_sad_u8_sum().
-   The result for more than 32 v64_sad_u8() calls is undefined. */
+// Implementation dependent return value. Result must be finalised with
+// v64_sad_u8_sum().
 SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
   return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
 }
 
 SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
+#if defined(__aarch64__)
+  return vaddlvq_u16(s);
+#else
   uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
   return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
+#endif
 }
 
-typedef int64x1_t ssd64_internal;
+typedef uint32x4_t ssd64_internal;
 
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() {
-  return (ssd64_internal)(uint64_t)0;
-}
+SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return vdupq_n_u32(0); }
 
-/* Implementation dependent return value. Result must be finalised with
- * v64_ssd_u8_sum(). */
+// Implementation dependent return value. Result must be finalised with
+// v64_ssd_u8_sum().
 SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
   uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
-  uint64x2_t r = vpaddlq_u32(vpaddlq_u16(vmull_u8(t, t)));
-  return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r)));
+  return vaddq_u32(s, vpaddlq_u16(vmull_u8(t, t)));
 }
 
 SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
-  return (uint32_t)(uint64_t)s;
+#if defined(__aarch64__)
+  return vaddvq_u32(s);
+#else
+  uint64x2_t t = vpaddlq_u32(s);
+  return vget_lane_u32(
+      vreinterpret_u32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
+#endif
 }
 
 SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }
@@ -188,6 +209,16 @@ SIMD_INLINE v64 v64_add_8(v64 x, v64 y) {
       vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
 }
 
+SIMD_INLINE v64 v64_sadd_u8(v64 x, v64 y) {
+  return vreinterpret_s64_u8(
+      vqadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_sadd_s8(v64 x, v64 y) {
+  return vreinterpret_s64_s8(
+      vqadd_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
+}
+
 SIMD_INLINE v64 v64_add_16(v64 x, v64 y) {
   return vreinterpret_s64_s16(
       vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
@@ -252,8 +283,14 @@ SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
 }
 
 SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  int16x8_t t = vreinterpretq_s16_s32(
+      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+  return vget_low_s64(vreinterpretq_s64_s16(vuzp2q_s16(t, t)));
+#else
   return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32(
       vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16)));
+#endif
 }
 
 SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) {
@@ -269,10 +306,10 @@ SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) {
 }
 
 SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) {
-  return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(
-      vaddq_s16(vmull_s8(vadd_s8(vreinterpret_s8_s64(x), vdup_n_s8(-128)),
-                         vreinterpret_s8_s64(y)),
-                vshlq_n_s16(vmovl_s8(vreinterpret_s8_s64(y)), 7)))));
+  int16x8_t t =
+      vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(x))),
+                vmovl_s8(vreinterpret_s8_s64(y)));
+  return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(t)));
 }
 
 SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) {
@@ -285,6 +322,11 @@ SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) {
       vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
 }
 
+SIMD_INLINE v64 v64_rdavg_u16(v64 x, v64 y) {
+  return vreinterpret_s64_u16(
+      vhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
+}
+
 SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) {
   return vreinterpret_s64_u16(
       vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
@@ -321,33 +363,63 @@ SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) {
 }
 
 SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vzip1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vzip2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vzip1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
   return vreinterpret_s64_s16(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vzip2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
   return vreinterpret_s64_s16(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u32(
+      vzip1_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
+#else
   int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
   return vreinterpret_s64_s32(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u32(
+      vzip2_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
+#else
   int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
   return vreinterpret_s64_s32(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
@@ -371,6 +443,11 @@ SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) {
       vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
 }
 
+SIMD_INLINE v64 v64_pack_s32_u16(v64 x, v64 y) {
+  return vreinterpret_s64_u16(vqmovun_s32(
+      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
+}
+
 SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) {
   return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32(
       vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
@@ -382,23 +459,43 @@ SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) {
 }
 
 SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vuzp1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vuzp2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vuzp1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
   return vreinterpret_s64_u16(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vuzp2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
   return vreinterpret_s64_u16(r.val[1]);
+#endif
}
 
 SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) {
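For context (commentary added here, not part of the patch): the pattern this change applies throughout the file is to keep the portable ARMv7 NEON sequence of widening pairwise adds (vpaddl) behind an #else, and to use a single cross-lane reduction (vaddlv/vaddlvq/vaddvq) when __aarch64__ is defined. Below is a minimal standalone sketch of that pattern, mirroring v64_hadd_u8() above; the file name and the main() driver are illustrative only, and the ARMv7 branch uses vget_lane_u64() rather than the header's direct uint64x1_t cast for strict portability.

/* horizontal_add_sketch.c -- illustrative only; build on an ARM target,
   e.g. gcc -O2 horizontal_add_sketch.c */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Sum the eight unsigned bytes of a 64-bit vector. */
static uint64_t hadd_u8(uint8x8_t x) {
#if defined(__aarch64__)
  /* AArch64: one unsigned add-across-lanes instruction. */
  return vaddlv_u8(x);
#else
  /* ARMv7: widen and pairwise-add three times: 8x8 -> 4x16 -> 2x32 -> 1x64. */
  return vget_lane_u64(vpaddl_u32(vpaddl_u16(vpaddl_u8(x))), 0);
#endif
}

int main(void) {
  const uint8_t data[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  printf("%llu\n", (unsigned long long)hadd_u8(vld1_u8(data))); /* prints 36 */
  return 0;
}

The same trade-off motivates the ssd64_internal change in the diff: the accumulator widens from a per-call scalar reduction to a uint32x4_t that is collapsed only once in v64_ssd_u8_sum(), so the inner loop stays entirely in vector registers.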