From d2499ead93dc4298c0882fe98902acb1b5209f99 Mon Sep 17 00:00:00 2001 From: trav90 Date: Fri, 19 Oct 2018 23:05:00 -0500 Subject: Update libaom to commit ID 1e227d41f0616de9548a673a83a21ef990b62591 --- third_party/aom/av1/common/alloccommon.h | 6 +- third_party/aom/av1/common/arm/av1_inv_txfm_neon.c | 2447 ++++++++- third_party/aom/av1/common/arm/av1_inv_txfm_neon.h | 8 +- .../aom/av1/common/arm/blend_a64_hmask_neon.c | 4 +- .../aom/av1/common/arm/blend_a64_vmask_neon.c | 4 +- third_party/aom/av1/common/arm/cfl_neon.c | 4 +- third_party/aom/av1/common/arm/convolve_neon.c | 363 +- third_party/aom/av1/common/arm/convolve_neon.h | 6 +- third_party/aom/av1/common/arm/jnt_convolve_neon.c | 512 +- third_party/aom/av1/common/arm/mem_neon.h | 15 +- third_party/aom/av1/common/arm/selfguided_neon.c | 18 +- third_party/aom/av1/common/arm/transpose_neon.h | 83 +- third_party/aom/av1/common/arm/warp_plane_neon.c | 714 +++ .../aom/av1/common/arm/wiener_convolve_neon.c | 145 +- third_party/aom/av1/common/av1_inv_txfm1d.c | 140 +- third_party/aom/av1/common/av1_inv_txfm1d.h | 6 +- third_party/aom/av1/common/av1_inv_txfm1d_cfg.h | 6 +- third_party/aom/av1/common/av1_loopfilter.c | 945 +++- third_party/aom/av1/common/av1_loopfilter.h | 120 +- third_party/aom/av1/common/av1_rtcd_defs.pl | 46 +- third_party/aom/av1/common/av1_txfm.c | 50 + third_party/aom/av1/common/av1_txfm.h | 32 +- third_party/aom/av1/common/blockd.c | 64 +- third_party/aom/av1/common/blockd.h | 53 +- third_party/aom/av1/common/cdef.h | 6 +- third_party/aom/av1/common/cdef_block.h | 6 +- third_party/aom/av1/common/cdef_block_simd.h | 5 + third_party/aom/av1/common/cfl.h | 6 +- third_party/aom/av1/common/common.h | 6 +- third_party/aom/av1/common/common_data.h | 75 +- third_party/aom/av1/common/convolve.c | 116 +- third_party/aom/av1/common/convolve.h | 21 +- third_party/aom/av1/common/entropy.h | 6 +- third_party/aom/av1/common/entropymode.h | 8 +- third_party/aom/av1/common/entropymv.c | 55 - third_party/aom/av1/common/entropymv.h | 16 +- third_party/aom/av1/common/enums.h | 12 +- third_party/aom/av1/common/filter.h | 22 +- third_party/aom/av1/common/frame_buffers.c | 11 + third_party/aom/av1/common/frame_buffers.h | 12 +- third_party/aom/av1/common/idct.c | 274 +- third_party/aom/av1/common/idct.h | 31 +- third_party/aom/av1/common/mv.h | 8 +- third_party/aom/av1/common/mvref_common.c | 381 +- third_party/aom/av1/common/mvref_common.h | 43 +- third_party/aom/av1/common/obmc.h | 14 +- third_party/aom/av1/common/obu_util.c | 147 + third_party/aom/av1/common/obu_util.h | 47 + third_party/aom/av1/common/odintrin.h | 12 +- third_party/aom/av1/common/onyxc_int.h | 29 +- third_party/aom/av1/common/ppc/cfl_ppc.c | 85 +- third_party/aom/av1/common/pred_common.c | 4 +- third_party/aom/av1/common/pred_common.h | 6 +- third_party/aom/av1/common/quant_common.h | 6 +- third_party/aom/av1/common/reconinter.c | 652 +-- third_party/aom/av1/common/reconinter.h | 148 +- third_party/aom/av1/common/reconintra.h | 6 +- third_party/aom/av1/common/resize.c | 39 +- third_party/aom/av1/common/resize.h | 6 +- third_party/aom/av1/common/restoration.c | 131 +- third_party/aom/av1/common/restoration.h | 7 +- third_party/aom/av1/common/scale.h | 7 +- third_party/aom/av1/common/scan.h | 6 +- third_party/aom/av1/common/seg_common.h | 6 +- third_party/aom/av1/common/thread_common.c | 18 +- third_party/aom/av1/common/thread_common.h | 6 +- third_party/aom/av1/common/tile_common.c | 16 + third_party/aom/av1/common/tile_common.h | 9 +- third_party/aom/av1/common/timing.h | 6 +- third_party/aom/av1/common/token_cdfs.h | 5 + third_party/aom/av1/common/txb_common.h | 243 +- third_party/aom/av1/common/warped_motion.c | 4 +- third_party/aom/av1/common/warped_motion.h | 6 +- .../aom/av1/common/x86/av1_convolve_scale_sse4.c | 1 - third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c | 6 + third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h | 6 +- .../aom/av1/common/x86/av1_inv_txfm_ssse3.c | 6 + .../aom/av1/common/x86/av1_inv_txfm_ssse3.h | 10 +- third_party/aom/av1/common/x86/av1_txfm_sse2.h | 6 +- third_party/aom/av1/common/x86/av1_txfm_sse4.h | 11 +- third_party/aom/av1/common/x86/cfl_simd.h | 5 + third_party/aom/av1/common/x86/convolve_2d_avx2.c | 2 - third_party/aom/av1/common/x86/convolve_2d_sse2.c | 3 +- third_party/aom/av1/common/x86/convolve_sse2.c | 11 +- .../aom/av1/common/x86/highbd_convolve_2d_avx2.c | 1 - .../aom/av1/common/x86/highbd_convolve_2d_sse4.c | 1 - .../aom/av1/common/x86/highbd_convolve_2d_ssse3.c | 1 - .../aom/av1/common/x86/highbd_inv_txfm_avx2.c | 1117 +++- .../aom/av1/common/x86/highbd_inv_txfm_sse4.c | 5335 +++++++++++++++----- .../aom/av1/common/x86/highbd_jnt_convolve_avx2.c | 1 - .../aom/av1/common/x86/highbd_txfm_utility_sse4.h | 28 +- .../aom/av1/common/x86/highbd_warp_plane_sse4.c | 268 +- third_party/aom/av1/common/x86/jnt_convolve_avx2.c | 211 +- third_party/aom/av1/common/x86/reconinter_avx2.c | 496 ++ third_party/aom/av1/common/x86/selfguided_avx2.c | 23 +- third_party/aom/av1/common/x86/selfguided_sse4.c | 24 +- third_party/aom/av1/common/x86/warp_plane_sse4.c | 809 ++- .../aom/av1/common/x86/wiener_convolve_avx2.c | 3 +- .../aom/av1/common/x86/wiener_convolve_sse2.c | 3 +- 99 files changed, 12657 insertions(+), 4313 deletions(-) create mode 100644 third_party/aom/av1/common/arm/warp_plane_neon.c create mode 100644 third_party/aom/av1/common/obu_util.c create mode 100644 third_party/aom/av1/common/obu_util.h (limited to 'third_party/aom/av1/common') diff --git a/third_party/aom/av1/common/alloccommon.h b/third_party/aom/av1/common/alloccommon.h index dbcb5b947..8e5896981 100644 --- a/third_party/aom/av1/common/alloccommon.h +++ b/third_party/aom/av1/common/alloccommon.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_ALLOCCOMMON_H_ -#define AV1_COMMON_ALLOCCOMMON_H_ +#ifndef AOM_AV1_COMMON_ALLOCCOMMON_H_ +#define AOM_AV1_COMMON_ALLOCCOMMON_H_ #define INVALID_IDX -1 // Invalid buffer index. @@ -45,4 +45,4 @@ int av1_get_MBs(int width, int height); } // extern "C" #endif -#endif // AV1_COMMON_ALLOCCOMMON_H_ +#endif // AOM_AV1_COMMON_ALLOCCOMMON_H_ diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c index 51c991498..bad411743 100644 --- a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c +++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c @@ -9,6 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include + #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" @@ -19,19 +21,7 @@ #include "av1/common/enums.h" #include "av1/common/idct.h" #include "av1/common/arm/av1_inv_txfm_neon.h" - -static INLINE TxSetType find_TxSetType(TX_SIZE tx_size) { - const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size]; - TxSetType tx_set_type; - if (tx_size_sqr_up > TX_32X32) { - tx_set_type = EXT_TX_SET_DCTONLY; - } else if (tx_size_sqr_up == TX_32X32) { - tx_set_type = EXT_TX_SET_DCT_IDTX; - } else { - tx_set_type = EXT_TX_SET_ALL16; - } - return tx_set_type; -} +#include "av1/common/arm/transpose_neon.h" // 1D itx types typedef enum ATTRIBUTE_PACKED { @@ -65,6 +55,2038 @@ static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = { { av1_idct64_new, NULL, NULL }, }; +static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in, + uint8_t *output, int stride, + int flipud, + const int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + int16x8_t temp_output; + for (int i = 0; i < height; ++i, j += step) { + temp_output = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output))); + temp_output = vaddq_s16(temp_output, in[j]); + vst1_u8(output, vqmovun_s16(temp_output)); + output += stride; + } +} + +static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred, + int16x8_t res0, + int16x8_t res1) { + int16x8_t temp_output[2]; + uint8x16_t temp_output_8q; + temp_output[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred))); + temp_output[0] = vaddq_s16(temp_output[0], res0); + temp_output[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred))); + temp_output[1] = vaddq_s16(temp_output[1], res1); + temp_output_8q = + vcombine_u8(vqmovun_s16(temp_output[0]), vqmovun_s16(temp_output[1])); + return temp_output_8q; +} + +static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in, + uint8_t *output, int stride, + int flipud, int height) { + uint8x16_t temp_output_8q; + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + temp_output_8q = vld1q_u8(output + i * stride); + temp_output_8q = + lowbd_get_recon_16x16_neon(temp_output_8q, in[j], in[j + height]); + vst1q_u8((output + i * stride), temp_output_8q); + } +} + +static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size, + int value) { + for (int i = 0; i < size; i++) { + a[i] = vdupq_n_s16((int16_t)value); + } +} + +static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1, + int16_t coef2, int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0_l, s0_h, s1_l, s1_h; + int16x4_t v0[2], v1[2]; + + s0_l = vmull_n_s16(vget_low_s16(in0), coef1); + s0_h = vmull_n_s16(vget_high_s16(in0), coef1); + s1_l = vmull_n_s16(vget_low_s16(in0), coef2); + s1_h = vmull_n_s16(vget_high_s16(in0), coef2); + + v0[0] = vrshrn_n_s32(s0_l, INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0_h, INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1_l, INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1_h, INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) { + int32x4_t t0[2], t1[2]; + int16x4_t v0[2], v1[2]; + + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0); + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + + v0[0] = vrshrn_n_s32(t0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(t0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(t1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(t1[1], INV_COS_BIT); + + x[0] = vcombine_s16(v0[0], v0[1]); + x[1] = vcombine_s16(v1[0], v1[1]); +} + +static INLINE int16x4_t create_s16x4_neon(int16_t *const c0, int16_t *const c1, + int16_t *const c2, + int16_t *const c3) { + int16x4_t val = vdup_n_s16((int16_t)0); + val = vld1_lane_s16(c0, val, 0); + val = vld1_lane_s16(c1, val, 1); + val = vld1_lane_s16(c2, val, 2); + val = vld1_lane_s16(c3, val, 3); + return val; +} + +static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60), + (int16_t *)(cospi + 20), (int16_t *)(cospi + 44)); + const int16x4_t c1 = + create_s16x4_neon((int16_t *)(cospi + 36), (int16_t *)(cospi + 28), + (int16_t *)(cospi + 52), (int16_t *)(cospi + 12)); + const int16x4_t c2 = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + int16x8_t x[8]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + + // Stage 1 + x[0] = in[7]; + x[1] = in[0]; + x[2] = in[5]; + x[3] = in[2]; + x[4] = in[3]; + x[5] = in[4]; + x[6] = in[1]; + x[7] = in[6]; + + // Stage 2 + btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); + btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); + btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); + + // Stage 3 + x[0] = vqaddq_s16(s0, s4); + x[1] = vqaddq_s16(s1, s5); + x[2] = vqaddq_s16(s2, s6); + x[3] = vqaddq_s16(s3, s7); + x[4] = vqsubq_s16(s0, s4); + x[5] = vqsubq_s16(s1, s5); + x[6] = vqsubq_s16(s2, s6); + x[7] = vqsubq_s16(s3, s7); + + // Stage 4 + s0 = x[0]; + s1 = x[1]; + s2 = x[2]; + s3 = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6); + + // Stage 5 + x[0] = vqaddq_s16(s0, s2); + x[1] = vqaddq_s16(s1, s3); + x[2] = vqsubq_s16(s0, s2); + x[3] = vqsubq_s16(s1, s3); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + + // stage 6 + btf_16_half_neon(x + 2, c2); + btf_16_half_neon(x + 6, c2); + + // Stage 7 + out[0] = x[0]; + out[1] = vnegq_s16(x[4]); + out[2] = x[6]; + out[3] = vnegq_s16(x[2]); + out[4] = x[3]; + out[5] = vnegq_s16(x[7]); + out[6] = x[5]; + out[7] = vnegq_s16(x[1]); +} + +static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c2 = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + int16x8_t x[8]; + int16x8_t s0, s1, s4, s5; + + // Stage 1 + x[1] = in[0]; + + // Stage 2 + + btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1); + + // Stage 3 + x[0] = s0; + x[1] = s1; + x[4] = s0; + x[5] = s1; + + // Stage 4 + s0 = x[0]; + s1 = x[1]; + btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5); + + // Stage 5 + x[0] = s0; + x[1] = s1; + x[2] = s0; + x[3] = s1; + x[4] = s4; + x[5] = s5; + x[6] = s4; + x[7] = s5; + + // stage 6 + btf_16_half_neon(x + 2, c2); + btf_16_half_neon(x + 6, c2); + + // Stage 7 + out[0] = x[0]; + out[1] = vnegq_s16(x[4]); + out[2] = x[6]; + out[3] = vnegq_s16(x[2]); + out[4] = x[3]; + out[5] = vnegq_s16(x[7]); + out[6] = x[5]; + out[7] = vnegq_s16(x[1]); +} + +static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, + int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[8], step2[8]; + const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + const int16x4_t c2 = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + // stage 2 + btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]); + btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]); + + // stage 3 + btf_16_lane_0_1_neon(in[0], in[4], c2, &step2[0], &step2[1]); + btf_16_lane_2_3_neon(in[2], in[6], c2, &step2[3], &step2[2]); + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + + // stage 4 + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + btf_16_lane_0_1_neon(step2[6], step2[5], c2, &step1[6], &step1[5]); + + // stage 5 + out[0] = vqaddq_s16(step1[0], step2[7]); + out[1] = vqaddq_s16(step1[1], step1[6]); + out[2] = vqaddq_s16(step1[2], step1[5]); + out[3] = vqaddq_s16(step1[3], step2[4]); + out[4] = vqsubq_s16(step1[3], step2[4]); + out[5] = vqsubq_s16(step1[2], step1[5]); + out[6] = vqsubq_s16(step1[1], step1[6]); + out[7] = vqsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct8_low1_new_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]); + + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 4 + // stage 5 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; +} + +void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) { + assert(!(size % 4)); + if (!bit) return; + const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit)); + for (int i = 0; i < size; i++) { + arr[i] = vrshlq_s16(arr[i], dup_bits_n_16x8); + } +} + +static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) { + int16x8_t temp[8]; + for (int i = 0; i < size; ++i) { + temp[i] = input[size - 1 - i]; + } + for (int i = 0; i < size; ++i) { + input[i] = temp[i]; + } +} + +static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input, + int16x8_t *const a, + int out_size) { + for (int i = 0; i < 8; ++i) { + a[i] = vcombine_s16(vmovn_s32(vld1q_s32(input)), + vmovn_s32(vld1q_s32(input + 4))); + input += out_size; + } +} + +static INLINE void identity8_new_neon(int16x8_t *input, int16x8_t *output, + int8_t cos_bit, int bit) { + (void)bit; + (void)cos_bit; + + output[0] = vmulq_n_s16(input[0], (int16_t)2); + output[1] = vmulq_n_s16(input[1], (int16_t)2); + output[2] = vmulq_n_s16(input[2], (int16_t)2); + output[3] = vmulq_n_s16(input[3], (int16_t)2); + output[4] = vmulq_n_s16(input[4], (int16_t)2); + output[5] = vmulq_n_s16(input[5], (int16_t)2); + output[6] = vmulq_n_s16(input[6], (int16_t)2); + output[7] = vmulq_n_s16(input[7], (int16_t)2); +} + +static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output, + int size) { + int32x4_t out_low, out_high; + int16x4_t low, high; + + for (int z = 0; z < size; ++z) { + out_low = vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2); + out_high = vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2); + + low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits); + high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits); + + output[z] = vcombine_s16(low, high); + } +} + +static INLINE void identity16_new_neon(int16x8_t *input, int16x8_t *output, + int8_t cos_bit, int bit) { + (void)bit; + (void)cos_bit; + + int32x4_t out_low, out_high; + int16x4_t low, high; + int16_t scale = (int16_t)(2 * NewSqrt2); + + for (int z = 0; z < 16; ++z) { + out_low = vmull_n_s16(vget_low_s16(input[z]), scale); + out_high = vmull_n_s16(vget_high_s16(input[z]), scale); + + low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits); + high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits); + + output[z] = vcombine_s16(low, high); + } +} + +static INLINE void identity32_new_neon(int16x8_t *input, int16x8_t *output, + int8_t cos_bit, int bit) { + (void)bit; + (void)cos_bit; + + for (int z = 0; z < 32; ++z) { + output[z] = vmulq_n_s16(input[z], (int16_t)4); + } +} + +static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 4 + + t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 6 + // stage 7 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; +} + +static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[16], step2[16]; + + const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60), + (int16_t *)(cospi + 36), (int16_t *)(cospi + 28)); + const int16x4_t c1 = + create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44), + (int16_t *)(cospi + 52), (int16_t *)(cospi + 12)); + const int16x4_t c2 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + const int16x4_t c3 = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + // stage 2 + + btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]); + btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]); + btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]); + btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]); + + step2[0] = in[0]; + step2[1] = in[8]; + step2[2] = in[4]; + step2[3] = in[12]; + step2[4] = in[2]; + step2[5] = in[10]; + step2[6] = in[6]; + step2[7] = in[14]; + + // stage 3 + + btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]); + btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + step1[8] = vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + + // stage 4 + + btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3, + &step2[10], &step2[13]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +} + +static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[16], step2[16]; + const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[2] = in[4]; + step2[4] = in[2]; + step2[6] = in[6]; + + btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]); + btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]); + btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]); + + // stage 3 + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[8] = vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + + // stage 4 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); + btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c0, + &step2[10], &step2[13]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]); + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + + // stage 6 + btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +} + +static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62), + (int16_t *)(cospi + 10), (int16_t *)(cospi + 54)); + const int16x4_t c1 = + create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46), + (int16_t *)(cospi + 26), (int16_t *)(cospi + 38)); + const int16x4_t c2 = + create_s16x4_neon((int16_t *)(cospi + 34), (int16_t *)(cospi + 30), + (int16_t *)(cospi + 42), (int16_t *)(cospi + 22)); + const int16x4_t c3 = + create_s16x4_neon((int16_t *)(cospi + 50), (int16_t *)(cospi + 14), + (int16_t *)(cospi + 58), (int16_t *)(cospi + 6)); + const int16x4_t c4 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + + const int16x4_t c = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + int16x8_t x[16]; + int16x8_t t[14]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t s8, s9, s10, s11, s12, s13, s14, s15; + + // Stage 1 + x[0] = in[15]; + x[1] = in[0]; + x[2] = in[13]; + x[3] = in[2]; + x[4] = in[11]; + x[5] = in[4]; + x[6] = in[9]; + x[7] = in[6]; + x[8] = in[7]; + x[9] = in[8]; + x[10] = in[5]; + x[11] = in[10]; + x[12] = in[3]; + x[13] = in[12]; + x[14] = in[1]; + x[15] = in[14]; + + // Stage 2 + btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); + btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); + btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); + btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11); + btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13); + btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15); + + // Stage 3 + x[0] = vqaddq_s16(s0, s8); + x[1] = vqaddq_s16(s1, s9); + x[2] = vqaddq_s16(s2, s10); + x[3] = vqaddq_s16(s3, s11); + x[4] = vqaddq_s16(s4, s12); + x[5] = vqaddq_s16(s5, s13); + x[6] = vqaddq_s16(s6, s14); + x[7] = vqaddq_s16(s7, s15); + x[8] = vqsubq_s16(s0, s8); + x[9] = vqsubq_s16(s1, s9); + x[10] = vqsubq_s16(s2, s10); + x[11] = vqsubq_s16(s3, s11); + x[12] = vqsubq_s16(s4, s12); + x[13] = vqsubq_s16(s5, s13); + x[14] = vqsubq_s16(s6, s14); + x[15] = vqsubq_s16(s7, s15); + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11); + btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12); + btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14); + + // Stage 5 + x[0] = vqaddq_s16(t[0], t[4]); + x[1] = vqaddq_s16(t[1], t[5]); + x[2] = vqaddq_s16(t[2], t[6]); + x[3] = vqaddq_s16(t[3], t[7]); + x[4] = vqsubq_s16(t[0], t[4]); + x[5] = vqsubq_s16(t[1], t[5]); + x[6] = vqsubq_s16(t[2], t[6]); + x[7] = vqsubq_s16(t[3], t[7]); + x[8] = vqaddq_s16(s8, s12); + x[9] = vqaddq_s16(s9, s13); + x[10] = vqaddq_s16(s10, s14); + x[11] = vqaddq_s16(s11, s15); + x[12] = vqsubq_s16(s8, s12); + x[13] = vqsubq_s16(s9, s13); + x[14] = vqsubq_s16(s10, s14); + x[15] = vqsubq_s16(s11, s15); + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14); + + // Stage 7 + x[0] = vqaddq_s16(t[0], t[2]); + x[1] = vqaddq_s16(t[1], t[3]); + x[2] = vqsubq_s16(t[0], t[2]); + x[3] = vqsubq_s16(t[1], t[3]); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + x[8] = vqaddq_s16(t[8], t[10]); + x[9] = vqaddq_s16(t[9], t[11]); + x[10] = vqsubq_s16(t[8], t[10]); + x[11] = vqsubq_s16(t[9], t[11]); + x[12] = vqaddq_s16(s12, s14); + x[13] = vqaddq_s16(s13, s15); + x[14] = vqsubq_s16(s12, s14); + x[15] = vqsubq_s16(s13, s15); + + // Stage 8 + btf_16_half_neon(x + 2, c); + btf_16_half_neon(x + 6, c); + btf_16_half_neon(x + 10, c); + btf_16_half_neon(x + 14, c); + + // Stage 9 + out[0] = x[0]; + out[1] = vnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vnegq_s16(x[1]); +} + +static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c4 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + const int16x4_t c = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + int16x8_t x[16]; + int16x8_t t[10]; + int16x8_t s0, s1, s4, s5; + int16x8_t s8, s9, s12, s13; + + // Stage 1 + x[1] = in[0]; + + // Stage 2 + btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); + + // Stage 3 + x[0] = s0; + x[1] = s1; + x[8] = s0; + x[9] = s1; + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); + + // Stage 5 + x[0] = t[0]; + x[1] = t[1]; + x[4] = t[0]; + x[5] = t[1]; + x[8] = s8; + x[9] = s9; + x[12] = s8; + x[13] = s9; + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5); + t[8] = x[8]; + t[9] = x[9]; + btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13); + + // Stage 7 + x[0] = t[0]; + x[1] = t[1]; + x[2] = t[0]; + x[3] = t[1]; + x[4] = s4; + x[5] = s5; + x[6] = s4; + x[7] = s5; + x[8] = t[8]; + x[9] = t[9]; + x[10] = t[8]; + x[11] = t[9]; + x[12] = s12; + x[13] = s13; + x[14] = s12; + x[15] = s13; + + // Stage 8 + btf_16_half_neon(x + 2, c); + btf_16_half_neon(x + 6, c); + btf_16_half_neon(x + 10, c); + btf_16_half_neon(x + 14, c); + + // Stage 9 + out[0] = x[0]; + out[1] = vnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vnegq_s16(x[1]); +} + +static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c4 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + const int16x4_t c = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + int16x8_t x[16]; + int16x8_t t[14]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t s8, s9, s10, s11, s12, s13, s14, s15; + + // Stage 1 + x[1] = in[0]; + x[3] = in[2]; + x[5] = in[4]; + x[7] = in[6]; + x[8] = in[7]; + x[10] = in[5]; + x[12] = in[3]; + x[14] = in[1]; + + // Stage 2 + btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); + btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3); + btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5); + btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7); + + btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9); + btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11); + btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13); + btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15); + + // Stage 3 + x[0] = vqaddq_s16(s0, s8); + x[1] = vqaddq_s16(s1, s9); + x[2] = vqaddq_s16(s2, s10); + x[3] = vqaddq_s16(s3, s11); + x[4] = vqaddq_s16(s4, s12); + x[5] = vqaddq_s16(s5, s13); + x[6] = vqaddq_s16(s6, s14); + x[7] = vqaddq_s16(s7, s15); + x[8] = vqsubq_s16(s0, s8); + x[9] = vqsubq_s16(s1, s9); + x[10] = vqsubq_s16(s2, s10); + x[11] = vqsubq_s16(s3, s11); + x[12] = vqsubq_s16(s4, s12); + x[13] = vqsubq_s16(s5, s13); + x[14] = vqsubq_s16(s6, s14); + x[15] = vqsubq_s16(s7, s15); + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11); + btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12); + btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14); + + // Stage 5 + x[0] = vqaddq_s16(t[0], t[4]); + x[1] = vqaddq_s16(t[1], t[5]); + x[2] = vqaddq_s16(t[2], t[6]); + x[3] = vqaddq_s16(t[3], t[7]); + x[4] = vqsubq_s16(t[0], t[4]); + x[5] = vqsubq_s16(t[1], t[5]); + x[6] = vqsubq_s16(t[2], t[6]); + x[7] = vqsubq_s16(t[3], t[7]); + x[8] = vqaddq_s16(s8, s12); + x[9] = vqaddq_s16(s9, s13); + x[10] = vqaddq_s16(s10, s14); + x[11] = vqaddq_s16(s11, s15); + x[12] = vqsubq_s16(s8, s12); + x[13] = vqsubq_s16(s9, s13); + x[14] = vqsubq_s16(s10, s14); + x[15] = vqsubq_s16(s11, s15); + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14); + + // Stage 7 + x[0] = vqaddq_s16(t[0], t[2]); + x[1] = vqaddq_s16(t[1], t[3]); + x[2] = vqsubq_s16(t[0], t[2]); + x[3] = vqsubq_s16(t[1], t[3]); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + x[8] = vqaddq_s16(t[8], t[10]); + x[9] = vqaddq_s16(t[9], t[11]); + x[10] = vqsubq_s16(t[8], t[10]); + x[11] = vqsubq_s16(t[9], t[11]); + x[12] = vqaddq_s16(s12, s14); + x[13] = vqaddq_s16(s13, s15); + x[14] = vqsubq_s16(s12, s14); + x[15] = vqsubq_s16(s13, s15); + + // Stage 8 + btf_16_half_neon(x + 2, c); + btf_16_half_neon(x + 6, c); + btf_16_half_neon(x + 10, c); + btf_16_half_neon(x + 14, c); + + // Stage 9 + out[0] = x[0]; + out[1] = vnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vnegq_s16(x[1]); +} + +static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + + const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62), + (int16_t *)(cospi + 34), (int16_t *)(cospi + 30)); + const int16x4_t c1 = + create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46), + (int16_t *)(cospi + 50), (int16_t *)(cospi + 14)); + const int16x4_t c2 = + create_s16x4_neon((int16_t *)(cospi + 10), (int16_t *)(cospi + 54), + (int16_t *)(cospi + 42), (int16_t *)(cospi + 22)); + const int16x4_t c3 = + create_s16x4_neon((int16_t *)(cospi + 26), (int16_t *)(cospi + 38), + (int16_t *)(cospi + 58), (int16_t *)(cospi + 6)); + const int16x4_t c4 = + create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60), + (int16_t *)(cospi + 36), (int16_t *)(cospi + 28)); + const int16x4_t c5 = + create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44), + (int16_t *)(cospi + 52), (int16_t *)(cospi + 12)); + const int16x4_t c6 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + const int16x4_t c7 = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + // stage 2 + + btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]); + btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]); + btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]); + btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]); + btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]); + btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]); + + step2[0] = in[0]; + step2[1] = in[16]; + step2[2] = in[8]; + step2[3] = in[24]; + step2[4] = in[4]; + step2[5] = in[20]; + step2[6] = in[12]; + step2[7] = in[28]; + step2[8] = in[2]; + step2[9] = in[18]; + step2[10] = in[10]; + step2[11] = in[26]; + step2[12] = in[6]; + step2[13] = in[22]; + step2[14] = in[14]; + step2[15] = in[30]; + + // stage 3 + + btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]); + btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]); + btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]); + btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + step1[4] = step2[4]; + step1[5] = step2[5]; + step1[6] = step2[6]; + step1[7] = step2[7]; + + step1[16] = vqaddq_s16(step2[16], step2[17]); + step1[17] = vqsubq_s16(step2[16], step2[17]); + step1[18] = vqsubq_s16(step2[19], step2[18]); + step1[19] = vqaddq_s16(step2[19], step2[18]); + step1[20] = vqaddq_s16(step2[20], step2[21]); + step1[21] = vqsubq_s16(step2[20], step2[21]); + step1[22] = vqsubq_s16(step2[23], step2[22]); + step1[23] = vqaddq_s16(step2[23], step2[22]); + step1[24] = vqaddq_s16(step2[24], step2[25]); + step1[25] = vqsubq_s16(step2[24], step2[25]); + step1[26] = vqsubq_s16(step2[27], step2[26]); + step1[27] = vqaddq_s16(step2[27], step2[26]); + step1[28] = vqaddq_s16(step2[28], step2[29]); + step1[29] = vqsubq_s16(step2[28], step2[29]); + step1[30] = vqsubq_s16(step2[31], step2[30]); + step1[31] = vqaddq_s16(step2[31], step2[30]); + + // stage 4 + + btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]); + btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]); + btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c6, + &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c6, + &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[8] = vqaddq_s16(step1[8], step1[9]); + step2[9] = vqsubq_s16(step1[8], step1[9]); + step2[10] = vqsubq_s16(step1[11], step1[10]); + step2[11] = vqaddq_s16(step1[11], step1[10]); + step2[12] = vqaddq_s16(step1[12], step1[13]); + step2[13] = vqsubq_s16(step1[12], step1[13]); + step2[14] = vqsubq_s16(step1[15], step1[14]); + step2[15] = vqaddq_s16(step1[15], step1[14]); + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]); + btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]); + btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c7, + &step1[10], &step1[13]); + + step1[4] = vqaddq_s16(step2[4], step2[5]); + step1[5] = vqsubq_s16(step2[4], step2[5]); + step1[6] = vqsubq_s16(step2[7], step2[6]); + step1[7] = vqaddq_s16(step2[7], step2[6]); + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c7, + &step2[20], &step2[27]); + btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c7, + &step2[21], &step2[26]); + + step2[0] = vqaddq_s16(step1[0], step1[3]); + step2[1] = vqaddq_s16(step1[1], step1[2]); + step2[2] = vqsubq_s16(step1[1], step1[2]); + step2[3] = vqsubq_s16(step1[0], step1[3]); + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} + +static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; + out[16] = step1; + out[17] = step1; + out[18] = step1; + out[19] = step1; + out[20] = step1; + out[21] = step1; + out[22] = step1; + out[23] = step1; + out[24] = step1; + out[25] = step1; + out[26] = step1; + out[27] = step1; + out[28] = step1; + out[29] = step1; + out[30] = step1; + out[31] = step1; +} + +static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + int32x4_t t32[16]; + const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + const int16x4_t c1 = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[4] = in[4]; + step2[8] = in[2]; + step2[12] = in[6]; + + btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); + btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); + btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); + btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]); + + // stage 3 + step1[0] = step2[0]; + step1[4] = step2[4]; + + btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); + btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); + + step1[16] = step2[16]; + step1[17] = step2[16]; + step1[18] = step2[19]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[21] = step2[20]; + step1[22] = step2[23]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[24]; + step1[26] = step2[27]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[29] = step2[28]; + step1[30] = step2[31]; + step1[31] = step2[31]; + + // stage 4 + + btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); + btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0, + &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0, + &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[8] = step1[8]; + step2[9] = step1[8]; + step2[10] = step1[11]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[13] = step1[12]; + step2[14] = step1[15]; + step2[15] = step1[15]; + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); + step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1, + &step1[10], &step1[13]); + + step1[4] = step2[4]; + step1[5] = step2[4]; + step1[6] = step2[7]; + step1[7] = step2[7]; + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1, + &step2[20], &step2[27]); + btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1, + &step2[21], &step2[26]); + + step2[0] = step1[0]; + step2[1] = step1[0]; + step2[2] = step1[0]; + step2[3] = step1[0]; + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} + +static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + int32x4_t t32[16]; + const int16x4_t c0 = + create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56), + (int16_t *)(cospi + 40), (int16_t *)(cospi + 24)); + const int16x4_t c1 = + create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32), + (int16_t *)(cospi + 16), (int16_t *)(cospi + 48)); + + // stage 1 + // stage 2 + + btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); + btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]); + btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]); + btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); + btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); + btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]); + btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]); + btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]); + + step2[0] = in[0]; + step2[2] = in[8]; + step2[4] = in[4]; + step2[6] = in[12]; + step2[8] = in[2]; + step2[10] = in[10]; + step2[12] = in[6]; + step2[14] = in[14]; + + // stage 3 + + btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); + btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]); + btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]); + btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = step2[4]; + step1[6] = step2[6]; + step1[16] = vqaddq_s16(step2[16], step2[17]); + step1[17] = vqsubq_s16(step2[16], step2[17]); + step1[18] = vqsubq_s16(step2[19], step2[18]); + step1[19] = vqaddq_s16(step2[19], step2[18]); + step1[20] = vqaddq_s16(step2[20], step2[21]); + step1[21] = vqsubq_s16(step2[20], step2[21]); + step1[22] = vqsubq_s16(step2[23], step2[22]); + step1[23] = vqaddq_s16(step2[23], step2[22]); + step1[24] = vqaddq_s16(step2[24], step2[25]); + step1[25] = vqsubq_s16(step2[24], step2[25]); + step1[26] = vqsubq_s16(step2[27], step2[26]); + step1[27] = vqaddq_s16(step2[27], step2[26]); + step1[28] = vqaddq_s16(step2[28], step2[29]); + step1[29] = vqsubq_s16(step2[28], step2[29]); + step1[30] = vqsubq_s16(step2[31], step2[30]); + step1[31] = vqaddq_s16(step2[31], step2[30]); + + // stage 4 + + btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); + btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]); + btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0, + &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0, + &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[8] = vqaddq_s16(step1[8], step1[9]); + step2[9] = vqsubq_s16(step1[8], step1[9]); + step2[10] = vqsubq_s16(step1[11], step1[10]); + step2[11] = vqaddq_s16(step1[11], step1[10]); + step2[12] = vqaddq_s16(step1[12], step1[13]); + step2[13] = vqsubq_s16(step1[12], step1[13]); + step2[14] = vqsubq_s16(step1[15], step1[14]); + step2[15] = vqaddq_s16(step1[15], step1[14]); + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); + + step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]); + btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1, + &step1[10], &step1[13]); + + step1[4] = vqaddq_s16(step2[4], step2[5]); + step1[5] = vqsubq_s16(step2[4], step2[5]); + step1[6] = vqsubq_s16(step2[7], step2[6]); + step1[7] = vqaddq_s16(step2[7], step2[6]); + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1, + &step2[20], &step2[27]); + btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1, + &step2[21], &step2[26]); + + step2[0] = vqaddq_s16(step1[0], step1[3]); + step2[1] = vqaddq_s16(step1[0], step1[2]); + step2[2] = vqsubq_s16(step1[0], step1[2]); + step2[3] = vqsubq_s16(step1[0], step1[3]); + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} + // Functions for blocks with eob at DC and within // topleft 8x8, 16x16, 32x32 corner static const transform_1d_neon @@ -90,10 +2112,37 @@ static const transform_1d_neon { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; -static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, - uint8_t *output, int stride, - TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { + +static const transform_neon + lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct8_low1_new_neon, idct8_new_neon, NULL, NULL }, + { iadst8_low1_new_neon, iadst8_new_neon, NULL, NULL }, + { identity8_new_neon, identity8_new_neon, NULL, NULL } }, + { + { idct16_low1_new_neon, idct16_low8_new_neon, idct16_new_neon, NULL }, + { iadst16_low1_new_neon, iadst16_low8_new_neon, iadst16_new_neon, + NULL }, + { identity16_new_neon, identity16_new_neon, identity16_new_neon, + NULL }, + }, + { { idct32_low1_new_neon, idct32_low8_new_neon, idct32_low16_new_neon, + idct32_new_neon }, + { NULL, NULL, NULL, NULL }, + { identity32_new_neon, identity32_new_neon, identity32_new_neon, + identity32_new_neon } }, + { { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +static INLINE void lowbd_inv_txfm2d_add_wxh_idtx_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); int32_t *temp_in = txfm_buf; @@ -160,7 +2209,79 @@ static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, } } -static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( +static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[32 * 4]; + int16x8_t b[32 * 4]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const int32_t *input_1; + int temp_b = 0; + const transform_neon row_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input += (txfm_size_col * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], txfm_size_col); + } + row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); + av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, + -shift[0]); + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon( + &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_v_wxh_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); @@ -244,7 +2365,88 @@ static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( } } -static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( +static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[16 * 2]; + int16x8_t b[16 * 2]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const int32_t *input_1; + int temp_b = 0; + const transform_neon row_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input += (txfm_size_col * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], txfm_size_col); + } + row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); + av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, + -shift[0]); + if (lr_flip == 1) { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + flip_buf_ud_neon(&a[k], 8); + transpose_s16_8x8q( + &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); + } + temp_b += 8; + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon( + &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_h_wxh_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); @@ -328,6 +2530,78 @@ static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( } } +static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[16 * 2]; + int16x8_t b[16 * 2]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const int32_t *input_1; + int temp_b = 0; + const transform_neon row_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input += (txfm_size_col * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], txfm_size_col); + } + row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); + av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, + -shift[0]); + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); + } +} + static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, @@ -644,7 +2918,7 @@ void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output, } } -static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( +static INLINE void lowbd_inv_txfm2d_add_wxh_no_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]); @@ -727,6 +3001,118 @@ static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( } } +static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[64 * 8]; + int16x8_t b[64 * 8]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const int32_t *input_1; + int temp_b = 0; + + const transform_neon row_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input += (txfm_size_col * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], txfm_size_col); + } + row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); + av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, + -shift[0]); + if (lr_flip == 1) { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + flip_buf_ud_neon(&a[k], 8); + transpose_s16_8x8q( + &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); + } + temp_b += 8; + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_wxh_universe_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_type) { + case IDTX: + lowbd_inv_txfm2d_add_wxh_idtx_neon(input, output, stride, tx_type, + tx_size, eob); + break; + + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_wxh_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_wxh_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + + default: + lowbd_inv_txfm2d_add_wxh_no_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + static INLINE void lowbd_inv_txfm2d_add_universe_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { @@ -756,6 +3142,7 @@ static INLINE void lowbd_inv_txfm2d_add_universe_neon( break; } } + void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { @@ -787,8 +3174,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, break; case TX_16X64: { - lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, - tx_size, eob); + lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type, + tx_size, eob); } break; case TX_64X16: { @@ -797,13 +3184,13 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); } - lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type, - tx_size, eob); + lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type, + tx_size, eob); } break; case TX_32X64: { - lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, - tx_size, eob); + lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type, + tx_size, eob); } break; case TX_64X32: { @@ -812,8 +3199,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); } - lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type, - tx_size, eob); + lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type, + tx_size, eob); } break; case TX_64X64: { @@ -822,8 +3209,8 @@ void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); } - lowbd_inv_txfm2d_add_universe_neon(mod_input, output, stride, tx_type, - tx_size, eob); + lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type, + tx_size, eob); } break; default: diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h index 6af2d61e7..9ec658291 100644 --- a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h +++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ -#define AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ +#ifndef AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ +#define AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ #include "config/aom_config.h" #include "config/av1_rtcd.h" @@ -23,6 +23,8 @@ typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output, const int8_t cos_bit, const int8_t *stage_ptr); +typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output, + int8_t cos_bit, int bit); DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, @@ -149,4 +151,4 @@ static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, *eoby = eob_fill[temp_eoby]; } -#endif // AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ +#endif // AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ diff --git a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c index 0d8233744..7134f183e 100644 --- a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c +++ b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c @@ -34,8 +34,8 @@ void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride, uint8x8_t tmp0, tmp1; uint8x16_t res_q; uint16x8_t res, res_low, res_high; - uint32x2_t tmp0_32, tmp1_32; - uint16x4_t tmp0_16, tmp1_16; + uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0); + uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0); const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64); if (w >= 16) { diff --git a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c index 33b06b767..194e94c8c 100644 --- a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c +++ b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c @@ -27,8 +27,8 @@ void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride, uint8x8_t tmp0, tmp1; uint8x16_t tmp0_q, tmp1_q, res_q; uint16x8_t res, res_low, res_high; - uint32x2_t tmp0_32, tmp1_32; - uint16x4_t tmp0_16, tmp1_16; + uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0); + uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0); assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); diff --git a/third_party/aom/av1/common/arm/cfl_neon.c b/third_party/aom/av1/common/arm/cfl_neon.c index d731b6a66..39025b5e5 100644 --- a/third_party/aom/av1/common/arm/cfl_neon.c +++ b/third_party/aom/av1/common/arm/cfl_neon.c @@ -131,7 +131,7 @@ static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input, } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } -#if __ARM_ARCH <= 7 +#ifndef __aarch64__ uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)), vpadd_u16(vget_low_u16(b), vget_high_u16(b))); @@ -311,7 +311,7 @@ static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst, // Permute and add in such a way that each lane contains the block sum. // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A] -#if __ARM_ARCH >= 8 +#ifdef __aarch64__ sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); #else diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c index f15744c94..d0c4f8ff6 100644 --- a/third_party/aom/av1/common/arm/convolve_neon.c +++ b/third_party/aom/av1/common/arm/convolve_neon.c @@ -13,6 +13,8 @@ #include #include +#include "config/av1_rtcd.h" + #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" #include "av1/common/convolve.h" @@ -68,6 +70,33 @@ static INLINE uint8x8_t convolve8_horiz_8x8( return vqmovun_s16(sum); } +#if !defined(__aarch64__) +static INLINE uint8x8_t convolve8_horiz_4x1( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16_t *filter, + const int16x4_t shift_round_0, const int16x4_t shift_by_bits) { + int16x4_t sum; + + sum = vmul_n_s16(s0, filter[0]); + sum = vmla_n_s16(sum, s1, filter[1]); + sum = vmla_n_s16(sum, s2, filter[2]); + sum = vmla_n_s16(sum, s5, filter[5]); + sum = vmla_n_s16(sum, s6, filter[6]); + sum = vmla_n_s16(sum, s7, filter[7]); + /* filter[3] can take a max value of 128. So the max value of the result : + * 128*255 + sum > 16 bits + */ + sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3])); + sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4])); + + sum = vqrshl_s16(sum, shift_round_0); + sum = vqrshl_s16(sum, shift_by_bits); + + return vqmovun_s16(vcombine_s16(sum, sum)); +} +#endif // !defined(__arch64__) + static INLINE uint8x8_t convolve8_vert_8x4( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, @@ -175,7 +204,10 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, (void)conv_params; (void)filter_params_y; - uint8x8_t t0, t1, t2, t3; + uint8x8_t t0; +#if defined(__aarch64__) + uint8x8_t t1, t2, t3; +#endif assert(bits >= 0); assert((FILTER_BITS - conv_params->round_1) >= 0 || @@ -188,7 +220,7 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, const int16x8_t shift_by_bits = vdupq_n_s16(-bits); src -= horiz_offset; - +#if defined(__aarch64__) if (h == 4) { uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; @@ -275,12 +307,18 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, w -= 4; } while (w > 0); } else { +#endif int width; const uint8_t *s; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + +#if defined(__aarch64__) + int16x8_t s8, s9, s10; uint8x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; +#endif if (w <= 4) { +#if defined(__aarch64__) do { load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); @@ -387,10 +425,49 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, } h -= 8; } while (h > 0); +#else + int16x8_t tt0; + int16x4_t x0, x1, x2, x3, x4, x5, x6, x7; + const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0); + const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits); + do { + t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + x0 = vget_low_s16(tt0); // a0 a1 a2 a3 + x4 = vget_high_s16(tt0); // a4 a5 a6 a7 + + t0 = vld1_u8(src + 8); // a8 a9 a10 a11 a12 a13 a14 a15 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + x7 = vget_low_s16(tt0); // a8 a9 a10 a11 + + x1 = vext_s16(x0, x4, 1); // a1 a2 a3 a4 + x2 = vext_s16(x0, x4, 2); // a2 a3 a4 a5 + x3 = vext_s16(x0, x4, 3); // a3 a4 a5 a6 + x5 = vext_s16(x4, x7, 1); // a5 a6 a7 a8 + x6 = vext_s16(x4, x7, 2); // a6 a7 a8 a9 + x7 = vext_s16(x4, x7, 3); // a7 a8 a9 a10 + + src += src_stride; + + t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter, + shift_round_0_low, shift_by_bits_low); + + if (w == 4) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + dst += dst_stride; + } else if (w == 2) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01 + dst += dst_stride; + } + h -= 1; + } while (h > 0); +#endif } else { uint8_t *d; - int16x8_t s11, s12, s13, s14; - + int16x8_t s11; +#if defined(__aarch64__) + int16x8_t s12, s13, s14; do { __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); @@ -479,8 +556,47 @@ void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, dst += 8 * dst_stride; h -= 8; } while (h > 0); +#else + do { + t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + width = w; + s = src + 8; + d = dst; + __builtin_prefetch(dst); + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s11 = s0; + s0 = s7; + + s1 = vextq_s16(s11, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(s11, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(s11, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(s11, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(s11, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(s11, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(s11, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter, + shift_round_0, shift_by_bits); + vst1_u8(d, t0); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += src_stride; + dst += dst_stride; + h -= 1; + } while (h > 0); +#endif } +#if defined(__aarch64__) } +#endif } void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, @@ -505,9 +621,12 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, filter_params_y, subpel_y_q4 & SUBPEL_MASK); if (w <= 4) { - uint8x8_t d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - + uint8x8_t d01; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; +#if defined(__aarch64__) + uint8x8_t d23; + int16x4_t s8, s9, s10, d1, d2, d3; +#endif s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); src += src_stride; s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); @@ -526,6 +645,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, do { s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); src += src_stride; +#if defined(__aarch64__) s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); src += src_stride; s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); @@ -591,14 +711,41 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, s5 = s9; s6 = s10; h -= 4; +#else + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + + d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + + d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS); + + if (w == 4) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + } else if (w == 2) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0); + dst += dst_stride; + } + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + h -= 1; +#endif } while (h > 0); } else { int height; const uint8_t *s; uint8_t *d; - uint8x8_t t0, t1, t2, t3; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - + uint8x8_t t0; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; +#if defined(__aarch64__) + uint8x8_t t1, t2, t3; + int16x8_t s8, s9, s10; +#endif do { __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); @@ -628,6 +775,7 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, do { s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); s += src_stride; +#if defined(__aarch64__) s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); s += src_stride; s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); @@ -670,6 +818,24 @@ void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, s5 = s9; s6 = s10; height -= 4; +#else + __builtin_prefetch(d); + __builtin_prefetch(s); + + t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + + vst1_u8(d, t0); + d += dst_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; +#endif } while (height > 0); src += 8; dst += 8; @@ -686,7 +852,10 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, ConvolveParams *conv_params) { int im_dst_stride; int width, height; - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + uint8x8_t t0; +#if defined(__aarch64__) + uint8x8_t t1, t2, t3, t4, t5, t6, t7; +#endif DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]); @@ -724,13 +893,18 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, assert(conv_params->round_0 > 0); if (w <= 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; +#if defined(__aarch64__) + int16x4_t s8, s9, s10, d1, d2, d3; +#endif const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2))); const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1)); do { s = src_ptr; + +#if defined(__aarch64__) __builtin_prefetch(s + 0 * src_stride); __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); @@ -789,16 +963,56 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, src_ptr += 4 * src_stride; dst_ptr += 4 * im_dst_stride; height -= 4; +#else + int16x8_t tt0; + + __builtin_prefetch(s); + + t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s0 = vget_low_s16(tt0); + s4 = vget_high_s16(tt0); + + __builtin_prefetch(dst_ptr); + s += 8; + + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 + s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 + s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + + if (w == 4) { + vst1_s16(dst_ptr, d0); + dst_ptr += im_dst_stride; + } else if (w == 2) { + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0); + dst_ptr += im_dst_stride; + } + + src_ptr += src_stride; + height -= 1; +#endif } while (height > 0); } else { int16_t *d_tmp; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0; +#if defined(__aarch64__) + int16x8_t s8, s9, s10, res1, res2, res3, res4, res5, res6, res7; int16x8_t s11, s12, s13, s14; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; +#endif const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2))); const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1)); +#if defined(__aarch64__) do { __builtin_prefetch(src_ptr + 0 * src_stride); __builtin_prefetch(src_ptr + 1 * src_stride); @@ -886,6 +1100,45 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, dst_ptr += 8 * im_dst_stride; height -= 8; } while (height > 0); +#else + do { + t0 = vld1_u8(src_ptr); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + width = w; + s = src_ptr + 8; + d_tmp = dst_ptr; + + __builtin_prefetch(dst_ptr); + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t sum = s0; + s0 = s7; + + s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + + vst1q_s16(d_tmp, res0); + + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += im_dst_stride; + height -= 1; + } while (height > 0); +#endif } // vertical @@ -910,10 +1163,17 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, width = w; if (width <= 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x4_t d0, d1, d2, d3; - uint16x8_t dd0, dd1; - uint8x8_t d01, d23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + uint16x4_t d0; + uint16x8_t dd0; + uint8x8_t d01; + +#if defined(__aarch64__) + int16x4_t s8, s9, s10; + uint16x4_t d1, d2, d3; + uint16x8_t dd1; + uint8x8_t d23; +#endif d_u8 = dst_u8_ptr; v_s = v_src_ptr; @@ -931,6 +1191,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, v_s += (7 * im_stride); do { +#if defined(__aarch64__) load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10); v_s += (im_stride << 2); @@ -1008,11 +1269,48 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, s5 = s9; s6 = s10; height -= 4; +#else + s7 = vld1_s16(v_s); + v_s += im_stride; + + __builtin_prefetch(d_u8 + 0 * dst_stride); + + d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + + dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits); + d01 = vqmovn_u16(dd0); + + if (w == 4) { + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 0); // 00 01 02 03 + d_u8 += dst_stride; + + } else if (w == 2) { + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 0); // 00 01 + d_u8 += dst_stride; + } + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; +#endif } while (height > 0); } else { // if width is a multiple of 8 & height is a multiple of 4 - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x8_t res0, res1, res2, res3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + uint8x8_t res0; +#if defined(__aarch64__) + int16x8_t s8, s9, s10; + uint8x8_t res1, res2, res3; +#endif do { __builtin_prefetch(v_src_ptr + 0 * im_stride); @@ -1032,6 +1330,7 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, height = h; do { +#if defined(__aarch64__) load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10); v_s += (im_stride << 2); @@ -1076,6 +1375,28 @@ void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, s5 = s9; s6 = s10; height -= 4; +#else + s7 = vld1q_s16(v_s); + v_s += im_stride; + + __builtin_prefetch(d_u8 + 0 * dst_stride); + + res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + + vst1_u8(d_u8, res0); + d_u8 += dst_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; +#endif } while (height > 0); v_src_ptr += 8; dst_u8_ptr += 8; diff --git a/third_party/aom/av1/common/arm/convolve_neon.h b/third_party/aom/av1/common/arm/convolve_neon.h index 47c93d645..f382984f2 100644 --- a/third_party/aom/av1/common/arm/convolve_neon.h +++ b/third_party/aom/av1/common/arm/convolve_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef AV1_COMMON_ARM_CONVOLVE_NEON_H_ -#define AV1_COMMON_ARM_CONVOLVE_NEON_H_ +#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ +#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ #include @@ -225,4 +225,4 @@ static INLINE uint16x4_t convolve8_4x4_s32( return res; } -#endif // AV1_COMMON_ARM_CONVOLVE_NEON_H_ +#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/jnt_convolve_neon.c b/third_party/aom/av1/common/arm/jnt_convolve_neon.c index 4015082b4..e5674ef7c 100644 --- a/third_party/aom/av1/common/arm/jnt_convolve_neon.c +++ b/third_party/aom/av1/common/arm/jnt_convolve_neon.c @@ -22,12 +22,108 @@ #include "av1/common/arm/mem_neon.h" #include "av1/common/arm/transpose_neon.h" +#if !defined(__aarch64__) +static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0, + const uint16_t fwd_offset, + const uint16_t bck_offset, + const int16x4_t sub_const_vec, + const int16_t round_bits, + const int use_jnt_comp_avg, uint8x8_t *t0) { + int16x4_t tmp0; + uint16x4_t tmp_u0; + uint32x4_t sum0; + int32x4_t dst0; + int16x8_t tmp4; + + if (use_jnt_comp_avg) { + const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits)); + + sum0 = vmull_n_u16(res0, fwd_offset); + sum0 = vmlal_n_u16(sum0, d0, bck_offset); + + sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS); + + dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), vmovl_s16(sub_const_vec)); + + dst0 = vqrshlq_s32(dst0, round_bits_vec); + + tmp0 = vqmovn_s32(dst0); + tmp4 = vcombine_s16(tmp0, tmp0); + + *t0 = vqmovun_s16(tmp4); + } else { + const int16x4_t round_bits_vec = vdup_n_s16(-round_bits); + tmp_u0 = vhadd_u16(res0, d0); + + tmp0 = vsub_s16(vreinterpret_s16_u16(tmp_u0), sub_const_vec); + + tmp0 = vqrshl_s16(tmp0, round_bits_vec); + + tmp4 = vcombine_s16(tmp0, tmp0); + + *t0 = vqmovun_s16(tmp4); + } +} + +static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0, + const uint16_t fwd_offset, + const uint16_t bck_offset, + const int16x4_t sub_const, + const int16_t round_bits, + const int use_jnt_comp_avg, uint8x8_t *t0) { + int16x4_t tmp0, tmp2; + int16x8_t f0; + uint32x4_t sum0, sum2; + int32x4_t dst0, dst2; + + uint16x8_t tmp_u0; + + if (use_jnt_comp_avg) { + const int32x4_t sub_const_vec = vmovl_s16(sub_const); + const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits); + + sum0 = vmull_n_u16(vget_low_u16(res0), fwd_offset); + sum0 = vmlal_n_u16(sum0, vget_low_u16(d0), bck_offset); + sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS); + + sum2 = vmull_n_u16(vget_high_u16(res0), fwd_offset); + sum2 = vmlal_n_u16(sum2, vget_high_u16(d0), bck_offset); + sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS); + + dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), sub_const_vec); + dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), sub_const_vec); + + dst0 = vqrshlq_s32(dst0, round_bits_vec); + dst2 = vqrshlq_s32(dst2, round_bits_vec); + + tmp0 = vqmovn_s32(dst0); + tmp2 = vqmovn_s32(dst2); + + f0 = vcombine_s16(tmp0, tmp2); + + *t0 = vqmovun_s16(f0); + + } else { + const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const); + const int16x8_t round_bits_vec = vdupq_n_s16(-round_bits); + + tmp_u0 = vhaddq_u16(res0, d0); + + f0 = vsubq_s16(vreinterpretq_s16_u16(tmp_u0), sub_const_vec); + + f0 = vqrshlq_s16(f0, round_bits_vec); + + *t0 = vqmovun_s16(f0); + } +} +#endif // !defined(__arch64__) + static INLINE void compute_avg_4x4( uint16x4_t res0, uint16x4_t res1, uint16x4_t res2, uint16x4_t res3, uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x4_t sub_const_vec, const int16_t round_bits, - const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) { + const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) { int16x4_t tmp0, tmp1, tmp2, tmp3; uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3; uint32x4_t sum0, sum1, sum2, sum3; @@ -107,7 +203,7 @@ static INLINE void compute_avg_8x4( uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x4_t sub_const, const int16_t round_bits, - const int32_t use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2, + const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2, uint8x8_t *t3) { int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int16x8_t f0, f1, f2, f3; @@ -231,7 +327,6 @@ static INLINE void jnt_convolve_2d_horiz_neon( int16_t *dst_ptr; int dst_stride; int width, height; - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; dst_ptr = im_block; dst_stride = im_stride; @@ -239,15 +334,22 @@ static INLINE void jnt_convolve_2d_horiz_neon( width = w; if (w == 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - int16x8_t tt0, tt1, tt2, tt3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; + int16x8_t tt0; + uint8x8_t t0; const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2))); const int16x4_t shift_round_0 = vdup_n_s16(-(round_0)); +#if defined(__aarch64__) + int16x4_t s8, s9, s10, d1, d2, d3; + int16x8_t tt1, tt2, tt3; + uint8x8_t t1, t2, t3; +#endif do { s = src; __builtin_prefetch(s + 0 * src_stride); +#if defined(__aarch64__) __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); @@ -301,17 +403,48 @@ static INLINE void jnt_convolve_2d_horiz_neon( src += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; +#else + t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + s0 = vget_low_s16(tt0); // a0 a1 a2 a3 + s4 = vget_high_s16(tt0); // a4 a5 a6 a7 + __builtin_prefetch(dst_ptr); + s += 8; + t0 = vld1_u8(s); // a8 a9 a10 a11 + + // a8 a9 a10 a11 + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 + s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 + s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + + vst1_s16(dst_ptr, d0); + + src += src_stride; + dst_ptr += dst_stride; + height -= 1; +#endif } while (height > 0); } else { int16_t *d_tmp; - int16x8_t s11, s12, s13, s14; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t res0; + uint8x8_t t0; const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2))); const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0)); - do { +#if defined(__aarch64__) + uint8x8_t t1, t2, t3, t4, t5, t6, t7; + int16x8_t s8, s9, s10, s11, s12, s13, s14; + int16x8_t res1, res2, res3, res4, res5, res6, res7; __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); @@ -390,6 +523,42 @@ static INLINE void jnt_convolve_2d_horiz_neon( src += 8 * src_stride; dst_ptr += 8 * dst_stride; height -= 8; +#else + int16x8_t temp_0; + t0 = vld1_u8(src); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + width = w; + s = src + 8; + d_tmp = dst_ptr; + __builtin_prefetch(dst_ptr); + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + temp_0 = s0; + s0 = s7; + + s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(temp_0, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7, + x_filter_tmp, horiz_const, shift_round_0); + vst1q_s16(d_tmp, res0); + + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src += src_stride; + dst_ptr += dst_stride; + height -= 1; +#endif } while (height > 0); } } @@ -420,10 +589,15 @@ static INLINE void jnt_convolve_2d_vert_neon( const int do_average = conv_params->do_average; const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x4_t res4, res5, res6, res7; - uint16x4_t d0, d1, d2, d3; - uint8x8_t t0, t1; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + uint16x4_t res4, d0; + uint8x8_t t0; + +#if defined(__aarch64__) + int16x4_t s8, s9, s10; + uint16x4_t res5, res6, res7, d1, d2, d3; + uint8x8_t t1; +#endif dst = conv_params->dst; src_ptr = im_block; @@ -450,6 +624,7 @@ static INLINE void jnt_convolve_2d_vert_neon( s += (7 * im_stride); do { +#if defined(__aarch64__) load_s16_4x4(s, im_stride, &s7, &s8, &s9, &s10); s += (im_stride << 2); @@ -480,17 +655,13 @@ static INLINE void jnt_convolve_2d_vert_neon( bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg, &t0, &t1); - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), - 0); // 00 01 02 03 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), - 1); // 10 11 12 13 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1); d_u8 += dst8_stride; - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), - 0); // 20 21 22 23 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0); d_u8 += dst8_stride; - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), - 1); // 30 31 32 33 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1); d_u8 += dst8_stride; } else { @@ -505,6 +676,39 @@ static INLINE void jnt_convolve_2d_vert_neon( s5 = s9; s6 = s10; height -= 4; +#else + s7 = vld1_s16(s); + s += (im_stride); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d_u8 + 0 * dst8_stride); + + d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_shift_vec, offset_const); + + if (do_average) { + res4 = vld1_u16(d); + d += (dst_stride); + + compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec, + round_bits, use_jnt_comp_avg, &t0); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); + d_u8 += dst8_stride; + + } else { + vst1_u16(d, d0); + d += (dst_stride); + } + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height--; +#endif } while (height > 0); src_ptr += 4; dst_ptr += 4; @@ -722,8 +926,10 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, uint8_t *dst_u8_ptr; CONV_BUF_TYPE *d, *dst_ptr; int width, height; - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; - + uint8x8_t t0; +#if defined(__aarch64__) + uint8x8_t t1, t2, t3, t4, t5, t6, t7; +#endif s = src_ptr; dst_ptr = dst; dst_u8_ptr = dst8; @@ -731,11 +937,18 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, height = h; if ((w == 4) || (h == 4)) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - int16x8_t tt0, tt1, tt2, tt3; - uint16x4_t res4, res5, res6, res7; - uint32x2_t tu0, tu1; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; + int16x8_t tt0; + uint16x4_t res4; +#if defined(__aarch64__) + int16x4_t s8, s9, s10, d1, d2, d3; + int16x8_t tt1, tt2, tt3; + uint16x4_t res5, res6, res7; + uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0); int16x8_t u0, u1; +#else + int16x4_t temp_0; +#endif const int16x4_t zero = vdup_n_s16(0); const int16x4_t round_offset_vec = vdup_n_s16(round_offset); const int16x4_t shift_round_0 = vdup_n_s16(-conv_params->round_0 + 1); @@ -746,6 +959,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, d_u8 = dst_u8_ptr; width = w; __builtin_prefetch(s + 0 * src_stride); +#if defined(__aarch64__) __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); @@ -854,15 +1068,66 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, dst_ptr += (dst_stride << 2); dst_u8_ptr += (dst8_stride << 2); height -= 4; +#else + t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + s0 = vget_low_s16(tt0); // a0 a1 a2 a3 + s4 = vget_high_s16(tt0); // a4 a5 a6 a7 + __builtin_prefetch(d); + + s += 8; + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 + + // a8 a9 a10 a11 + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + temp_0 = s7; + s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 + s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 + s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + zero, shift_round_0); + d0 = vrshl_s16(d0, horiz_const); + d0 = vadd_s16(d0, round_offset_vec); + s0 = s4; + s4 = temp_0; + if (conv_params->do_average) { + __builtin_prefetch(d); + __builtin_prefetch(d_u8); + + res4 = vld1_u16(d); + + compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset, + bck_offset, round_offset_vec, round_bits, + use_jnt_comp_avg, &t0); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + } else { + vst1_u16(d, vreinterpret_u16_s16(d0)); + } + + s += 4; + width -= 4; + d += 4; + d_u8 += 4; + } while (width > 0); + src_ptr += (src_stride); + dst_ptr += (dst_stride); + dst_u8_ptr += (dst8_stride); + height--; +#endif } while (height > 0); } else { CONV_BUF_TYPE *d_tmp; uint8_t *d_u8_tmp; - int16x8_t s11, s12, s13, s14; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; - uint16x8_t res8, res9, res10, res11; - + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t res0; + uint16x8_t res8; const int16x8_t round_offset128 = vdupq_n_s16(round_offset); const int16x4_t round_offset64 = vdup_n_s16(round_offset); const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1); @@ -872,6 +1137,11 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, d = dst_ptr = dst; d_u8 = dst_u8_ptr = dst8; do { +#if defined(__aarch64__) + int16x8_t s11, s12, s13, s14; + int16x8_t s8, s9, s10; + int16x8_t res1, res2, res3, res4, res5, res6, res7; + uint16x8_t res9, res10, res11; __builtin_prefetch(src_ptr + 0 * src_stride); __builtin_prefetch(src_ptr + 1 * src_stride); __builtin_prefetch(src_ptr + 2 * src_stride); @@ -1007,6 +1277,67 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, dst_ptr += 8 * dst_stride; dst_u8_ptr += 8 * dst8_stride; height -= 8; +#else + int16x8_t temp_0; + __builtin_prefetch(src_ptr); + t0 = vld1_u8(src_ptr); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + width = w; + s = src_ptr + 8; + d = dst_ptr; + d_u8_tmp = dst_u8_ptr; + + __builtin_prefetch(dst_ptr); + + do { + d_u8 = d_u8_tmp; + d_tmp = d; + + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + temp_0 = s0; + s0 = s7; + + s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(temp_0, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7, + x_filter_tmp, zero, shift_round_0); + + res0 = vrshlq_s16(res0, horiz_const); + res0 = vaddq_s16(res0, round_offset128); + + if (conv_params->do_average) { + res8 = vld1q_u16(d_tmp); + d_tmp += (dst_stride); + + compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset, + bck_offset, round_offset64, round_bits, + use_jnt_comp_avg, &t0); + + vst1_u8(d_u8, t0); + d_u8 += (dst8_stride); + } else { + vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0)); + d_tmp += (dst_stride); + } + + s += 8; + d += 8; + width -= 8; + d_u8_tmp += 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + dst_u8_ptr += dst8_stride; + height--; +#endif } while (height > 0); } } @@ -1057,7 +1388,6 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, uint8_t *dst_u8_ptr; CONV_BUF_TYPE *d, *dst_ptr; int width, height; - uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; s = src_ptr; dst_ptr = dst; @@ -1070,11 +1400,18 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, assert((conv_params->round_1 - 2) >= bits); if ((w == 4) || (h == 4)) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - uint16x4_t res4, res5, res6, res7; - uint32x2_t tu0, tu1, tu2, tu3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; + uint16x4_t res4; + uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0), + tu3 = vdup_n_u32(0); int16x8_t u0, u1, u2, u3; + uint8x8_t t0; +#if defined(__aarch64__) + int16x4_t s8, s9, s10, d1, d2, d3; + uint16x4_t res5, res6, res7; + uint8x8_t t1; +#endif const int16x4_t round_offset64 = vdup_n_s16(round_offset); const int16x4_t shift_vec = vdup_n_s16(-shift_value); const int16x4_t zero = vdup_n_s16(0); @@ -1111,6 +1448,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, s += (7 * src_stride); do { +#if defined(__aarch64__) load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1); u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0))); @@ -1154,17 +1492,13 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1); - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), - 0); // 00 01 02 03 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), - 1); // 10 11 12 13 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1); d_u8 += dst8_stride; - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), - 0); // 20 21 22 23 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0); d_u8 += dst8_stride; - vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), - 1); // 30 31 32 33 + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1); d_u8 += dst8_stride; } else { store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0), @@ -1183,6 +1517,44 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, s += (src_stride << 2); height -= 4; +#else + load_unaligned_u8_4x1(s, src_stride, &tu0); + u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0))); + s7 = vget_low_s16(u0); + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp, + zero, shift_vec); + + d0 = vadd_s16(d0, round_offset64); + + if (conv_params->do_average) { + __builtin_prefetch(d); + + res4 = vld1_u16(d); + d += (dst_stride); + + compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset, + bck_offset, round_offset64, round_bits, + use_jnt_comp_avg, &t0); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); + d_u8 += dst8_stride; + } else { + vst1_u16(d, vreinterpret_u16_s16(d0)); + d += (dst_stride); + } + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + + s += (src_stride); + height--; +#endif } while (height > 0); src_ptr += 4; dst_ptr += 4; @@ -1191,15 +1563,19 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, } while (width > 0); } else { CONV_BUF_TYPE *d_tmp; - int16x8_t s11, s12, s13, s14; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; - uint16x8_t res8, res9, res10, res11; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t res0; + uint16x8_t res8; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; const int16x8_t round_offset128 = vdupq_n_s16(round_offset); const int16x8_t shift_vec = vdupq_n_s16(-shift_value); const int16x4_t round_offset64 = vdup_n_s16(round_offset); const int16x8_t zero = vdupq_n_s16(0); - +#if defined(__aarch64__) + int16x8_t s8, s9, s10, s11, s12, s13, s14; + int16x8_t res1, res2, res3, res4, res5, res6, res7; + uint16x8_t res10, res11, res9; +#endif dst_ptr = dst; dst_u8_ptr = dst8; do { @@ -1227,6 +1603,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, d_u8 = dst_u8_ptr; do { +#if defined(__aarch64__) load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); @@ -1316,6 +1693,43 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, s6 = s14; s += (8 * src_stride); height -= 8; +#else + s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + __builtin_prefetch(dst_ptr); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp, + zero, shift_vec); + res0 = vaddq_s16(res0, round_offset128); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + + if (conv_params->do_average) { + __builtin_prefetch(d_tmp); + + res8 = vld1q_u16(d_tmp); + d_tmp += (dst_stride); + + compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset, + bck_offset, round_offset64, round_bits, + use_jnt_comp_avg, &t0); + + vst1_u8(d_u8, t0); + d_u8 += (dst8_stride); + } else { + vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0)); + d_tmp += dst_stride; + } + + s += (src_stride); + height--; +#endif } while (height > 0); src_ptr += 8; dst_ptr += 8; diff --git a/third_party/aom/av1/common/arm/mem_neon.h b/third_party/aom/av1/common/arm/mem_neon.h index 4bf45a52c..c4ae2e784 100644 --- a/third_party/aom/av1/common/arm/mem_neon.h +++ b/third_party/aom/av1/common/arm/mem_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef AV1_COMMON_ARM_MEM_NEON_H_ -#define AV1_COMMON_ARM_MEM_NEON_H_ +#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_ +#define AOM_AV1_COMMON_ARM_MEM_NEON_H_ #include #include @@ -362,6 +362,15 @@ static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride, *tu1 = vset_lane_u32(a, *tu1, 1); } +static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride, + uint32x2_t *tu0) { + uint32_t a; + + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 0); +} + static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride, uint32x2_t *tu0) { uint32_t a; @@ -482,4 +491,4 @@ static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1, vst1q_u32(s, s4); } -#endif // AV1_COMMON_ARM_MEM_NEON_H_ +#endif // AOM_AV1_COMMON_ARM_MEM_NEON_H_ diff --git a/third_party/aom/av1/common/arm/selfguided_neon.c b/third_party/aom/av1/common/arm/selfguided_neon.c index b4808a972..b3a37c4cb 100644 --- a/third_party/aom/av1/common/arm/selfguided_neon.c +++ b/third_party/aom/av1/common/arm/selfguided_neon.c @@ -1007,10 +1007,11 @@ static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0, vaddq_u32(vmovl_u16(vget_high_u16(xl)), vmovl_u16(vget_high_u16(x)))); } -void final_filter_fast_internal(uint16_t *A, int32_t *B, const int buf_stride, - int16_t *src, const int src_stride, - int32_t *dst, const int dst_stride, - const int width, const int height) { +static void final_filter_fast_internal(uint16_t *A, int32_t *B, + const int buf_stride, int16_t *src, + const int src_stride, int32_t *dst, + const int dst_stride, const int width, + const int height) { int16x8_t s0; int32_t *B_tmp, *dst_ptr; uint16_t *A_tmp; @@ -1340,10 +1341,10 @@ static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride, } } -void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, - int stride, int32_t *flt0, int32_t *flt1, - int flt_stride, int sgr_params_idx, - int bit_depth, int highbd) { +int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, + int stride, int32_t *flt0, int32_t *flt1, + int flt_stride, int sgr_params_idx, + int bit_depth, int highbd) { const sgr_params_type *const params = &sgr_params[sgr_params_idx]; assert(!(params->r[0] == 0 && params->r[1] == 0)); @@ -1376,6 +1377,7 @@ void av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, if (params->r[1] > 0) restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride, bit_depth, sgr_params_idx, 1); + return 0; } void apply_selfguided_restoration_neon(const uint8_t *dat8, int width, diff --git a/third_party/aom/av1/common/arm/transpose_neon.h b/third_party/aom/av1/common/arm/transpose_neon.h index fe134087b..8a3d9f07f 100644 --- a/third_party/aom/av1/common/arm/transpose_neon.h +++ b/third_party/aom/av1/common/arm/transpose_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef AV1_COMMON_ARM_TRANSPOSE_NEON_H_ -#define AV1_COMMON_ARM_TRANSPOSE_NEON_H_ +#ifndef AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_ +#define AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_ #include @@ -386,6 +386,83 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, vget_high_s16(vreinterpretq_s16_s32(c3.val[1]))); } +static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { + int16x8x2_t b0; + b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), + vreinterpret_s16_s32(vget_low_s32(a1))); + b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), + vreinterpret_s16_s32(vget_high_s32(a1))); + return b0; +} + +static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1)); + const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3)); + const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5)); + const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7)); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + + *out = d0.val[0]; + *(out + 1) = d1.val[0]; + *(out + 2) = d2.val[0]; + *(out + 3) = d3.val[0]; + *(out + 4) = d0.val[1]; + *(out + 5) = d1.val[1]; + *(out + 6) = d2.val[1]; + *(out + 7) = d3.val[1]; +} + static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1, int16x4_t *a2, int16x4_t *a3) { // Swap 16 bit elements. Goes from: @@ -457,4 +534,4 @@ static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1, *a3 = c1.val[1]; } -#endif // AV1_COMMON_ARM_TRANSPOSE_NEON_H_ +#endif // AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.c b/third_party/aom/av1/common/arm/warp_plane_neon.c new file mode 100644 index 000000000..7f02d42a7 --- /dev/null +++ b/third_party/aom/av1/common/arm/warp_plane_neon.c @@ -0,0 +1,714 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" +#include "config/av1_rtcd.h" +#include "av1/common/warped_motion.h" +#include "av1/common/scale.h" + +/* This is a modified version of 'warped_filter' from warped_motion.c: + * Each coefficient is stored in 8 bits instead of 16 bits + * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 + + This is done in order to avoid overflow: Since the tap with the largest + coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation + order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular + convolve functions. + + Instead, we use the summation order + ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)). + The rearrangement of coefficients in this table is so that we can get the + coefficients into the correct order more quickly. +*/ +/* clang-format off */ +DECLARE_ALIGNED(8, static const int8_t, + filter_8bit_neon[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { +#if WARPEDPIXEL_PREC_BITS == 6 + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, + { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0}, + { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0}, + { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0}, + { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0}, + { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0}, + { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0}, + { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0}, + { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0}, + { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0}, + { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0}, + { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0}, + { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0}, + { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0}, + { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0}, + { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0}, + { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0}, + { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0}, + { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1}, + {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1}, + {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1}, + {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2}, + {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2}, + {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2}, + {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2}, + {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2}, + {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2}, + {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2}, + {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2}, + {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2}, + {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2}, + {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1}, + {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0}, + { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0}, + { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1}, + { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1}, + { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2}, + { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3}, + { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3}, + { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4}, + { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4}, + { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4}, + { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4}, + { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4}, + { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4}, + { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3}, + { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2}, + { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2}, + { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1}, + { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0}, + // dummy (replicate row index 191) + { 0, 0, 2, -1, 0, 0, 127, 0}, + +#else + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0}, + { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1}, + // dummy (replicate row index 95) + { 0, 0, 4, -3, 0, -1, 127, 1}, +#endif // WARPEDPIXEL_PREC_BITS == 6 +}; +/* clang-format on */ + +static INLINE void convolve(int32x2x2_t x0, int32x2x2_t x1, uint8x8_t src_0, + uint8x8_t src_1, int16x4_t *res) { + int16x8_t coeff_0, coeff_1; + int16x8_t pix_0, pix_1; + + coeff_0 = vcombine_s16(vreinterpret_s16_s32(x0.val[0]), + vreinterpret_s16_s32(x1.val[0])); + coeff_1 = vcombine_s16(vreinterpret_s16_s32(x0.val[1]), + vreinterpret_s16_s32(x1.val[1])); + + pix_0 = vreinterpretq_s16_u16(vmovl_u8(src_0)); + pix_0 = vmulq_s16(coeff_0, pix_0); + + pix_1 = vreinterpretq_s16_u16(vmovl_u8(src_1)); + pix_0 = vmlaq_s16(pix_0, coeff_1, pix_1); + + *res = vpadd_s16(vget_low_s16(pix_0), vget_high_s16(pix_0)); +} + +static INLINE void horizontal_filter_neon(uint8x16_t src_1, uint8x16_t src_2, + uint8x16_t src_3, uint8x16_t src_4, + int16x8_t *tmp_dst, int sx, int alpha, + int k, const int offset_bits_horiz, + const int reduce_bits_horiz) { + const uint8x16_t mask = { 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0 }; + const int32x4_t add_const = vdupq_n_s32((int32_t)(1 << offset_bits_horiz)); + const int16x8_t shift = vdupq_n_s16(-(int16_t)reduce_bits_horiz); + + int16x8_t f0, f1, f2, f3, f4, f5, f6, f7; + int32x2x2_t b0, b1; + uint8x8_t src_1_low, src_2_low, src_3_low, src_4_low, src_5_low, src_6_low; + int32x4_t tmp_res_low, tmp_res_high; + uint16x8_t res; + int16x4_t res_0246_even, res_0246_odd, res_1357_even, res_1357_odd; + + uint8x16_t tmp_0 = vandq_u8(src_1, mask); + uint8x16_t tmp_1 = vandq_u8(src_2, mask); + uint8x16_t tmp_2 = vandq_u8(src_3, mask); + uint8x16_t tmp_3 = vandq_u8(src_4, mask); + + tmp_2 = vextq_u8(tmp_0, tmp_0, 1); + tmp_3 = vextq_u8(tmp_1, tmp_1, 1); + + src_1 = vaddq_u8(tmp_0, tmp_2); + src_2 = vaddq_u8(tmp_1, tmp_3); + + src_1_low = vget_low_u8(src_1); + src_2_low = vget_low_u8(src_2); + src_3_low = vget_low_u8(vextq_u8(src_1, src_1, 4)); + src_4_low = vget_low_u8(vextq_u8(src_2, src_2, 4)); + src_5_low = vget_low_u8(vextq_u8(src_1, src_1, 2)); + src_6_low = vget_low_u8(vextq_u8(src_1, src_1, 6)); + + // Loading the 8 filter taps + f0 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS])); + f1 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS])); + f2 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS])); + f3 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS])); + f4 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS])); + f5 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS])); + f6 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS])); + f7 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS])); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f0)), + vreinterpret_s32_s16(vget_low_s16(f2))); + b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f4)), + vreinterpret_s32_s16(vget_low_s16(f6))); + convolve(b0, b1, src_1_low, src_3_low, &res_0246_even); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f1)), + vreinterpret_s32_s16(vget_low_s16(f3))); + b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f5)), + vreinterpret_s32_s16(vget_low_s16(f7))); + convolve(b0, b1, src_2_low, src_4_low, &res_0246_odd); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f0)), + vreinterpret_s32_s16(vget_high_s16(f2))); + b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f4)), + vreinterpret_s32_s16(vget_high_s16(f6))); + convolve(b0, b1, src_2_low, src_4_low, &res_1357_even); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f1)), + vreinterpret_s32_s16(vget_high_s16(f3))); + b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f5)), + vreinterpret_s32_s16(vget_high_s16(f7))); + convolve(b0, b1, src_5_low, src_6_low, &res_1357_odd); + + tmp_res_low = vaddl_s16(res_0246_even, res_1357_even); + tmp_res_high = vaddl_s16(res_0246_odd, res_1357_odd); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + res = vcombine_u16(vqmovun_s32(tmp_res_low), vqmovun_s32(tmp_res_high)); + res = vqrshlq_u16(res, shift); + + tmp_dst[k + 7] = vreinterpretq_s16_u16(res); +} + +static INLINE void vertical_filter_neon(const int16x8_t *src, + int32x4_t *res_low, int32x4_t *res_high, + int sy, int gamma) { + int16x4_t src_0, src_1, fltr_0, fltr_1; + int32x4_t res_0, res_1; + int32x2_t res_0_im, res_1_im; + int32x4_t res_even, res_odd, im_res_0, im_res_1; + + int16x8_t f0, f1, f2, f3, f4, f5, f6, f7; + int16x8x2_t b0, b1, b2, b3; + int32x4x2_t c0, c1, c2, c3; + int32x4x2_t d0, d1, d2, d3; + + b0 = vtrnq_s16(src[0], src[1]); + b1 = vtrnq_s16(src[2], src[3]); + b2 = vtrnq_s16(src[4], src[5]); + b3 = vtrnq_s16(src[6], src[7]); + + c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b0.val[1])); + c1 = vtrnq_s32(vreinterpretq_s32_s16(b1.val[0]), + vreinterpretq_s32_s16(b1.val[1])); + c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b2.val[1])); + c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]), + vreinterpretq_s32_s16(b3.val[1])); + + f0 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + f1 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + f2 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + f3 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + f4 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + f5 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + f6 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + f7 = vld1q_s16( + (int16_t *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2)); + d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6)); + d2 = vtrnq_s32(vreinterpretq_s32_s16(f1), vreinterpretq_s32_s16(f3)); + d3 = vtrnq_s32(vreinterpretq_s32_s16(f5), vreinterpretq_s32_s16(f7)); + + // row:0,1 even_col:0,2 + src_0 = vget_low_s16(vreinterpretq_s16_s32(c0.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:0,1,2,3 even_col:0,2 + src_0 = vget_low_s16(vreinterpretq_s16_s32(c1.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:0,1 even_col:4,6 + src_1 = vget_low_s16(vreinterpretq_s16_s32(c0.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:0,1,2,3 even_col:4,6 + src_1 = vget_low_s16(vreinterpretq_s16_s32(c1.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:0,1,2,3 even_col:0,2,4,6 + im_res_0 = vcombine_s32(res_0_im, res_1_im); + + // row:4,5 even_col:0,2 + src_0 = vget_low_s16(vreinterpretq_s16_s32(c2.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:4,5,6,7 even_col:0,2 + src_0 = vget_low_s16(vreinterpretq_s16_s32(c3.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:4,5 even_col:4,6 + src_1 = vget_low_s16(vreinterpretq_s16_s32(c2.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:4,5,6,7 even_col:4,6 + src_1 = vget_low_s16(vreinterpretq_s16_s32(c3.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:4,5,6,7 even_col:0,2,4,6 + im_res_1 = vcombine_s32(res_0_im, res_1_im); + + // row:0-7 even_col:0,2,4,6 + res_even = vaddq_s32(im_res_0, im_res_1); + + // row:0,1 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c0.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:0,1,2,3 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c1.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:0,1 odd_col:5,7 + src_1 = vget_high_s16(vreinterpretq_s16_s32(c0.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:0,1,2,3 odd_col:5,7 + src_1 = vget_high_s16(vreinterpretq_s16_s32(c1.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:0,1,2,3 odd_col:1,3,5,7 + im_res_0 = vcombine_s32(res_0_im, res_1_im); + + // row:4,5 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c2.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:4,5,6,7 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c3.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:4,5 odd_col:5,7 + src_1 = vget_high_s16(vreinterpretq_s16_s32(c2.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:4,5,6,7 odd_col:5,7 + src_1 = vget_high_s16(vreinterpretq_s16_s32(c3.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:4,5,6,7 odd_col:1,3,5,7 + im_res_1 = vcombine_s32(res_0_im, res_1_im); + + // row:0-7 odd_col:1,3,5,7 + res_odd = vaddq_s32(im_res_0, im_res_1); + + // reordering as 0 1 2 3 | 4 5 6 7 + c0 = vtrnq_s32(res_even, res_odd); + + // Final store + *res_low = vcombine_s32(vget_low_s32(c0.val[0]), vget_low_s32(c0.val[1])); + *res_high = vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[1])); +} + +void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + int16x8_t tmp[15]; + const int bd = 8; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const int32x4_t fwd = vdupq_n_s32((int32_t)w0); + const int32x4_t bwd = vdupq_n_s32((int32_t)w1); + const int16x8_t sub_constant = vdupq_n_s16((1 << (bd - 1)) + (1 << bd)); + + int limit = 0; + uint8x16_t vec_dup, mask_val; + int32x4_t res_lo, res_hi; + int16x8_t result_final; + uint8x16_t src_1, src_2, src_3, src_4; + uint8x16_t indx_vec = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + }; + uint8x16_t cmp_vec; + + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int32x4_t shift_vert = vdupq_n_s32(-(int32_t)reduce_bits_vert); + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + int32x4_t add_const_vert = vdupq_n_s32((int32_t)(1 << offset_bits_vert)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int16x4_t round_bits_vec = vdup_n_s16(-(int16_t)round_bits); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int16x4_t res_sub_const = + vdup_n_s16(-((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)))); + int k; + + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + for (int i = 0; i < p_height; i += 8) { + for (int j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + // horizontal + if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int16_t dup_val = + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)); + + tmp[k + 7] = vdupq_n_s16(dup_val); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz)); + tmp[k + 7] = vdupq_n_s16(dup_val); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + const uint8_t *src = ref + iy * stride + ix4 - 7; + src_1 = vld1q_u8(src); + + if (out_of_boundary_left >= 0) { + limit = out_of_boundary_left + 1; + cmp_vec = vdupq_n_u8(out_of_boundary_left); + vec_dup = vdupq_n_u8(*(src + limit)); + mask_val = vcleq_u8(indx_vec, cmp_vec); + src_1 = vbslq_u8(mask_val, vec_dup, src_1); + } + if (out_of_boundary_right >= 0) { + limit = 15 - (out_of_boundary_right + 1); + cmp_vec = vdupq_n_u8(15 - out_of_boundary_right); + vec_dup = vdupq_n_u8(*(src + limit)); + mask_val = vcgeq_u8(indx_vec, cmp_vec); + src_1 = vbslq_u8(mask_val, vec_dup, src_1); + } + src_2 = vextq_u8(src_1, src_1, 1); + src_3 = vextq_u8(src_2, src_2, 1); + src_4 = vextq_u8(src_3, src_3, 1); + + horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k, + offset_bits_horiz, reduce_bits_horiz); + } + } else { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + const uint8_t *src = ref + iy * stride + ix4 - 7; + src_1 = vld1q_u8(src); + src_2 = vextq_u8(src_1, src_1, 1); + src_3 = vextq_u8(src_2, src_2, 1); + src_4 = vextq_u8(src_3, src_3, 1); + + horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k, + offset_bits_horiz, reduce_bits_horiz); + } + } + + // vertical + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + const int16x8_t *v_src = tmp + (k + 4); + + vertical_filter_neon(v_src, &res_lo, &res_hi, sy, gamma); + + res_lo = vaddq_s32(res_lo, add_const_vert); + res_hi = vaddq_s32(res_hi, add_const_vert); + + if (conv_params->is_compound) { + uint16_t *const p = + (uint16_t *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j]; + + res_lo = vrshlq_s32(res_lo, shift_vert); + if (conv_params->do_average) { + uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j]; + uint16x4_t tmp16_lo = vld1_u16(p); + int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo)); + int16x4_t tmp16_low; + if (conv_params->use_jnt_comp_avg) { + res_lo = vmulq_s32(res_lo, bwd); + tmp32_lo = vmulq_s32(tmp32_lo, fwd); + tmp32_lo = vaddq_s32(tmp32_lo, res_lo); + tmp16_low = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS); + } else { + tmp32_lo = vaddq_s32(tmp32_lo, res_lo); + tmp16_low = vshrn_n_s32(tmp32_lo, 1); + } + int16x4_t res_low = vadd_s16(tmp16_low, res_sub_const); + res_low = vqrshl_s16(res_low, round_bits_vec); + int16x8_t final_res_low = vcombine_s16(res_low, res_low); + uint8x8_t res_8_low = vqmovun_s16(final_res_low); + + vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res_8_low), 0); + } else { + uint16x4_t res_u16_low = vqmovun_s32(res_lo); + vst1_u16(p, res_u16_low); + } + if (p_width > 4) { + uint16_t *const p4 = + (uint16_t *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + + res_hi = vrshlq_s32(res_hi, shift_vert); + if (conv_params->do_average) { + uint8_t *const dst8_4 = &pred[(i + k + 4) * p_stride + j + 4]; + + uint16x4_t tmp16_hi = vld1_u16(p4); + int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi)); + int16x4_t tmp16_high; + if (conv_params->use_jnt_comp_avg) { + res_hi = vmulq_s32(res_hi, bwd); + tmp32_hi = vmulq_s32(tmp32_hi, fwd); + tmp32_hi = vaddq_s32(tmp32_hi, res_hi); + tmp16_high = vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS); + } else { + tmp32_hi = vaddq_s32(tmp32_hi, res_hi); + tmp16_high = vshrn_n_s32(tmp32_hi, 1); + } + int16x4_t res_high = vadd_s16(tmp16_high, res_sub_const); + res_high = vqrshl_s16(res_high, round_bits_vec); + int16x8_t final_res_high = vcombine_s16(res_high, res_high); + uint8x8_t res_8_high = vqmovun_s16(final_res_high); + + vst1_lane_u32((uint32_t *)dst8_4, vreinterpret_u32_u8(res_8_high), + 0); + } else { + uint16x4_t res_u16_high = vqmovun_s32(res_hi); + vst1_u16(p4, res_u16_high); + } + } + } else { + res_lo = vrshlq_s32(res_lo, shift_vert); + res_hi = vrshlq_s32(res_hi, shift_vert); + + result_final = vcombine_s16(vmovn_s32(res_lo), vmovn_s32(res_hi)); + result_final = vsubq_s16(result_final, sub_constant); + + uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j]; + uint8x8_t val = vqmovun_s16(result_final); + + if (p_width == 4) { + vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0); + } else { + vst1_u8(p, val); + } + } + } + } + } +} diff --git a/third_party/aom/av1/common/arm/wiener_convolve_neon.c b/third_party/aom/av1/common/arm/wiener_convolve_neon.c index 72fbed4d4..a9bb5bcf0 100644 --- a/third_party/aom/av1/common/arm/wiener_convolve_neon.c +++ b/third_party/aom/av1/common/arm/wiener_convolve_neon.c @@ -26,7 +26,6 @@ Apply horizontal filter and store in a temporary buffer. When applying vertical filter, overwrite the original pixel values. */ - void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -78,8 +77,10 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, /* if height is a multiple of 8 */ if (!(h & 7)) { int16x8_t res0, res1, res2, res3; - uint16x8_t res4, res5, res6, res7, res8, res9, res10, res11; + uint16x8_t res4; uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; +#if defined(__aarch64__) + uint16x8_t res5, res6, res7, res8, res9, res10, res11; uint8x8_t t8, t9, t10, t11, t12, t13, t14; do { @@ -190,16 +191,64 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, dst_ptr += 8 * MAX_SB_SIZE; height -= 8; } while (height > 0); +#else + uint8x8_t temp_0; + + do { + const uint8_t *s; + + __builtin_prefetch(src_ptr); + + t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + s = src_ptr + 8; + d_tmp = dst_ptr; + width = w; + + __builtin_prefetch(dst_ptr); + + do { + t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + temp_0 = t0; + t0 = t7; + + t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + res0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + vst1q_u16(d_tmp, res4); + + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += MAX_SB_SIZE; + height--; + } while (height > 0); +#endif } else { /*if height is a multiple of 4*/ - int16x8_t tt0, tt1, tt2, tt3; const uint8_t *s; + int16x8_t tt0, tt1, tt2, tt3; + uint16x8_t d0; + uint8x8_t t0, t1, t2, t3; + +#if defined(__aarch64__) uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7; - uint16x8_t d0, d1, d2, d3; + uint16x8_t d1, d2, d3; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; int16x4_t s11, s12, s13, s14; - uint8x8_t t0, t1, t2, t3; - do { __builtin_prefetch(src_ptr + 0 * src_stride); __builtin_prefetch(src_ptr + 1 * src_stride); @@ -292,11 +341,61 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, dst_ptr += 4 * MAX_SB_SIZE; height -= 4; } while (height > 0); +#else + uint8x8_t temp_0, t4, t5, t6, t7; + + do { + __builtin_prefetch(src_ptr); + + t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + + __builtin_prefetch(dst_ptr); + + s = src_ptr + 8; + d_tmp = dst_ptr; + width = w; + + do { + t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + temp_0 = t0; + t0 = t7; + + t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + tt0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6)); + tt1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5)); + tt2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + d0 = wiener_convolve8_horiz_8x8(tt0, tt1, tt2, tt3, filter_x_tmp, bd, + conv_params->round_0); + + vst1q_u16(d_tmp, d0); + + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + + src_ptr += src_stride; + dst_ptr += MAX_SB_SIZE; + height -= 1; + } while (height > 0); +#endif } { - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x8_t t0, t1, t2, t3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + uint8x8_t t0; +#if defined(__aarch64__) + int16x8_t s8, s9, s10; + uint8x8_t t1, t2, t3; +#endif int16_t *src_tmp_ptr, *s; uint8_t *dst_tmp_ptr; height = h; @@ -324,6 +423,7 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, d = dst_tmp_ptr; height = h; +#if defined(__aarch64__) do { __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride); __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride); @@ -397,5 +497,34 @@ void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, w -= 8; } while (w > 0); +#else + do { + __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride); + + s7 = vld1q_s16(s); + s += src_stride; + + t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp, + bd, conv_params->round_1); + + vst1_u8(d, t0); + d += dst_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; + } while (height > 0); + + src_tmp_ptr += 8; + dst_tmp_ptr += 8; + + w -= 8; + } while (w > 0); +#endif } } diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.c b/third_party/aom/av1/common/av1_inv_txfm1d.c index 8514dc64c..7ef2d6d7f 100644 --- a/third_party/aom/av1/common/av1_inv_txfm1d.c +++ b/third_party/aom/av1/common/av1_inv_txfm1d.c @@ -11,56 +11,7 @@ #include #include "av1/common/av1_inv_txfm1d.h" - -static void range_check_buf(int32_t stage, const int32_t *input, - const int32_t *buf, int32_t size, int8_t bit) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - const int64_t max_value = (1LL << (bit - 1)) - 1; - const int64_t min_value = -(1LL << (bit - 1)); - - int in_range = 1; - - for (int i = 0; i < size; ++i) { - if (buf[i] < min_value || buf[i] > max_value) { - in_range = 0; - } - } - - if (!in_range) { - fprintf(stderr, "Error: coeffs contain out-of-range values\n"); - fprintf(stderr, "size: %d\n", size); - fprintf(stderr, "stage: %d\n", stage); - fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value, - max_value); - - fprintf(stderr, "coeffs: "); - - fprintf(stderr, "["); - for (int j = 0; j < size; j++) { - if (j > 0) fprintf(stderr, ", "); - fprintf(stderr, "%d", input[j]); - } - fprintf(stderr, "]\n"); - - fprintf(stderr, " buf: "); - - fprintf(stderr, "["); - for (int j = 0; j < size; j++) { - if (j > 0) fprintf(stderr, ", "); - fprintf(stderr, "%d", buf[j]); - } - fprintf(stderr, "]\n\n"); - } - - assert(in_range); -#else - (void)stage; - (void)input; - (void)buf; - (void)size; - (void)bit; -#endif -} +#include "av1/common/av1_txfm.h" // TODO(angiebird): Make 1-d txfm functions static // @@ -84,7 +35,7 @@ void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[1] = input[2]; bf1[2] = input[1]; bf1[3] = input[3]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -94,7 +45,7 @@ void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -129,7 +80,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = input[5]; bf1[6] = input[3]; bf1[7] = input[7]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -143,7 +94,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -157,7 +108,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -171,7 +122,7 @@ void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[7] = bf0[7]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -218,7 +169,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = input[11]; bf1[14] = input[7]; bf1[15] = input[15]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -240,7 +191,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -262,7 +213,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -284,7 +235,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); bf1[15] = bf0[15]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -306,7 +257,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -328,7 +279,7 @@ void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -399,7 +350,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = input[23]; bf1[30] = input[15]; bf1[31] = input[31]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -437,7 +388,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit); bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit); bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -475,7 +426,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]); bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]); bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -513,7 +464,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit); bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit); bf1[31] = bf0[31]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -551,7 +502,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]); bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]); bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -589,7 +540,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit); bf1[30] = bf0[30]; bf1[31] = bf0[31]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -627,7 +578,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]); bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]); bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; @@ -665,7 +616,7 @@ void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[29] = bf0[29]; bf1[30] = bf0[30]; bf1[31] = bf0[31]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; @@ -760,7 +711,6 @@ void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit, output[1] = round_shift(x1, bit); output[2] = round_shift(x2, bit); output[3] = round_shift(x3, bit); - range_check_buf(6, input, output, 4, stage_range[6]); } void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, @@ -786,7 +736,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = input[4]; bf1[6] = input[1]; bf1[7] = input[6]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -800,7 +750,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit); bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -814,7 +764,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]); bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]); bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -828,7 +778,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -842,7 +792,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]); bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]); bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -856,7 +806,7 @@ void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[5] = bf0[5]; bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -903,7 +853,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = input[12]; bf1[14] = input[1]; bf1[15] = input[14]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -925,7 +875,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -947,7 +897,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]); bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -969,7 +919,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -991,7 +941,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]); bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -1013,7 +963,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -1035,7 +985,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]); bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; @@ -1057,7 +1007,7 @@ void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[13] = bf0[13]; bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; @@ -1193,7 +1143,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = input[47]; bf1[62] = input[31]; bf1[63] = input[63]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; @@ -1263,7 +1213,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit); bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit); bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; @@ -1333,7 +1283,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]); bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]); bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; @@ -1403,7 +1353,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit); bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit); bf1[63] = bf0[63]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; @@ -1473,7 +1423,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]); bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]); bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; @@ -1543,7 +1493,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit); bf1[62] = bf0[62]; bf1[63] = bf0[63]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; @@ -1613,7 +1563,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]); bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]); bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; @@ -1683,7 +1633,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = bf0[61]; bf1[62] = bf0[62]; bf1[63] = bf0[63]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; @@ -1753,7 +1703,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]); bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]); bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]); - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 10 stage++; @@ -1823,7 +1773,7 @@ void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit, bf1[61] = bf0[61]; bf1[62] = bf0[62]; bf1[63] = bf0[63]; - range_check_buf(stage, input, bf1, size, stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 11 stage++; diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.h b/third_party/aom/av1/common/av1_inv_txfm1d.h index 64a1a921c..c31c019aa 100644 --- a/third_party/aom/av1/common/av1_inv_txfm1d.h +++ b/third_party/aom/av1/common/av1_inv_txfm1d.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_INV_TXFM1D_H_ -#define AV1_INV_TXFM1D_H_ +#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ +#define AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ #include "av1/common/av1_txfm.h" @@ -58,4 +58,4 @@ void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, } #endif -#endif // AV1_INV_TXFM1D_H_ +#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ diff --git a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h index 4c600f756..7d80a0099 100644 --- a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h +++ b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_INV_TXFM2D_CFG_H_ -#define AV1_INV_TXFM2D_CFG_H_ +#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ +#define AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ #include "av1/common/av1_inv_txfm1d.h" // sum of fwd_shift_## @@ -44,4 +44,4 @@ extern const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL]; extern const int8_t inv_cos_bit_col[5 /*row*/][5 /*col*/]; extern const int8_t inv_cos_bit_row[5 /*row*/][5 /*col*/]; -#endif // AV1_INV_TXFM2D_CFG_H_ +#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c index 9d68b8760..537d8dfe9 100644 --- a/third_party/aom/av1/common/av1_loopfilter.c +++ b/third_party/aom/av1/common/av1_loopfilter.c @@ -68,23 +68,6 @@ static const int mode_lf_lut[] = { // 10101010|10101010 // // A loopfilter should be applied to every other 4x4 horizontally. -// TODO(chengchen): make these tables static -const FilterMask left_txform_mask[TX_SIZES] = { - { { 0xffffffffffffffffULL, // TX_4X4, - 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } }, - - { { 0x5555555555555555ULL, // TX_8X8, - 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL } }, - - { { 0x1111111111111111ULL, // TX_16X16, - 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL } }, - - { { 0x0101010101010101ULL, // TX_32X32, - 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL } }, - - { { 0x0001000100010001ULL, // TX_64X64, - 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } }, -}; // 256 bit masks (64x64 / 4x4) for above transform size for Y plane. // We use 4 uint64_t to represent the 256 bit. @@ -113,98 +96,314 @@ const FilterMask left_txform_mask[TX_SIZES] = { // 00000000|00000000 // // A loopfilter should be applied to every other 4x4 horizontally. -const FilterMask above_txform_mask[TX_SIZES] = { - { { 0xffffffffffffffffULL, // TX_4X4 - 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } }, - { { 0x0000ffff0000ffffULL, // TX_8X8 - 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL } }, - - { { 0x000000000000ffffULL, // TX_16X16 - 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL } }, - - { { 0x000000000000ffffULL, // TX_32X32 - 0x0000000000000000ULL, 0x000000000000ffffULL, 0x0000000000000000ULL } }, - - { { 0x000000000000ffffULL, // TX_64X64 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, +const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18 }; -// 64 bit mask to shift and set for each prediction size. A bit is set for -// each 4x4 block that would be in the top left most block of the given block -// size in the 64x64 block. -const FilterMask size_mask_y[BLOCK_SIZES_ALL] = { - { { 0x0000000000000001ULL, // BLOCK_4X4 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x0000000000010001ULL, // BLOCK_4X8 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x0000000000000003ULL, // BLOCK_8X4 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x0000000000030003ULL, // BLOCK_8X8 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x0003000300030003ULL, // BLOCK_8X16 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x00000000000f000fULL, // BLOCK_16X8 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x000f000f000f000fULL, // BLOCK_16X16 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x000f000f000f000fULL, // BLOCK_16X32 - 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x00ff00ff00ff00ffULL, // BLOCK_32X16 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x00ff00ff00ff00ffULL, // BLOCK_32X32 - 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x00ff00ff00ff00ffULL, // BLOCK_32X64 - 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL } }, - - { { 0xffffffffffffffffULL, // BLOCK_64X32 - 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0xffffffffffffffffULL, // BLOCK_64X64 - 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL } }, - // Y plane max coding block size is 128x128, but the codec divides it - // into 4 64x64 blocks. - // BLOCK_64X128 - { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } }, - // BLOCK_128X64 - { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } }, - // BLOCK_128X128 - { { 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL } }, - - { { 0x0001000100010001ULL, // BLOCK_4X16 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x000000000000000fULL, // BLOCK_16X4 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, - - { { 0x0003000300030003ULL, // BLOCK_8X32 - 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, +const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = { + -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13 +}; - { { 0x0000000000ff00ffULL, // BLOCK_32X8 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, +const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = { + -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8 +}; - { { 0x000f000f000f000fULL, // BLOCK_16X64 - 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL } }, +const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { -1, -1, -1, -1, -1, -1, + -1, -1, -1, 0, 1, 2, + 3, -1, -1, -1, -1, -1, + -1, -1, -1, -1 }; + +const FilterMask left_mask_univariant_reordered[67] = { + // TX_4X4 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 + { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 + { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 + { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, + 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, + 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 + { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_4X4 + { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, + 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 + { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 + // TX_8X8 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 + { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 + { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 + { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_8X8 + { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 + { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 + { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL, + 0x0055005500550055ULL } }, // block size 32X64, TX_8X8 + { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 + { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL, + 0x5555555555555555ULL } }, // block size 64X64, TX_8X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 + { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_8X8 + { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL, + 0x0005000500050005ULL } }, // block size 16X64, TX_8X8 + { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 + // TX_16X16 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 + { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 + { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 + { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL, + 0x0011001100110011ULL } }, // block size 32X64, TX_16X16 + { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_16X16 + { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL, + 0x1111111111111111ULL } }, // block size 64X64, TX_16X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X16 + { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 + // TX_32X32 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL, + 0x0101010101010101ULL } }, // block size 32X64, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL, + 0x0101010101010101ULL } }, // block size 64X64, TX_32X32 + // TX_64X64 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 64X64, TX_64X64 + // 2:1, 1:2 transform sizes. + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 + { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 + { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_16X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X32 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 + { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 32X64, TX_32X64 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 + // 4:1, 1:4 transform sizes. + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X16 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_16X4 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X32 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_32X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X64 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_64X16 +}; - { { 0xffffffffffffffffULL, // BLOCK_64X16 - 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } } +const FilterMask above_mask_univariant_reordered[67] = { + // TX_4X4 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 + { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 + { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 + { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, + 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, + 0xffffffffffffffffULL } }, // block size 64X64, TX_4x4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 + { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_4X4 + { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, + 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 + { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 + // TX_8X8 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 + { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_8X8 + { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 + { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 + { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL, + 0x000000ff000000ffULL } }, // block size 32X64, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, + 0x0000ffff0000ffffULL } }, // block size 64X64, TX_8X8 + { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL, + 0x0000000f0000000fULL } }, // block size 16X64, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 + // TX_16X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 + { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 + { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 + { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL, + 0x00000000000000ffULL } }, // block size 32X64, TX_16X16 + { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_16X16 + { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL, + 0x000000000000ffffULL } }, // block size 64X64, TX_16X16 + { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL, + 0x000000000000000fULL } }, // block size 16X64, TX_16X16 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 + // TX_32X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_32X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL, + 0x0000000000000000ULL } }, // block size 32X64, TX_32X32 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL, + 0x0000000000000000ULL } }, // block size 64X64, TX_32X32 + // TX_64X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X64, TX_64X64 + // 2:1, 1:2 transform sizes. + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 + { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 + { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_16X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL, + 0x0000000000000000ULL } }, // block size 16X64, TX_16X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X64, TX_32X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 + // 4:1, 1:4 transform sizes. + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_16X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_32X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X64, TX_16X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_64X16 }; LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row, int mi_col) { - if ((mi_row << MI_SIZE_LOG2) >= cm->height || - (mi_col << MI_SIZE_LOG2) >= cm->width) - return NULL; assert(cm->lf.lfm != NULL); const int row = mi_row >> MIN_MIB_SIZE_LOG2; // 64x64 const int col = mi_col >> MIN_MIB_SIZE_LOG2; @@ -248,10 +447,10 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { SIMD_WIDTH); } } -static uint8_t get_filter_level(const AV1_COMMON *cm, - const loop_filter_info_n *lfi_n, - const int dir_idx, int plane, - const MB_MODE_INFO *mbmi) { + +uint8_t get_filter_level(const AV1_COMMON *cm, const loop_filter_info_n *lfi_n, + const int dir_idx, int plane, + const MB_MODE_INFO *mbmi) { const int segment_id = mbmi->segment_id; if (cm->delta_lf_present_flag) { int delta_lf; @@ -374,30 +573,6 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, } } } - -#if LOOP_FILTER_BITMASK - memset(lf->neighbor_sb_lpf_info.tx_size_y_above, TX_64X64, - sizeof(TX_SIZE) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.tx_size_y_left, TX_64X64, - sizeof(TX_SIZE) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.tx_size_uv_above, TX_64X64, - sizeof(TX_SIZE) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.tx_size_uv_left, TX_64X64, - sizeof(TX_SIZE) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.y_level_above, 0, - sizeof(uint8_t) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.y_level_left, 0, - sizeof(uint8_t) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.u_level_above, 0, - sizeof(uint8_t) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.u_level_left, 0, - sizeof(uint8_t) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.v_level_above, 0, - sizeof(uint8_t) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.v_level_left, 0, - sizeof(uint8_t) * MI_SIZE_64X64); - memset(lf->neighbor_sb_lpf_info.skip, 0, sizeof(uint8_t) * MI_SIZE_64X64); -#endif // LOOP_FILTER_BITMASK } #if LOOP_FILTER_BITMASK @@ -413,7 +588,7 @@ void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, // After locating which uint64_t, mi_row % 4 is the // row offset, and each row has 16 = 1 << stride_log2 4x4 units. // Therefore, shift = (row << stride_log2) + mi_col; -static int get_index_shift(int mi_col, int mi_row, int *index) { +int get_index_shift(int mi_col, int mi_row, int *index) { // *index = mi_row >> 2; // rows = mi_row % 4; // stride_log2 = 4; @@ -588,15 +763,9 @@ static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane, else lfm->lfl_y_hor[row][col] = level; } else if (plane == 1) { - if (dir == VERT_EDGE) - lfm->lfl_u_ver[row][col] = level; - else - lfm->lfl_u_hor[row][col] = level; + lfm->lfl_u[row][col] = level; } else { - if (dir == VERT_EDGE) - lfm->lfl_v_ver[row][col] = level; - else - lfm->lfl_v_hor[row][col] = level; + lfm->lfl_v[row][col] = level; } } } @@ -623,11 +792,12 @@ static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane, const TX_SIZE prev_tx_size = plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy) : mbmi_prev->tx_size; - const TX_SIZE min_tx_size = - (dir == VERT_EDGE) ? AOMMIN(txsize_horz_map[tx_size], - txsize_horz_map[prev_tx_size]) - : AOMMIN(txsize_vert_map[tx_size], - txsize_vert_map[prev_tx_size]); + TX_SIZE min_tx_size = (dir == VERT_EDGE) + ? AOMMIN(txsize_horz_map[tx_size], + txsize_horz_map[prev_tx_size]) + : AOMMIN(txsize_vert_map[tx_size], + txsize_vert_map[prev_tx_size]); + min_tx_size = AOMMIN(min_tx_size, TX_16X16); assert(min_tx_size < TX_SIZES); const int row = r % MI_SIZE_64X64; const int col = c % MI_SIZE_64X64; @@ -883,13 +1053,11 @@ void av1_setup_bitmask(AV1_COMMON *const cm, int mi_row, int mi_col, int plane, } else if (plane == 1) { av1_zero(lfm->left_u); av1_zero(lfm->above_u); - av1_zero(lfm->lfl_u_ver); - av1_zero(lfm->lfl_u_hor); + av1_zero(lfm->lfl_u); } else { av1_zero(lfm->left_v); av1_zero(lfm->above_v); - av1_zero(lfm->lfl_v_ver); - av1_zero(lfm->lfl_v_hor); + av1_zero(lfm->lfl_v); } } } @@ -979,13 +1147,10 @@ static void filter_selectively_vert_row2( if ((mask_16x16_0 & mask_16x16_1) & 1) { if (plane) { - // TODO(any): add aom_lpf_vertical_6_dual for chroma plane. - aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); - aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); + aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); } else { - // TODO(any): add dual function simd function. Current sse2 code - // just called aom_lpf_vertical_14_sse2 twice. aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, lfi1->mblim, lfi1->lim, lfi1->hev_thr); @@ -1005,9 +1170,9 @@ static void filter_selectively_vert_row2( if ((mask_8x8_0 & mask_8x8_1) & 1) { if (plane) { - aom_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); - aom_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); + aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); } else { aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, lfi1->mblim, lfi1->lim, @@ -1070,10 +1235,9 @@ static void highbd_filter_selectively_vert_row2( if ((mask_16x16_0 & mask_16x16_1) & 1) { if (plane) { - aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); - aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); + aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); } else { aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, lfi1->mblim, @@ -1094,10 +1258,9 @@ static void highbd_filter_selectively_vert_row2( if ((mask_8x8_0 & mask_8x8_1) & 1) { if (plane) { - aom_highbd_lpf_vertical_6(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); - aom_highbd_lpf_vertical_6(s + 4 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); + aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); } else { aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, lfi1->mblim, @@ -1163,13 +1326,15 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, int plane, plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14; if ((mask_16x16 & two_block_mask) == two_block_mask) { - /* - aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); - */ - - lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr); + if (plane) { + aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } else { + aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } count = 2; } else { lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); @@ -1181,28 +1346,24 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, int plane, plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8; if ((mask_8x8 & two_block_mask) == two_block_mask) { - /* - aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr); - */ - - lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, lfin->hev_thr); + if (plane) { + aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } else { + aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } count = 2; } else { lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } } else if (mask_4x4 & 1) { if ((mask_4x4 & two_block_mask) == two_block_mask) { - /* aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, lfin->mblim, lfin->lim, lfin->hev_thr); - */ - aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); - aom_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim, - lfin->hev_thr); count = 2; } else { aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); @@ -1239,15 +1400,15 @@ static void highbd_filter_selectively_horiz( plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14; if ((mask_16x16 & two_block_mask) == two_block_mask) { - /* - aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, bd); - */ - - highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, - bd); - highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, - lfin->hev_thr, bd); + if (plane) { + aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } else { + aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } count = 2; } else { highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, @@ -1258,15 +1419,15 @@ static void highbd_filter_selectively_horiz( plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8; if ((mask_8x8 & two_block_mask) == two_block_mask) { - /* - aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, lfin->lim, - lfin->hev_thr, bd); - */ - highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, - bd); - highbd_lpf_horizontal(s + 4, pitch, lfin->mblim, lfin->lim, - lfin->hev_thr, bd); + if (plane) { + aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } else { + aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } count = 2; } else { highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, @@ -1274,15 +1435,9 @@ static void highbd_filter_selectively_horiz( } } else if (mask_4x4 & 1) { if ((mask_4x4 & two_block_mask) == two_block_mask) { - /* aom_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, lfin->mblim, lfin->lim, lfin->hev_thr, bd); - */ - aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, bd); - aom_highbd_lpf_horizontal_4(s + 4, pitch, lfin->mblim, lfin->lim, - lfin->hev_thr, bd); count = 2; } else { aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, @@ -1299,43 +1454,289 @@ static void highbd_filter_selectively_horiz( } } -static int compare_ref_dst(AV1_COMMON *const cm, uint8_t *ref_buf, - uint8_t *dst_buf, int ref_stride, int dst_stride, - int start, int end) { - return 0; - - start <<= MI_SIZE_LOG2; - end <<= MI_SIZE_LOG2; - uint8_t *ref0 = ref_buf; - uint8_t *dst0 = dst_buf; - if (cm->seq_params.use_highbitdepth) { - const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref_buf); - const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst_buf); - for (int j = 0; j < 4; ++j) { - for (int i = start; i < end; ++i) - if (ref16[i] != dst16[i]) { - ref_buf = ref0; - dst_buf = dst0; - return i + 1; +void av1_build_bitmask_vert_info( + AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr, + int plane) { + const int subsampling_x = plane_ptr->subsampling_x; + const int subsampling_y = plane_ptr->subsampling_y; + const int row_step = (MI_SIZE >> MI_SIZE_LOG2); + const int is_uv = plane > 0; + TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; + uint8_t level, prev_level = 1; + int skip, prev_skip = 0; + int is_coding_block_border; + + for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r += row_step) { + const int mi_row = r << subsampling_y; + const int row = mi_row % MI_SIZE_64X64; + int index = 0; + const int shift = get_index_shift(0, row, &index); + + for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; + c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) { + const int mi_col = c << subsampling_x; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + + for (int col_in_unit = 0; + col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) { + const int x = (c + col_in_unit) << MI_SIZE_LOG2; + if (x >= plane_ptr->dst.width) break; + const int col = col_in_unit << subsampling_x; + const uint64_t mask = ((uint64_t)1 << (shift | col)); + skip = lfm->skip.bits[index] & mask; + is_coding_block_border = lfm->is_vert_border.bits[index] & mask; + switch (plane) { + case 0: level = lfm->lfl_y_ver[row][col]; break; + case 1: level = lfm->lfl_u[row][col]; break; + case 2: level = lfm->lfl_v[row][col]; break; + default: assert(plane >= 0 && plane <= 2); return; } - ref16 += ref_stride; - dst16 += dst_stride; + for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { + if (is_uv && ts == TX_64X64) continue; + if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) { + tx_size = ts; + break; + } + } + if ((c + col_in_unit > 0) && (level || prev_level) && + (!prev_skip || !skip || is_coding_block_border)) { + const TX_SIZE min_tx_size = + AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); + const int tmp_row = (mi_row | subsampling_y) % MI_SIZE_64X64; + const int tmp_col = (col | subsampling_x) % MI_SIZE_64X64; + const int shift_1 = get_index_shift(tmp_col, tmp_row, &index); + const uint64_t mask_1 = ((uint64_t)1 << shift_1); + switch (plane) { + case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break; + case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break; + case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break; + default: assert(plane >= 0 && plane <= 2); return; + } + } + + // update prev info + prev_level = level; + prev_skip = skip; + prev_tx_size = tx_size; + // advance + col_in_unit += tx_size_wide_unit[tx_size]; + } } - } else { - for (int j = 0; j < 4; ++j) { - for (int i = start; i < end; ++i) - if (ref_buf[i] != dst_buf[i]) { - ref_buf = ref0; - dst_buf = dst0; - return i + 1; + } +} + +void av1_build_bitmask_horz_info( + AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr, + int plane) { + const int subsampling_x = plane_ptr->subsampling_x; + const int subsampling_y = plane_ptr->subsampling_y; + const int col_step = (MI_SIZE >> MI_SIZE_LOG2); + const int is_uv = plane > 0; + TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; + uint8_t level, prev_level = 1; + int skip, prev_skip = 0; + int is_coding_block_border; + + for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c += col_step) { + const int mi_col = c << subsampling_x; + const int col = mi_col % MI_SIZE_64X64; + + for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; + r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) { + const int mi_row = r << subsampling_y; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + + for (int r_in_unit = 0; + r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) { + const int y = (r + r_in_unit) << MI_SIZE_LOG2; + if (y >= plane_ptr->dst.height) break; + const int row = r_in_unit << subsampling_y; + int index = 0; + const int shift = get_index_shift(col, row, &index); + const uint64_t mask = ((uint64_t)1 << shift); + skip = lfm->skip.bits[index] & mask; + is_coding_block_border = lfm->is_horz_border.bits[index] & mask; + switch (plane) { + case 0: level = lfm->lfl_y_hor[row][col]; break; + case 1: level = lfm->lfl_u[row][col]; break; + case 2: level = lfm->lfl_v[row][col]; break; + default: assert(plane >= 0 && plane <= 2); return; } - ref_buf += ref_stride; - dst_buf += dst_stride; + for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { + if (is_uv && ts == TX_64X64) continue; + if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) { + tx_size = ts; + break; + } + } + if ((r + r_in_unit > 0) && (level || prev_level) && + (!prev_skip || !skip || is_coding_block_border)) { + const TX_SIZE min_tx_size = + AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); + const int tmp_row = (row | subsampling_y) % MI_SIZE_64X64; + const int tmp_col = (mi_col | subsampling_x) % MI_SIZE_64X64; + const int shift_1 = get_index_shift(tmp_col, tmp_row, &index); + const uint64_t mask_1 = ((uint64_t)1 << shift_1); + + switch (plane) { + case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break; + case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break; + case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break; + default: assert(plane >= 0 && plane <= 2); return; + } + } + + // update prev info + prev_level = level; + prev_skip = skip; + prev_tx_size = tx_size; + // advance + r_in_unit += tx_size_high_unit[tx_size]; + } + } + } +} + +void av1_filter_block_plane_bitmask_vert( + AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, + int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + uint8_t *const buf0 = dst->buf; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int row_step = 1 << ssy; + const int two_row_step = 2 << ssy; + const int row_stride = dst->stride << MI_SIZE_LOG2; + const int two_row_stride = row_stride << 1; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + uint8_t *lfl2; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + assert(lfm); + + // 1. vertical filtering. filter two rows at a time + for (int r = 0; + ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; + r += two_row_step) { + const int row = r | ssy; + const int row_next = row + row_step; + const int col = ssx; + int index = 0; + const int shift = get_index_shift(col, row, &index); + int index_next = 0; + const int shift_next = get_index_shift(col, row_next, &index_next); + switch (pl) { + case 0: + mask_16x16 = lfm->left_y[TX_16X16].bits[index]; + mask_8x8 = lfm->left_y[TX_8X8].bits[index]; + mask_4x4 = lfm->left_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_ver[row][col]; + lfl2 = &lfm->lfl_y_ver[row_next][col]; + break; + case 1: + mask_16x16 = lfm->left_u[TX_16X16].bits[index]; + mask_8x8 = lfm->left_u[TX_8X8].bits[index]; + mask_4x4 = lfm->left_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u[row][col]; + lfl2 = &lfm->lfl_u[row_next][col]; + break; + case 2: + mask_16x16 = lfm->left_v[TX_16X16].bits[index]; + mask_8x8 = lfm->left_v[TX_8X8].bits[index]; + mask_4x4 = lfm->left_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v[row][col]; + lfl2 = &lfm->lfl_v[row_next][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff; + uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff; + uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff; + uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff; + uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; + uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; + + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_vert_row2( + ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, + mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); + else + filter_selectively_vert_row2( + ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); + dst->buf += two_row_stride; + } + // reset buf pointer for horizontal filtering + dst->buf = buf0; +} + +void av1_filter_block_plane_bitmask_horz( + AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, + int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + uint8_t *const buf0 = dst->buf; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int row_step = 1 << ssy; + const int row_stride = dst->stride << MI_SIZE_LOG2; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + assert(lfm); + for (int r = 0; + ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; + r += row_step) { + if (mi_row + r == 0) { + dst->buf += row_stride; + continue; } + const int row = r | ssy; + const int col = ssx; + int index = 0; + const int shift = get_index_shift(col, row, &index); + switch (pl) { + case 0: + mask_16x16 = lfm->above_y[TX_16X16].bits[index]; + mask_8x8 = lfm->above_y[TX_8X8].bits[index]; + mask_4x4 = lfm->above_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_hor[row][col]; + break; + case 1: + mask_16x16 = lfm->above_u[TX_16X16].bits[index]; + mask_8x8 = lfm->above_u[TX_8X8].bits[index]; + mask_4x4 = lfm->above_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u[row][col]; + break; + case 2: + mask_16x16 = lfm->above_v[TX_16X16].bits[index]; + mask_8x8 = lfm->above_v[TX_8X8].bits[index]; + mask_4x4 = lfm->above_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v[row][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + mask_16x16 = (mask_16x16 >> shift) & mask_cutoff; + mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; + mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; + + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_horiz( + CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth); + else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); + dst->buf += row_stride; } - ref_buf = ref0; - dst_buf = dst0; - return 0; + // reset buf pointer for next block + dst->buf = buf0; } void av1_filter_block_plane_ver(AV1_COMMON *const cm, @@ -1385,15 +1786,15 @@ void av1_filter_block_plane_ver(AV1_COMMON *const cm, mask_16x16 = lfm->left_u[TX_16X16].bits[index]; mask_8x8 = lfm->left_u[TX_8X8].bits[index]; mask_4x4 = lfm->left_u[TX_4X4].bits[index]; - lfl = &lfm->lfl_u_ver[row][col]; - lfl2 = &lfm->lfl_u_ver[row_next][col]; + lfl = &lfm->lfl_u[row][col]; + lfl2 = &lfm->lfl_u[row_next][col]; break; case 2: mask_16x16 = lfm->left_v[TX_16X16].bits[index]; mask_8x8 = lfm->left_v[TX_8X8].bits[index]; mask_4x4 = lfm->left_v[TX_4X4].bits[index]; - lfl = &lfm->lfl_v_ver[row][col]; - lfl2 = &lfm->lfl_v_ver[row_next][col]; + lfl = &lfm->lfl_v[row][col]; + lfl2 = &lfm->lfl_v[row_next][col]; break; default: assert(pl >= 0 && pl <= 2); return; } @@ -1460,13 +1861,13 @@ void av1_filter_block_plane_hor(AV1_COMMON *const cm, mask_16x16 = lfm->above_u[TX_16X16].bits[index]; mask_8x8 = lfm->above_u[TX_8X8].bits[index]; mask_4x4 = lfm->above_u[TX_4X4].bits[index]; - lfl = &lfm->lfl_u_hor[row][col]; + lfl = &lfm->lfl_u[row][col]; break; case 2: mask_16x16 = lfm->above_v[TX_16X16].bits[index]; mask_8x8 = lfm->above_v[TX_8X8].bits[index]; mask_4x4 = lfm->above_v[TX_4X4].bits[index]; - lfl = &lfm->lfl_v_hor[row][col]; + lfl = &lfm->lfl_v[row][col]; break; default: assert(pl >= 0 && pl <= 2); return; } @@ -1820,6 +2221,9 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, MACROBLOCKD *xd, int start, int stop, +#if LOOP_FILTER_BITMASK + int is_decoding, +#endif int plane_start, int plane_end) { struct macroblockd_plane *pd = xd->plane; const int col_start = 0; @@ -1827,6 +2231,45 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, int mi_row, mi_col; int plane; +#if LOOP_FILTER_BITMASK + if (is_decoding) { + for (plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) + break; + else if (plane == 1 && !(cm->lf.filter_level_u)) + continue; + else if (plane == 2 && !(cm->lf.filter_level_v)) + continue; + + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0, + plane, plane + 1); + av1_build_bitmask_vert_info(cm, &pd[plane], plane); + av1_build_bitmask_horz_info(cm, &pd[plane], plane); + + // apply loop filtering which only goes through buffer once + for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) { + for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) { + av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, mi_col, + plane, plane + 1); + av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row, + mi_col); + if (mi_col - MI_SIZE_64X64 >= 0) { + av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, + mi_col - MI_SIZE_64X64, plane, plane + 1); + av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row, + mi_col - MI_SIZE_64X64); + } + } + av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, + mi_col - MI_SIZE_64X64, plane, plane + 1); + av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row, + mi_col - MI_SIZE_64X64); + } + } + return; + } +#endif + for (plane = plane_start; plane < plane_end; plane++) { if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) break; @@ -1910,8 +2353,11 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, } void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, - MACROBLOCKD *xd, int plane_start, int plane_end, - int partial_frame) { + MACROBLOCKD *xd, +#if LOOP_FILTER_BITMASK + int is_decoding, +#endif + int plane_start, int plane_end, int partial_frame) { int start_mi_row, end_mi_row, mi_rows_to_filter; start_mi_row = 0; @@ -1923,6 +2369,9 @@ void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, } end_mi_row = start_mi_row + mi_rows_to_filter; av1_loop_filter_frame_init(cm, plane_start, plane_end); - loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, plane_start, - plane_end); + loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, +#if LOOP_FILTER_BITMASK + is_decoding, +#endif + plane_start, plane_end); } diff --git a/third_party/aom/av1/common/av1_loopfilter.h b/third_party/aom/av1/common/av1_loopfilter.h index c35c3b2dc..80ac61178 100644 --- a/third_party/aom/av1/common/av1_loopfilter.h +++ b/third_party/aom/av1/common/av1_loopfilter.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_LOOPFILTER_H_ -#define AV1_COMMON_LOOPFILTER_H_ +#ifndef AOM_AV1_COMMON_AV1_LOOPFILTER_H_ +#define AOM_AV1_COMMON_AV1_LOOPFILTER_H_ #include "config/aom_config.h" @@ -60,51 +60,20 @@ typedef struct { uint8_t lfl_y_hor[MI_SIZE_64X64][MI_SIZE_64X64]; uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64]; - // U plane vertical edge and horizontal edge filter level - uint8_t lfl_u_hor[MI_SIZE_64X64][MI_SIZE_64X64]; - uint8_t lfl_u_ver[MI_SIZE_64X64][MI_SIZE_64X64]; + // U plane filter level + uint8_t lfl_u[MI_SIZE_64X64][MI_SIZE_64X64]; - // V plane vertical edge and horizontal edge filter level - uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64]; - uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64]; -} LoopFilterMask; + // V plane filter level + uint8_t lfl_v[MI_SIZE_64X64][MI_SIZE_64X64]; -// To determine whether to apply loop filtering at one transform block edge, -// we need information of the neighboring transform block. Specifically, -// in determining a vertical edge, we need the information of the tx block -// to its left. For a horizontal edge, we need info of the tx block above it. -// Thus, we need to record info of right column and bottom row of tx blocks. -// We record the information of the neighboring superblock, when bitmask -// building for a superblock is finished. And it will be used for next -// superblock bitmask building. -// Information includes: -// ------------------------------------------------------------ -// MI_SIZE_64X64 -// Y tx_size above |--------------| -// Y tx_size left |--------------| -// UV tx_size above |--------------| -// UV tx_size left |--------------| -// Y level above |--------------| -// Y level left |--------------| -// U level above |--------------| -// U level left |--------------| -// V level above |--------------| -// V level left |--------------| -// skip |--------------| -// ------------------------------------------------------------ -typedef struct { - TX_SIZE tx_size_y_above[MI_SIZE_64X64]; - TX_SIZE tx_size_y_left[MI_SIZE_64X64]; - TX_SIZE tx_size_uv_above[MI_SIZE_64X64]; - TX_SIZE tx_size_uv_left[MI_SIZE_64X64]; - uint8_t y_level_above[MI_SIZE_64X64]; - uint8_t y_level_left[MI_SIZE_64X64]; - uint8_t u_level_above[MI_SIZE_64X64]; - uint8_t u_level_left[MI_SIZE_64X64]; - uint8_t v_level_above[MI_SIZE_64X64]; - uint8_t v_level_left[MI_SIZE_64X64]; - uint8_t skip[MI_SIZE_64X64]; -} LpfSuperblockInfo; + // other info + FilterMask skip; + FilterMask is_vert_border; + FilterMask is_horz_border; + // Y or UV planes, 5 tx sizes: 4x4, 8x8, 16x16, 32x32, 64x64 + FilterMask tx_size_ver[2][5]; + FilterMask tx_size_hor[2][5]; +} LoopFilterMask; #endif // LOOP_FILTER_BITMASK struct loopfilter { @@ -130,7 +99,6 @@ struct loopfilter { LoopFilterMask *lfm; size_t lfm_num; int lfm_stride; - LpfSuperblockInfo neighbor_sb_lpf_info; #endif // LOOP_FILTER_BITMASK }; @@ -157,9 +125,15 @@ void av1_loop_filter_init(struct AV1Common *cm); void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start, int plane_end); +#if LOOP_FILTER_BITMASK +void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, + struct macroblockd *mbd, int is_decoding, + int plane_start, int plane_end, int partial_frame); +#else void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, struct macroblockd *mbd, int plane_start, int plane_end, int partial_frame); +#endif void av1_filter_block_plane_vert(const struct AV1Common *const cm, const MACROBLOCKD *const xd, const int plane, @@ -180,6 +154,9 @@ typedef struct LoopFilterWorkerData { MACROBLOCKD *xd; } LFWorkerData; +uint8_t get_filter_level(const struct AV1Common *cm, + const loop_filter_info_n *lfi_n, const int dir_idx, + int plane, const MB_MODE_INFO *mbmi); #if LOOP_FILTER_BITMASK void av1_setup_bitmask(struct AV1Common *const cm, int mi_row, int mi_col, int plane, int subsampling_x, int subsampling_y, @@ -192,10 +169,59 @@ void av1_filter_block_plane_ver(struct AV1Common *const cm, void av1_filter_block_plane_hor(struct AV1Common *const cm, struct macroblockd_plane *const plane, int pl, int mi_row, int mi_col); +LoopFilterMask *get_loop_filter_mask(const struct AV1Common *const cm, + int mi_row, int mi_col); +int get_index_shift(int mi_col, int mi_row, int *index); + +static const FilterMask left_txform_mask[TX_SIZES] = { + { { 0x0000000000000001ULL, // TX_4X4, + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, + + { { 0x0000000000010001ULL, // TX_8X8, + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, + + { { 0x0001000100010001ULL, // TX_16X16, + 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, + + { { 0x0001000100010001ULL, // TX_32X32, + 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } }, + + { { 0x0001000100010001ULL, // TX_64X64, + 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } }, +}; + +static const uint64_t above_txform_mask[2][TX_SIZES] = { + { + 0x0000000000000001ULL, // TX_4X4 + 0x0000000000000003ULL, // TX_8X8 + 0x000000000000000fULL, // TX_16X16 + 0x00000000000000ffULL, // TX_32X32 + 0x000000000000ffffULL, // TX_64X64 + }, + { + 0x0000000000000001ULL, // TX_4X4 + 0x0000000000000005ULL, // TX_8X8 + 0x0000000000000055ULL, // TX_16X16 + 0x0000000000005555ULL, // TX_32X32 + 0x0000000055555555ULL, // TX_64X64 + }, +}; + +extern const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL]; + +extern const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL]; + +extern const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL]; + +extern const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL]; + +extern const FilterMask left_mask_univariant_reordered[67]; + +extern const FilterMask above_mask_univariant_reordered[67]; #endif #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_COMMON_LOOPFILTER_H_ +#endif // AOM_AV1_COMMON_AV1_LOOPFILTER_H_ diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl index fa8b34981..dee1f1c79 100755 --- a/third_party/aom/av1/common/av1_rtcd_defs.pl +++ b/third_party/aom/av1/common/av1_rtcd_defs.pl @@ -76,12 +76,12 @@ specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/; specialize qw/av1_highbd_wiener_convolve_add_src ssse3/; specialize qw/av1_highbd_wiener_convolve_add_src avx2/; + # directional intra predictor functions add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy"; add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy"; add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy"; - # FILTER_INTRA predictor functions add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode"; specialize qw/av1_filter_intra_predictor sse4_1/; @@ -108,6 +108,22 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64"; add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; specialize qw/av1_inv_txfm_add ssse3 avx2 neon/; +add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/; + +add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_16x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_16x8 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_8x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_8x16 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_16x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_16x16 sse4_1/; +add_proto qw/void av1_highbd_inv_txfm_add_32x32/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; +specialize qw/av1_highbd_inv_txfm_add_32x32 sse4_1 avx2/; + add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; @@ -122,9 +138,7 @@ specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/; add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/; add_proto qw/void av1_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; -specialize qw/av1_inv_txfm2d_add_16x16 sse4_1/; add_proto qw/void av1_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; -specialize qw/av1_inv_txfm2d_add_32x32 avx2/; add_proto qw/void av1_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_inv_txfm2d_add_32x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; @@ -132,8 +146,6 @@ add_proto qw/void av1_inv_txfm2d_add_64x32/, "const int32_t *input, uint16_t *ou add_proto qw/void av1_inv_txfm2d_add_16x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_inv_txfm2d_add_64x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; -specialize qw/av1_inv_txfm2d_add_64x64 sse4_1/; - add_proto qw/void av1_inv_txfm2d_add_4x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_inv_txfm2d_add_16x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_inv_txfm2d_add_8x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd"; @@ -146,13 +158,13 @@ add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride # build compound seg mask functions add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w"; -specialize qw/av1_build_compound_diffwtd_mask sse4_1/; +specialize qw/av1_build_compound_diffwtd_mask sse4_1 avx2/; add_proto qw/void av1_build_compound_diffwtd_mask_highbd/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd"; specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/; add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd"; -specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 neon/; +specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/; # # Encoder functions below this point. @@ -186,7 +198,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_8x16 sse4_1/; add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_16x8 sse4_1/; add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; @@ -203,6 +217,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/av1_fwd_txfm2d_32x32 sse4_1/; add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; + specialize qw/av1_fwd_txfm2d_64x64 sse4_1/; add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; @@ -218,7 +233,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; specialize qw/av1_temporal_filter_apply sse2 msa/; - add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale"; + add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale"; # ENCODEMB INVOKE @@ -238,7 +253,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts"; specialize qw/av1_get_nz_map_contexts sse2/; add_proto qw/void av1_txb_init_levels/, "const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels"; - specialize qw/av1_txb_init_levels sse4_1/; + specialize qw/av1_txb_init_levels sse4_1 avx2/; add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N"; specialize qw/av1_wedge_sse_from_residuals sse2 avx2/; @@ -251,6 +266,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length"; specialize qw/av1_get_crc32c_value sse4_2/; + add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, double *M, double *H"; + specialize qw/av1_compute_stats sse4_1 avx2/; + + add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; + specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2/; } # end encoder functions @@ -275,7 +295,7 @@ if ($opts{config} !~ /libs-x86-win32-vs.*/) { # WARPED_MOTION / GLOBAL_MOTION functions add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; -specialize qw/av1_warp_affine sse4_1/; +specialize qw/av1_warp_affine sse4_1 neon/; add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; specialize qw/av1_highbd_warp_affine sse4_1/; @@ -290,9 +310,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd"; specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/; -add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height, - int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride, - int sgr_params_idx, int bit_depth, int highbd"; +add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height, + int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride, + int sgr_params_idx, int bit_depth, int highbd"; specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/; # CONVOLVE_ROUND/COMPOUND_ROUND functions diff --git a/third_party/aom/av1/common/av1_txfm.c b/third_party/aom/av1/common/av1_txfm.c index 1e6654121..bb70eab70 100644 --- a/third_party/aom/av1/common/av1_txfm.c +++ b/third_party/aom/av1/common/av1_txfm.c @@ -108,3 +108,53 @@ const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = { 1, // TXFM_TYPE_IDENTITY16 1, // TXFM_TYPE_IDENTITY32 }; + +void av1_range_check_buf(int32_t stage, const int32_t *input, + const int32_t *buf, int32_t size, int8_t bit) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + + int in_range = 1; + + for (int i = 0; i < size; ++i) { + if (buf[i] < min_value || buf[i] > max_value) { + in_range = 0; + } + } + + if (!in_range) { + fprintf(stderr, "Error: coeffs contain out-of-range values\n"); + fprintf(stderr, "size: %d\n", size); + fprintf(stderr, "stage: %d\n", stage); + fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value, + max_value); + + fprintf(stderr, "coeffs: "); + + fprintf(stderr, "["); + for (int j = 0; j < size; j++) { + if (j > 0) fprintf(stderr, ", "); + fprintf(stderr, "%d", input[j]); + } + fprintf(stderr, "]\n"); + + fprintf(stderr, " buf: "); + + fprintf(stderr, "["); + for (int j = 0; j < size; j++) { + if (j > 0) fprintf(stderr, ", "); + fprintf(stderr, "%d", buf[j]); + } + fprintf(stderr, "]\n\n"); + } + + assert(in_range); +#else + (void)stage; + (void)input; + (void)buf; + (void)size; + (void)bit; +#endif +} diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h index c9cc79852..59d64ca4a 100644 --- a/third_party/aom/av1/common/av1_txfm.h +++ b/third_party/aom/av1/common/av1_txfm.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_TXFM_H_ -#define AV1_TXFM_H_ +#ifndef AOM_AV1_COMMON_AV1_TXFM_H_ +#define AOM_AV1_COMMON_AV1_TXFM_H_ #include #include @@ -39,7 +39,7 @@ extern const int32_t av1_sinpi_arr_data[7][5]; static const int cos_bit_min = 10; static const int cos_bit_max = 16; -static const int NewSqrt2Bits = 12; +#define NewSqrt2Bits ((int32_t)12) // 2^12 * sqrt(2) static const int32_t NewSqrt2 = 5793; // 2^12 / sqrt(2) @@ -64,7 +64,7 @@ static INLINE int32_t range_check_value(int32_t value, int8_t bit) { #endif // CONFIG_COEFFICIENT_RANGE_CHECKING #if DO_RANGE_CHECK_CLAMP bit = AOMMIN(bit, 31); - return clamp(value, (1 << (bit - 1)) - 1, -(1 << (bit - 1))); + return clamp(value, -(1 << (bit - 1)), (1 << (bit - 1)) - 1); #endif // DO_RANGE_CHECK_CLAMP (void)bit; return value; @@ -78,10 +78,25 @@ static INLINE int32_t round_shift(int64_t value, int bit) { static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1, int bit) { int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1); + int64_t intermediate = result_64 + (1LL << (bit - 1)); + // NOTE(david.barker): The value 'result_64' may not necessarily fit + // into 32 bits. However, the result of this function is nominally + // ROUND_POWER_OF_TWO_64(result_64, bit) + // and that is required to fit into stage_range[stage] many bits + // (checked by range_check_buf()). + // + // Here we've unpacked that rounding operation, and it can be shown + // that the value of 'intermediate' here *does* fit into 32 bits + // for any conformant bitstream. + // The upshot is that, if you do all this calculation using + // wrapping 32-bit arithmetic instead of (non-wrapping) 64-bit arithmetic, + // then you'll still get the correct result. + // To provide a check on this logic, we assert that 'intermediate' + // would fit into an int32 if range checking is enabled. #if CONFIG_COEFFICIENT_RANGE_CHECKING - assert(result_64 >= INT32_MIN && result_64 <= INT32_MAX); + assert(intermediate >= INT32_MIN && intermediate <= INT32_MAX); #endif - return round_shift(result_64, bit); + return (int32_t)(intermediate >> bit); } static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, @@ -206,9 +221,12 @@ static INLINE int get_txw_idx(TX_SIZE tx_size) { static INLINE int get_txh_idx(TX_SIZE tx_size) { return tx_size_high_log2[tx_size] - tx_size_high_log2[0]; } + +void av1_range_check_buf(int32_t stage, const int32_t *input, + const int32_t *buf, int32_t size, int8_t bit); #define MAX_TXWH_IDX 5 #ifdef __cplusplus } #endif // __cplusplus -#endif // AV1_TXFM_H_ +#endif // AOM_AV1_COMMON_AV1_TXFM_H_ diff --git a/third_party/aom/av1/common/blockd.c b/third_party/aom/av1/common/blockd.c index 86b4b5d6c..2e796b656 100644 --- a/third_party/aom/av1/common/blockd.c +++ b/third_party/aom/av1/common/blockd.c @@ -28,66 +28,6 @@ PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) { return above_mi->mode; } -void av1_foreach_transformed_block_in_plane( - const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, - foreach_transformed_block_visitor visit, void *arg) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") - // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 - // transform size varies per plane, look it up in a common way. - const TX_SIZE tx_size = av1_get_tx_size(plane, xd); - const BLOCK_SIZE plane_bsize = - get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); - const uint8_t txw_unit = tx_size_wide_unit[tx_size]; - const uint8_t txh_unit = tx_size_high_unit[tx_size]; - const int step = txw_unit * txh_unit; - int i = 0, r, c; - - // If mb_to_right_edge is < 0 we are in a situation in which - // the current block size extends into the UMV and we won't - // visit the sub blocks that are wholly within the UMV. - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - const int max_blocks_high = max_block_high(xd, plane_bsize, plane); - - int blk_row, blk_col; - - const BLOCK_SIZE max_unit_bsize = - get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); - int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; - int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; - mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); - mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); - - // Keep track of the row and column of the blocks we use so that we know - // if we are in the unrestricted motion border. - for (r = 0; r < max_blocks_high; r += mu_blocks_high) { - const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high); - // Skip visiting the sub blocks that are wholly within the UMV. - for (c = 0; c < max_blocks_wide; c += mu_blocks_wide) { - const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide); - for (blk_row = r; blk_row < unit_height; blk_row += txh_unit) { - for (blk_col = c; blk_col < unit_width; blk_col += txw_unit) { - visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg); - i += step; - } - } - } - } -} - -void av1_foreach_transformed_block(const MACROBLOCKD *const xd, - BLOCK_SIZE bsize, int mi_row, int mi_col, - foreach_transformed_block_visitor visit, - void *arg, const int num_planes) { - for (int plane = 0; plane < num_planes; ++plane) { - if (!is_chroma_reference(mi_row, mi_col, bsize, - xd->plane[plane].subsampling_x, - xd->plane[plane].subsampling_y)) - continue; - av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); - } -} - void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, int aoff, int loff) { @@ -159,6 +99,10 @@ void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, xd->plane[i].subsampling_x = i ? ss_x : 0; xd->plane[i].subsampling_y = i ? ss_y : 0; } + for (i = num_planes; i < MAX_MB_PLANE; i++) { + xd->plane[i].subsampling_x = 1; + xd->plane[i].subsampling_y = 1; + } } const int16_t dr_intra_derivative[90] = { diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h index 979f13bd9..a2311c1b0 100644 --- a/third_party/aom/av1/common/blockd.h +++ b/third_party/aom/av1/common/blockd.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_BLOCKD_H_ -#define AV1_COMMON_BLOCKD_H_ +#ifndef AOM_AV1_COMMON_BLOCKD_H_ +#define AOM_AV1_COMMON_BLOCKD_H_ #include "config/aom_config.h" @@ -38,13 +38,13 @@ extern "C" { #define MAX_DIFFWTD_MASK_BITS 1 // DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS -typedef enum { +typedef enum ATTRIBUTE_PACKED { DIFFWTD_38 = 0, DIFFWTD_38_INV, DIFFWTD_MASK_TYPES, } DIFFWTD_MASK_TYPE; -typedef enum { +typedef enum ATTRIBUTE_PACKED { KEY_FRAME = 0, INTER_FRAME = 1, INTRA_ONLY_FRAME = 2, // replaces intra-only @@ -57,7 +57,7 @@ static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) { } static INLINE int is_inter_mode(PREDICTION_MODE mode) { - return mode >= NEARESTMV && mode <= NEW_NEWMV; + return mode >= INTER_MODE_START && mode < INTER_MODE_END; } typedef struct { @@ -66,10 +66,10 @@ typedef struct { } BUFFER_SET; static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) { - return mode >= NEARESTMV && mode <= NEWMV; + return mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END; } static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) { - return mode >= NEAREST_NEARESTMV && mode <= NEW_NEWMV; + return mode >= COMP_INTER_MODE_START && mode < COMP_INTER_MODE_END; } static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { @@ -148,10 +148,6 @@ static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) { mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV); } -static INLINE int use_masked_motion_search(COMPOUND_TYPE type) { - return (type == COMPOUND_WEDGE); -} - static INLINE int is_masked_compound_type(COMPOUND_TYPE type) { return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD); } @@ -267,8 +263,8 @@ typedef struct MB_MODE_INFO { int mi_row; int mi_col; #endif - int num_proj_ref[2]; - WarpedMotionParams wm_params[2]; + int num_proj_ref; + WarpedMotionParams wm_params; // Index of the alpha Cb and alpha Cr combination int cfl_alpha_idx; @@ -376,7 +372,7 @@ static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, } #endif -enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 }; +enum ATTRIBUTE_PACKED mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 }; struct buf_2d { uint8_t *buf; @@ -500,6 +496,8 @@ typedef struct jnt_comp_params { int bck_offset; } JNT_COMP_PARAMS; +// Most/all of the pointers are mere pointers to actual arrays are allocated +// elsewhere. This is mostly for coding convenience. typedef struct macroblockd { struct macroblockd_plane plane[MAX_MB_PLANE]; @@ -544,7 +542,7 @@ typedef struct macroblockd { SgrprojInfo sgrproj_info[MAX_MB_PLANE]; // block dimension in the unit of mode_info. - uint8_t n8_w, n8_h; + uint8_t n4_w, n4_h; uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; @@ -599,6 +597,9 @@ typedef struct macroblockd { uint16_t cb_offset[MAX_MB_PLANE]; uint16_t txb_offset[MAX_MB_PLANE]; uint16_t color_index_map_offset[2]; + + CONV_BUF_TYPE *tmp_conv_dst; + uint8_t *tmp_obmc_bufs[2]; } MACROBLOCKD; static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) { @@ -623,6 +624,11 @@ static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) { } } +// For a square block size 'bsize', returns the size of the sub-blocks used by +// the given partition type. If the partition produces sub-blocks of different +// sizes, then the function returns the largest sub-block size. +// Implements the Partition_Subsize lookup table in the spec (Section 9.3. +// Conversion tables). // Note: the input block size should be square. // Otherwise it's considered invalid. static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize, @@ -781,6 +787,8 @@ static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, return intra_mode_to_tx_type(mbmi, plane_type); } +// Implements the get_plane_residual_size() function in the spec (Section +// 5.11.38. Get plane residual size function). static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, int subsampling_x, int subsampling_y) { @@ -952,15 +960,6 @@ typedef void (*foreach_transformed_block_visitor)(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); -void av1_foreach_transformed_block_in_plane( - const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, - foreach_transformed_block_visitor visit, void *arg); - -void av1_foreach_transformed_block(const MACROBLOCKD *const xd, - BLOCK_SIZE bsize, int mi_row, int mi_col, - foreach_transformed_block_visitor visit, - void *arg, const int num_planes); - void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, int aoff, int loff); @@ -976,7 +975,7 @@ static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) { } static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) { - return (mode >= NEARESTMV) && (mode <= NEWMV); + return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END); } static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) { @@ -1045,7 +1044,7 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, is_motion_variation_allowed_compound(mbmi)) { if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION; assert(!has_second_ref(mbmi)); - if (mbmi->num_proj_ref[0] >= 1 && + if (mbmi->num_proj_ref >= 1 && (allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) { if (xd->cur_frame_force_integer_mv) { return OBMC_CAUSAL; @@ -1174,4 +1173,4 @@ static INLINE int av1_get_max_eob(TX_SIZE tx_size) { } // extern "C" #endif -#endif // AV1_COMMON_BLOCKD_H_ +#endif // AOM_AV1_COMMON_BLOCKD_H_ diff --git a/third_party/aom/av1/common/cdef.h b/third_party/aom/av1/common/cdef.h index 092230de9..3b2eac8a5 100644 --- a/third_party/aom/av1/common/cdef.h +++ b/third_party/aom/av1/common/cdef.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_CDEF_H_ -#define AV1_COMMON_CDEF_H_ +#ifndef AOM_AV1_COMMON_CDEF_H_ +#define AOM_AV1_COMMON_CDEF_H_ #define CDEF_STRENGTH_BITS 6 @@ -48,4 +48,4 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_COMMON_CDEF_H_ +#endif // AOM_AV1_COMMON_CDEF_H_ diff --git a/third_party/aom/av1/common/cdef_block.h b/third_party/aom/av1/common/cdef_block.h index 81c6da077..6b4452cd6 100644 --- a/third_party/aom/av1/common/cdef_block.h +++ b/third_party/aom/av1/common/cdef_block.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#if !defined(_CDEF_BLOCK_H) -#define _CDEF_BLOCK_H (1) +#ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_ +#define AOM_AV1_COMMON_CDEF_BLOCK_H_ #include "av1/common/odintrin.h" @@ -56,4 +56,4 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in, cdef_list *dlist, int cdef_count, int level, int sec_strength, int pri_damping, int sec_damping, int coeff_shift); -#endif +#endif // AOM_AV1_COMMON_CDEF_BLOCK_H_ diff --git a/third_party/aom/av1/common/cdef_block_simd.h b/third_party/aom/av1/common/cdef_block_simd.h index d24a7c0fa..14587a023 100644 --- a/third_party/aom/av1/common/cdef_block_simd.h +++ b/third_party/aom/av1/common/cdef_block_simd.h @@ -9,6 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ +#define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ + #include "config/av1_rtcd.h" #include "av1/common/cdef_block.h" @@ -913,3 +916,5 @@ void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, } } } + +#endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ diff --git a/third_party/aom/av1/common/cfl.h b/third_party/aom/av1/common/cfl.h index bc9fbce1b..d627891bf 100644 --- a/third_party/aom/av1/common/cfl.h +++ b/third_party/aom/av1/common/cfl.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_CFL_H_ -#define AV1_COMMON_CFL_H_ +#ifndef AOM_AV1_COMMON_CFL_H_ +#define AOM_AV1_COMMON_CFL_H_ #include "av1/common/blockd.h" #include "av1/common/onyxc_int.h" @@ -299,4 +299,4 @@ void cfl_predict_hbd_null(const int16_t *pred_buf_q3, uint16_t *dst, return pred[tx_size % TX_SIZES_ALL]; \ } -#endif // AV1_COMMON_CFL_H_ +#endif // AOM_AV1_COMMON_CFL_H_ diff --git a/third_party/aom/av1/common/common.h b/third_party/aom/av1/common/common.h index 72c6d3a1e..bed6083db 100644 --- a/third_party/aom/av1/common/common.h +++ b/third_party/aom/av1/common/common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_COMMON_H_ -#define AV1_COMMON_COMMON_H_ +#ifndef AOM_AV1_COMMON_COMMON_H_ +#define AOM_AV1_COMMON_COMMON_H_ /* Interface header for common constant data structures and lookup tables */ @@ -60,4 +60,4 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { } // extern "C" #endif -#endif // AV1_COMMON_COMMON_H_ +#endif // AOM_AV1_COMMON_COMMON_H_ diff --git a/third_party/aom/av1/common/common_data.h b/third_party/aom/av1/common/common_data.h index f521f10bf..46e455fdb 100644 --- a/third_party/aom/av1/common/common_data.h +++ b/third_party/aom/av1/common/common_data.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_COMMON_DATA_H_ -#define AV1_COMMON_COMMON_DATA_H_ +#ifndef AOM_AV1_COMMON_COMMON_DATA_H_ +#define AOM_AV1_COMMON_COMMON_DATA_H_ #include "av1/common/enums.h" #include "aom/aom_integer.h" @@ -20,34 +20,43 @@ extern "C" { #endif -// Log 2 conversion lookup tables in units of mode info(4x4). +// Log 2 conversion lookup tables in units of mode info (4x4). +// The Mi_Width_Log2 table in the spec (Section 9.3. Conversion tables). static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4 }; +// The Mi_Height_Log2 table in the spec (Section 9.3. Conversion tables). static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = { 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2 }; +// Width/height lookup tables in units of mode info (4x4). +// The Num_4x4_Blocks_Wide table in the spec (Section 9.3. Conversion tables). static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = { 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16 }; +// The Num_4x4_Blocks_High table in the spec (Section 9.3. Conversion tables). static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = { 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4 }; -// Width/height lookup tables in units of various block sizes +// Width/height lookup tables in units of samples. +// The Block_Width table in the spec (Section 9.3. Conversion tables). static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = { 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64, 64, 128, 128, 4, 16, 8, 32, 16, 64 }; +// The Block_Height table in the spec (Section 9.3. Conversion tables). static const uint8_t block_size_high[BLOCK_SIZES_ALL] = { 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, 32, 64, 128, 64, 128, 16, 4, 32, 8, 64, 16 }; -// AOMMIN(3, AOMMIN(b_width_log2(bsize), b_height_log2(bsize))) +// Maps a block size to a context. +// The Size_Group table in the spec (Section 9.3. Conversion tables). +// AOMMIN(3, AOMMIN(mi_size_wide_log2(bsize), mi_size_high_log2(bsize))) static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2 }; @@ -56,6 +65,8 @@ static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = { 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10 }; +// A compressed version of the Partition_Subsize table in the spec (9.3. +// Conversion tables), for square block sizes only. /* clang-format off */ static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = { { // PARTITION_NONE @@ -350,34 +361,36 @@ static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { TX_64X64, // TX_MODE_LARGEST TX_64X64, // TX_MODE_SELECT }; -/* clang-format on */ +// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual +// size function). static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES_ALL][2][2] = { - // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 - // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 - { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, - { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, - { { BLOCK_8X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, - { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } }, - { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_4X16, BLOCK_4X8 } }, - { { BLOCK_16X8, BLOCK_16X4 }, { BLOCK_8X8, BLOCK_8X4 } }, - { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } }, - { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_8X32, BLOCK_8X16 } }, - { { BLOCK_32X16, BLOCK_32X8 }, { BLOCK_16X16, BLOCK_16X8 } }, - { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } }, - { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_16X64, BLOCK_16X32 } }, - { { BLOCK_64X32, BLOCK_64X16 }, { BLOCK_32X32, BLOCK_32X16 } }, - { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } }, - { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } }, - { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } }, - { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } }, - { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_4X16, BLOCK_4X8 } }, - { { BLOCK_16X4, BLOCK_16X4 }, { BLOCK_8X4, BLOCK_8X4 } }, - { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } }, - { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } }, - { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } }, - { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } } + // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 + // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 + { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, + { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } }, + { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } }, + { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } }, + { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, + { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } }, + { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } }, + { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } }, + { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } }, + { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } }, + { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } }, + { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } }, + { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } }, + { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } }, + { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } }, + { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } }, + { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, + { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } }, + { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } }, + { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } }, + { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } }, + { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } } }; +/* clang-format on */ // Generates 5 bit field in which each bit set to 1 represents // a blocksize partition 11111 means we split 128x128, 64x64, 32x32, 16x16 @@ -430,4 +443,4 @@ static const int quant_dist_lookup_table[2][4][2] = { } // extern "C" #endif -#endif // AV1_COMMON_COMMON_DATA_H_ +#endif // AOM_AV1_COMMON_COMMON_DATA_H_ diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c index ed962c722..1f11126fc 100644 --- a/third_party/aom/av1/common/convolve.c +++ b/third_party/aom/av1/common/convolve.c @@ -173,6 +173,7 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_q4 & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; @@ -510,31 +511,73 @@ static void convolve_2d_scale_wrapper( y_step_qn, conv_params); } +// TODO(huisu@google.com): bilinear filtering only needs 2 taps in general. So +// we may create optimized code to do 2-tap filtering for all bilinear filtering +// usages, not just IntraBC. +static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + int subpel_x_q4, int subpel_y_q4, + ConvolveParams *conv_params) { + const InterpFilterParams *filter_params_x = + subpel_x_q4 ? &av1_intrabc_filter_params : NULL; + const InterpFilterParams *filter_params_y = + subpel_y_q4 ? &av1_intrabc_filter_params : NULL; + if (subpel_x_q4 != 0 && subpel_y_q4 != 0) { + av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, 0, 0, conv_params); + } else if (subpel_x_q4 != 0) { + av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, 0, 0, conv_params); + } else { + av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, 0, 0, conv_params); + } +} + void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilters interp_filters, const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, int y_step_q4, int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf) { + const struct scale_factors *sf, int is_intrabc) { + assert(IMPLIES(is_intrabc, !scaled)); (void)x_step_q4; (void)y_step_q4; (void)dst; (void)dst_stride; - InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1); - InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0); + + if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) { + convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4, + subpel_y_q4, conv_params); + return; + } + + InterpFilter filter_x = 0; + InterpFilter filter_y = 0; + const int need_filter_params_x = (subpel_x_q4 != 0) | scaled; + const int need_filter_params_y = (subpel_y_q4 != 0) | scaled; + if (need_filter_params_x) + filter_x = av1_extract_interp_filter(interp_filters, 1); + if (need_filter_params_y) + filter_y = av1_extract_interp_filter(interp_filters, 0); const InterpFilterParams *filter_params_x = - av1_get_interp_filter_params_with_block_size(filter_x, w); + need_filter_params_x + ? av1_get_interp_filter_params_with_block_size(filter_x, w) + : NULL; const InterpFilterParams *filter_params_y = - av1_get_interp_filter_params_with_block_size(filter_y, h); + need_filter_params_y + ? av1_get_interp_filter_params_with_block_size(filter_y, h) + : NULL; - if (scaled) + if (scaled) { convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params); - else + } else { sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound]( src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_q4, subpel_y_q4, conv_params); + } } void av1_highbd_convolve_2d_copy_sr_c( @@ -964,24 +1007,68 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, } } +static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, + int h, int subpel_x_q4, + int subpel_y_q4, + ConvolveParams *conv_params, + int bd) { + const InterpFilterParams *filter_params_x = + subpel_x_q4 ? &av1_intrabc_filter_params : NULL; + const InterpFilterParams *filter_params_y = + subpel_y_q4 ? &av1_intrabc_filter_params : NULL; + if (subpel_x_q4 != 0 && subpel_y_q4 != 0) { + av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, 0, 0, + conv_params, bd); + } else if (subpel_x_q4 != 0) { + av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, 0, 0, + conv_params, bd); + } else { + av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, 0, 0, + conv_params, bd); + } +} + void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int w, int h, InterpFilters interp_filters, const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, int y_step_q4, int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf, int bd) { + const struct scale_factors *sf, + int is_intrabc, int bd) { + assert(IMPLIES(is_intrabc, !scaled)); (void)x_step_q4; (void)y_step_q4; (void)dst_stride; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1); - InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0); + + if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, + subpel_x_q4, subpel_y_q4, conv_params, bd); + return; + } + + InterpFilter filter_x = 0; + InterpFilter filter_y = 0; + const int need_filter_params_x = (subpel_x_q4 != 0) | scaled; + const int need_filter_params_y = (subpel_y_q4 != 0) | scaled; + if (need_filter_params_x) + filter_x = av1_extract_interp_filter(interp_filters, 1); + if (need_filter_params_y) + filter_y = av1_extract_interp_filter(interp_filters, 0); const InterpFilterParams *filter_params_x = - av1_get_interp_filter_params_with_block_size(filter_x, w); + need_filter_params_x + ? av1_get_interp_filter_params_with_block_size(filter_x, w) + : NULL; const InterpFilterParams *filter_params_y = - av1_get_interp_filter_params_with_block_size(filter_y, h); + need_filter_params_y + ? av1_get_interp_filter_params_with_block_size(filter_y, h) + : NULL; if (scaled) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); @@ -1111,7 +1198,8 @@ void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1; + memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); assert(w <= MAX_SB_SIZE); assert(h <= MAX_SB_SIZE); diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h index bc2d4bccf..4109dd843 100644 --- a/third_party/aom/av1/common/convolve.h +++ b/third_party/aom/av1/common/convolve.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_AV1_CONVOLVE_H_ -#define AV1_COMMON_AV1_CONVOLVE_H_ +#ifndef AOM_AV1_COMMON_CONVOLVE_H_ +#define AOM_AV1_COMMON_CONVOLVE_H_ #include "av1/common/filter.h" #ifdef __cplusplus @@ -19,7 +19,6 @@ extern "C" { typedef uint16_t CONV_BUF_TYPE; typedef struct ConvolveParams { - int ref; int do_average; CONV_BUF_TYPE *dst; int dst_stride; @@ -59,15 +58,13 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, InterpFilters interp_filters, const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, int y_step_q4, int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf); + const struct scale_factors *sf, int is_intrabc); -static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average, - int plane, +static INLINE ConvolveParams get_conv_params_no_round(int do_average, int plane, CONV_BUF_TYPE *dst, int dst_stride, int is_compound, int bd) { ConvolveParams conv_params; - conv_params.ref = ref; conv_params.do_average = do_average; assert(IMPLIES(do_average, is_compound)); conv_params.is_compound = is_compound; @@ -88,15 +85,14 @@ static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average, return conv_params; } -static INLINE ConvolveParams get_conv_params(int ref, int do_average, int plane, +static INLINE ConvolveParams get_conv_params(int do_average, int plane, int bd) { - return get_conv_params_no_round(ref, do_average, plane, NULL, 0, 0, bd); + return get_conv_params_no_round(do_average, plane, NULL, 0, 0, bd); } static INLINE ConvolveParams get_conv_params_wiener(int bd) { ConvolveParams conv_params; (void)bd; - conv_params.ref = 0; conv_params.do_average = 0; conv_params.is_compound = 0; conv_params.round_0 = WIENER_ROUND0_BITS; @@ -119,10 +115,11 @@ void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, const int subpel_x_q4, int x_step_q4, const int subpel_y_q4, int y_step_q4, int scaled, ConvolveParams *conv_params, - const struct scale_factors *sf, int bd); + const struct scale_factors *sf, + int is_intrabc, int bd); #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_COMMON_AV1_CONVOLVE_H_ +#endif // AOM_AV1_COMMON_CONVOLVE_H_ diff --git a/third_party/aom/av1/common/entropy.h b/third_party/aom/av1/common/entropy.h index ef944c5a0..991692c2f 100644 --- a/third_party/aom/av1/common/entropy.h +++ b/third_party/aom/av1/common/entropy.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_ENTROPY_H_ -#define AV1_COMMON_ENTROPY_H_ +#ifndef AOM_AV1_COMMON_ENTROPY_H_ +#define AOM_AV1_COMMON_ENTROPY_H_ #include "config/aom_config.h" @@ -178,4 +178,4 @@ static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) { } // extern "C" #endif -#endif // AV1_COMMON_ENTROPY_H_ +#endif // AOM_AV1_COMMON_ENTROPY_H_ diff --git a/third_party/aom/av1/common/entropymode.h b/third_party/aom/av1/common/entropymode.h index 0bd2e20a1..7047f34d2 100644 --- a/third_party/aom/av1/common/entropymode.h +++ b/third_party/aom/av1/common/entropymode.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_ENTROPYMODE_H_ -#define AV1_COMMON_ENTROPYMODE_H_ +#ifndef AOM_AV1_COMMON_ENTROPYMODE_H_ +#define AOM_AV1_COMMON_ENTROPYMODE_H_ #include "av1/common/entropy.h" #include "av1/common/entropymv.h" @@ -186,6 +186,8 @@ void av1_set_default_mode_deltas(int8_t *mode_deltas); void av1_setup_frame_contexts(struct AV1Common *cm); void av1_setup_past_independence(struct AV1Common *cm); +// Returns (int)ceil(log2(n)). +// NOTE: This implementation only works for n <= 2^30. static INLINE int av1_ceil_log2(int n) { if (n < 2) return 0; int i = 1, p = 2; @@ -207,4 +209,4 @@ int av1_get_palette_color_index_context(const uint8_t *color_map, int stride, } // extern "C" #endif -#endif // AV1_COMMON_ENTROPYMODE_H_ +#endif // AOM_AV1_COMMON_ENTROPYMODE_H_ diff --git a/third_party/aom/av1/common/entropymv.c b/third_party/aom/av1/common/entropymv.c index 446aa433c..491337387 100644 --- a/third_party/aom/av1/common/entropymv.c +++ b/third_party/aom/av1/common/entropymv.c @@ -60,61 +60,6 @@ static const nmv_context default_nmv_context = { } }, }; -static const uint8_t log_in_base_2[] = { - 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10 -}; - -static INLINE int mv_class_base(MV_CLASS_TYPE c) { - return c ? CLASS0_SIZE << (c + 2) : 0; -} - -MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) { - const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096) - ? MV_CLASS_10 - : (MV_CLASS_TYPE)log_in_base_2[z >> 3]; - if (offset) *offset = z - mv_class_base(c); - return c; -} - void av1_init_mv_probs(AV1_COMMON *cm) { // NB: this sets CDFs too cm->fc->nmvc = default_nmv_context; diff --git a/third_party/aom/av1/common/entropymv.h b/third_party/aom/av1/common/entropymv.h index 02ca7b66b..fa818a2c1 100644 --- a/third_party/aom/av1/common/entropymv.h +++ b/third_party/aom/av1/common/entropymv.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_ENTROPYMV_H_ -#define AV1_COMMON_ENTROPYMV_H_ +#ifndef AOM_AV1_COMMON_ENTROPYMV_H_ +#define AOM_AV1_COMMON_ENTROPYMV_H_ #include "config/aom_config.h" @@ -91,16 +91,6 @@ typedef struct { nmv_component comps[2]; } nmv_context; -static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) { - if (mv->row == 0) { - return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ; - } else { - return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ; - } -} - -MV_CLASS_TYPE av1_get_mv_class(int z, int *offset); - typedef enum { MV_SUBPEL_NONE = -1, MV_SUBPEL_LOW_PRECISION = 0, @@ -111,4 +101,4 @@ typedef enum { } // extern "C" #endif -#endif // AV1_COMMON_ENTROPYMV_H_ +#endif // AOM_AV1_COMMON_ENTROPYMV_H_ diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h index 689c25f30..869c06ef2 100644 --- a/third_party/aom/av1/common/enums.h +++ b/third_party/aom/av1/common/enums.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_ENUMS_H_ -#define AV1_COMMON_ENUMS_H_ +#ifndef AOM_AV1_COMMON_ENUMS_H_ +#define AOM_AV1_COMMON_ENUMS_H_ #include "config/aom_config.h" @@ -274,7 +274,7 @@ typedef enum ATTRIBUTE_PACKED { TX_TYPES, } TX_TYPE; -typedef enum { +typedef enum ATTRIBUTE_PACKED { REG_REG, REG_SMOOTH, REG_SHARP, @@ -438,6 +438,8 @@ typedef enum ATTRIBUTE_PACKED { COMP_INTER_MODE_START = NEAREST_NEARESTMV, COMP_INTER_MODE_END = MB_MODE_COUNT, COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START, + INTER_MODE_START = NEARESTMV, + INTER_MODE_END = MB_MODE_COUNT, INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode. INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks } PREDICTION_MODE; @@ -478,7 +480,7 @@ typedef enum ATTRIBUTE_PACKED { INTERINTRA_MODES } INTERINTRA_MODE; -typedef enum { +typedef enum ATTRIBUTE_PACKED { COMPOUND_AVERAGE, COMPOUND_WEDGE, COMPOUND_DIFFWTD, @@ -614,4 +616,4 @@ typedef enum ATTRIBUTE_PACKED { } // extern "C" #endif -#endif // AV1_COMMON_ENUMS_H_ +#endif // AOM_AV1_COMMON_ENUMS_H_ diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h index 7f8ad583a..571422d11 100644 --- a/third_party/aom/av1/common/filter.h +++ b/third_party/aom/av1/common/filter.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_FILTER_H_ -#define AV1_COMMON_FILTER_H_ +#ifndef AOM_AV1_COMMON_FILTER_H_ +#define AOM_AV1_COMMON_FILTER_H_ #include @@ -139,6 +139,17 @@ static const InterpFilterParams BILINEAR } }; +// A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel +// MV for luma. If sub-sampling exists, chroma may possibly use half-pel MV. +DECLARE_ALIGNED(256, static const int16_t, av1_intrabc_bilinear_filter[2]) = { + 64, + 64, +}; + +static const InterpFilterParams av1_intrabc_filter_params = { + av1_intrabc_bilinear_filter, 2, 0, BILINEAR +}; + DECLARE_ALIGNED(256, static const InterpKernel, av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 }, @@ -181,6 +192,11 @@ av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter, return &av1_interp_filter_params_list[interp_filter]; } +static INLINE const InterpFilterParams *av1_get_4tap_interp_filter_params( + const InterpFilter interp_filter) { + return &av1_interp_4tap[interp_filter]; +} + static INLINE const int16_t *av1_get_interp_filter_kernel( const InterpFilter interp_filter) { return av1_interp_filter_params_list[interp_filter].filter_ptr; @@ -195,4 +211,4 @@ static INLINE const int16_t *av1_get_interp_filter_subpel_kernel( } // extern "C" #endif -#endif // AV1_COMMON_FILTER_H_ +#endif // AOM_AV1_COMMON_FILTER_H_ diff --git a/third_party/aom/av1/common/frame_buffers.c b/third_party/aom/av1/common/frame_buffers.c index 502ccd27d..fd6c4bc79 100644 --- a/third_party/aom/av1/common/frame_buffers.c +++ b/third_party/aom/av1/common/frame_buffers.c @@ -38,6 +38,17 @@ void av1_free_internal_frame_buffers(InternalFrameBufferList *list) { list->int_fb = NULL; } +void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) { + int i; + + assert(list != NULL); + + for (i = 0; i < list->num_internal_frame_buffers; ++i) { + if (list->int_fb[i].data && !list->int_fb[i].in_use) + memset(list->int_fb[i].data, 0, list->int_fb[i].size); + } +} + int av1_get_frame_buffer(void *cb_priv, size_t min_size, aom_codec_frame_buffer_t *fb) { int i; diff --git a/third_party/aom/av1/common/frame_buffers.h b/third_party/aom/av1/common/frame_buffers.h index e7341cfdd..16188e51c 100644 --- a/third_party/aom/av1/common/frame_buffers.h +++ b/third_party/aom/av1/common/frame_buffers.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_FRAME_BUFFERS_H_ -#define AV1_COMMON_FRAME_BUFFERS_H_ +#ifndef AOM_AV1_COMMON_FRAME_BUFFERS_H_ +#define AOM_AV1_COMMON_FRAME_BUFFERS_H_ #include "aom/aom_frame_buffer.h" #include "aom/aom_integer.h" @@ -36,6 +36,12 @@ int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list); // Free any data allocated to the frame buffers. void av1_free_internal_frame_buffers(InternalFrameBufferList *list); +// Zeros all unused internal frame buffers. In particular, this zeros the +// frame borders. Call this function after a sequence header change to +// re-initialize the frame borders for the different width, height, or bit +// depth. +void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list); + // Callback used by libaom to request an external frame buffer. |cb_priv| // Callback private data, which points to an InternalFrameBufferList. // |min_size| is the minimum size in bytes needed to decode the next frame. @@ -51,4 +57,4 @@ int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb); } // extern "C" #endif -#endif // AV1_COMMON_FRAME_BUFFERS_H_ +#endif // AOM_AV1_COMMON_FRAME_BUFFERS_H_ diff --git a/third_party/aom/av1/common/idct.c b/third_party/aom/av1/common/idct.c index bc758eb57..2c1cb9827 100644 --- a/third_party/aom/av1/common/idct.c +++ b/third_party/aom/av1/common/idct.c @@ -31,21 +31,16 @@ int av1_get_tx_scale(const TX_SIZE tx_size) { // that input and output could be the same buffer. // idct -static void highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, - int stride, int eob, int bd) { +void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd) { if (eob > 1) av1_highbd_iwht4x4_16_add(input, dest, stride, bd); else av1_highbd_iwht4x4_1_add(input, dest, stride, bd); } -static const int32_t *cast_to_int32(const tran_low_t *input) { - assert(sizeof(int32_t) == sizeof(tran_low_t)); - return (const int32_t *)input; -} - -void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); int eob = txfm_param->eob; int bd = txfm_param->bd; @@ -54,206 +49,150 @@ void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, const TX_TYPE tx_type = txfm_param->tx_type; if (lossless) { assert(tx_type == DCT_DCT); - highbd_iwht4x4_add(input, dest, stride, eob, bd); + av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); return; } - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - } + + av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } -static void highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_4x8(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_8x4(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_8x16(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_16x8(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_16x32(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_32x16(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_16x4(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_4x16(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_32x8(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_8x32(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_32x64(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_64x32(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_16x64(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); -} -static void highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - const int32_t *src = cast_to_int32(input); - av1_inv_txfm2d_add_64x16(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } -static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + + av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); - break; - } } -static void highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; +void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - default: - av1_inv_txfm2d_add_16x16(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - } + av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); } -static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_inv_txfm2d_add_32x32(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - // Assembly version doesn't support IDTX, so use C version for it. - case IDTX: - av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - default: assert(0); - } + av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); } -static void highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { const int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); assert(tx_type == DCT_DCT); - av1_inv_txfm2d_add_64x64(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); + av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); } static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, @@ -270,70 +209,70 @@ static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); } -static void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { +void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { case TX_32X32: - highbd_inv_txfm_add_32x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param); break; case TX_16X16: - highbd_inv_txfm_add_16x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param); break; case TX_8X8: - highbd_inv_txfm_add_8x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param); break; case TX_4X8: - highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); break; case TX_8X4: - highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); break; case TX_8X16: - highbd_inv_txfm_add_8x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param); break; case TX_16X8: - highbd_inv_txfm_add_16x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param); break; case TX_16X32: - highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); break; case TX_32X16: - highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); break; case TX_64X64: - highbd_inv_txfm_add_64x64(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param); break; case TX_32X64: - highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); break; case TX_64X32: - highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); break; case TX_16X64: - highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param); break; case TX_64X16: - highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param); break; case TX_4X4: // this is like av1_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - av1_highbd_inv_txfm_add_4x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param); break; case TX_16X4: - highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); break; case TX_4X16: - highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); break; case TX_8X32: - highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); break; case TX_32X8: - highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); + av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); break; default: assert(0 && "Invalid transform size"); break; } @@ -352,7 +291,8 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, } } - highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride, txfm_param); + av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride, + txfm_param); for (int r = 0; r < h; ++r) { for (int c = 0; c < w; ++c) { @@ -375,7 +315,7 @@ void av1_inverse_transform_block(const MACROBLOCKD *xd, assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]); if (txfm_param.is_hbd) { - highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param); + av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param); } else { av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param); } diff --git a/third_party/aom/av1/common/idct.h b/third_party/aom/av1/common/idct.h index 50032a167..d9454e73f 100644 --- a/third_party/aom/av1/common/idct.h +++ b/third_party/aom/av1/common/idct.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_IDCT_H_ -#define AV1_COMMON_IDCT_H_ +#ifndef AOM_AV1_COMMON_IDCT_H_ +#define AOM_AV1_COMMON_IDCT_H_ #include "config/aom_config.h" @@ -36,11 +36,32 @@ void av1_inverse_transform_block(const MACROBLOCKD *xd, const tran_low_t *dqcoeff, int plane, TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst, int stride, int eob, int reduced_tx_set); +void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd); + +static INLINE const int32_t *cast_to_int32(const tran_low_t *input) { + assert(sizeof(int32_t) == sizeof(tran_low_t)); + return (const int32_t *)input; +} + +typedef void(highbd_inv_txfm_add)(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *param); + +highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x8; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x4; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x32; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x16; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x64; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x32; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x64; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x16; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x4; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x16; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x32; +highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x8; -void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *param); #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_COMMON_IDCT_H_ +#endif // AOM_AV1_COMMON_IDCT_H_ diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h index c2495640e..5b0225192 100644 --- a/third_party/aom/av1/common/mv.h +++ b/third_party/aom/av1/common/mv.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_MV_H_ -#define AV1_COMMON_MV_H_ +#ifndef AOM_AV1_COMMON_MV_H_ +#define AOM_AV1_COMMON_MV_H_ #include "av1/common/common.h" #include "av1/common/common_data.h" @@ -56,7 +56,7 @@ typedef struct mv32 { #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS) /* clang-format off */ -typedef enum { +typedef enum ATTRIBUTE_PACKED { IDENTITY = 0, // identity transformation, 0-parameter TRANSLATION = 1, // translational motion 2-parameter ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter @@ -298,4 +298,4 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row, } // extern "C" #endif -#endif // AV1_COMMON_MV_H_ +#endif // AOM_AV1_COMMON_MV_H_ diff --git a/third_party/aom/av1/common/mvref_common.c b/third_party/aom/av1/common/mvref_common.c index 6939df335..7f24ab4e6 100644 --- a/third_party/aom/av1/common/mvref_common.c +++ b/third_party/aom/av1/common/mvref_common.c @@ -27,16 +27,19 @@ static void get_mv_projection(MV *output, MV ref, int num, int den) { den = AOMMIN(den, MAX_FRAME_DISTANCE); num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE) : AOMMAX(num, -MAX_FRAME_DISTANCE); - int mv_row = ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14); - int mv_col = ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14); + const int mv_row = + ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14); + const int mv_col = + ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14); const int clamp_max = MV_UPP - 1; const int clamp_min = MV_LOW + 1; output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max); output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max); } -void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi, - int mi_row, int mi_col, int x_mis, int y_mis) { +void av1_copy_frame_mvs(const AV1_COMMON *const cm, + const MB_MODE_INFO *const mi, int mi_row, int mi_col, + int x_mis, int y_mis) { const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1); MV_REF *frame_mvs = cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1); @@ -141,38 +144,37 @@ static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_row_offset, int *processed_rows) { - int end_mi = AOMMIN(xd->n8_w, cm->mi_cols - mi_col); + int end_mi = AOMMIN(xd->n4_w, cm->mi_cols - mi_col); end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]); const int n8_w_8 = mi_size_wide[BLOCK_8X8]; const int n8_w_16 = mi_size_wide[BLOCK_16X16]; int i; int col_offset = 0; - const int shift = 0; // TODO(jingning): Revisit this part after cb4x4 is stable. if (abs(row_offset) > 1) { col_offset = 1; - if ((mi_col & 0x01) && xd->n8_w < n8_w_8) --col_offset; + if ((mi_col & 0x01) && xd->n4_w < n8_w_8) --col_offset; } - const int use_step_16 = (xd->n8_w >= 16); + const int use_step_16 = (xd->n4_w >= 16); MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride; (void)mi_row; for (i = 0; i < end_mi;) { const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i]; const int candidate_bsize = candidate->sb_type; - const int n8_w = mi_size_wide[candidate_bsize]; - int len = AOMMIN(xd->n8_w, n8_w); + const int n4_w = mi_size_wide[candidate_bsize]; + int len = AOMMIN(xd->n4_w, n4_w); if (use_step_16) len = AOMMAX(n8_w_16, len); else if (abs(row_offset) > 1) len = AOMMAX(len, n8_w_8); int weight = 2; - if (xd->n8_w >= n8_w_8 && xd->n8_w <= n8_w) { + if (xd->n4_w >= n8_w_8 && xd->n4_w <= n4_w) { int inc = AOMMIN(-max_row_offset + row_offset + 1, mi_size_high[candidate_bsize]); // Obtain range used in weight calculation. - weight = AOMMAX(weight, (inc << shift)); + weight = AOMMAX(weight, inc); // Update processed rows. *processed_rows = inc - row_offset - 1; } @@ -192,37 +194,36 @@ static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_col_offset, int *processed_cols) { - int end_mi = AOMMIN(xd->n8_h, cm->mi_rows - mi_row); + int end_mi = AOMMIN(xd->n4_h, cm->mi_rows - mi_row); end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]); const int n8_h_8 = mi_size_high[BLOCK_8X8]; const int n8_h_16 = mi_size_high[BLOCK_16X16]; int i; int row_offset = 0; - const int shift = 0; if (abs(col_offset) > 1) { row_offset = 1; - if ((mi_row & 0x01) && xd->n8_h < n8_h_8) --row_offset; + if ((mi_row & 0x01) && xd->n4_h < n8_h_8) --row_offset; } - const int use_step_16 = (xd->n8_h >= 16); + const int use_step_16 = (xd->n4_h >= 16); (void)mi_col; for (i = 0; i < end_mi;) { const MB_MODE_INFO *const candidate = xd->mi[(row_offset + i) * xd->mi_stride + col_offset]; const int candidate_bsize = candidate->sb_type; - const int n8_h = mi_size_high[candidate_bsize]; - int len = AOMMIN(xd->n8_h, n8_h); + const int n4_h = mi_size_high[candidate_bsize]; + int len = AOMMIN(xd->n4_h, n4_h); if (use_step_16) len = AOMMAX(n8_h_16, len); else if (abs(col_offset) > 1) len = AOMMAX(len, n8_h_8); int weight = 2; - if (xd->n8_h >= n8_h_8 && xd->n8_h <= n8_h) { + if (xd->n4_h >= n8_h_8 && xd->n4_h <= n4_h) { int inc = AOMMIN(-max_col_offset + col_offset + 1, mi_size_wide[candidate_bsize]); // Obtain range used in weight calculation. - weight = AOMMAX(weight, (inc << shift)); + weight = AOMMAX(weight, inc); // Update processed cols. *processed_cols = inc - col_offset - 1; } @@ -248,7 +249,7 @@ static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, mi_pos.row = row_offset; mi_pos.col = col_offset; - if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) { + if (is_inside(tile, mi_col, mi_row, &mi_pos)) { const MB_MODE_INFO *const candidate = xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col]; const int len = mi_size_wide[BLOCK_8X8]; @@ -290,19 +291,19 @@ static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd, // The left hand of two vertical rectangles always has a top right (as the // block above will have been decoded) - if (xd->n8_w < xd->n8_h) + if (xd->n4_w < xd->n4_h) if (!xd->is_sec_rect) has_tr = 1; // The bottom of two horizontal rectangles never has a top right (as the block // to the right won't have been decoded) - if (xd->n8_w > xd->n8_h) + if (xd->n4_w > xd->n4_h) if (xd->is_sec_rect) has_tr = 0; // The bottom left square of a Vertical A (in the old format) does // not have a top right as it is decoded before the right hand // rectangle of the partition if (xd->mi[0]->partition == PARTITION_VERT_A) { - if (xd->n8_w == xd->n8_h) + if (xd->n4_w == xd->n4_h) if (mask_row & bs) has_tr = 0; } @@ -335,7 +336,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1; mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1; - if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) return 0; + if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0; const TPL_MV_REF *prev_frame_mvs = cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_stride >> 1) + @@ -430,20 +431,75 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, return 0; } +static void process_compound_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, + const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2], + int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) { + for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx]; + + for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) { + if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) { + ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx]; + ++ref_id_count[cmp_idx]; + } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[can_rf] != + cm->ref_frame_sign_bias[rf[cmp_idx]]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv; + ++ref_diff_count[cmp_idx]; + } + } + } +} + +static void process_single_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, + MV_REFERENCE_FRAME ref_frame, uint8_t refmv_count[MODE_CTX_REF_FRAMES], + CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE]) { + for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + if (candidate->ref_frame[rf_idx] > INTRA_FRAME) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] != + cm->ref_frame_sign_bias[ref_frame]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + int stack_idx; + for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) { + const int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv; + if (this_mv.as_int == stack_mv.as_int) break; + } + + if (stack_idx == refmv_count[ref_frame]) { + ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv; + + // TODO(jingning): Set an arbitrary small number here. The weight + // doesn't matter as long as it is properly initialized. + ref_mv_stack[ref_frame][stack_idx].weight = 2; + ++refmv_count[ref_frame]; + } + } + } +} + static void setup_ref_mv_list( const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame, uint8_t refmv_count[MODE_CTX_REF_FRAMES], CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates, int mi_row, int mi_col, int16_t *mode_context) { - const int bs = AOMMAX(xd->n8_w, xd->n8_h); + const int bs = AOMMAX(xd->n4_w, xd->n4_h); const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs); MV_REFERENCE_FRAME rf[2]; const TileInfo *const tile = &xd->tile; int max_row_offset = 0, max_col_offset = 0; - const int row_adj = (xd->n8_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01); - const int col_adj = (xd->n8_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01); + const int row_adj = (xd->n4_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01); + const int col_adj = (xd->n4_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01); int processed_rows = 0; int processed_cols = 0; @@ -455,17 +511,16 @@ static void setup_ref_mv_list( if (xd->up_available) { max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj; - if (xd->n8_h < mi_size_high[BLOCK_8X8]) + if (xd->n4_h < mi_size_high[BLOCK_8X8]) max_row_offset = -(2 << 1) + row_adj; - max_row_offset = - find_valid_row_offset(tile, mi_row, cm->mi_rows, max_row_offset); + max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset); } if (xd->left_available) { max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj; - if (xd->n8_w < mi_size_wide[BLOCK_8X8]) + if (xd->n4_w < mi_size_wide[BLOCK_8X8]) max_col_offset = -(2 << 1) + col_adj; max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset); @@ -487,12 +542,12 @@ static void setup_ref_mv_list( gm_mv_candidates, max_col_offset, &processed_cols); // Check top-right boundary if (has_tr) - scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n8_w, + scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n4_w, ref_mv_stack[ref_frame], &row_match_count, &newmv_count, gm_mv_candidates, &refmv_count[ref_frame]); - uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0); - uint8_t nearest_refmv_count = refmv_count[ref_frame]; + const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0); + const uint8_t nearest_refmv_count = refmv_count[ref_frame]; // TODO(yunqing): for comp_search, do it for all 3 cases. for (int idx = 0; idx < nearest_refmv_count; ++idx) @@ -500,27 +555,27 @@ static void setup_ref_mv_list( if (cm->allow_ref_frame_mvs) { int is_available = 0; - const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n8_h); - const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n8_w); - const int blk_row_end = AOMMIN(xd->n8_h, mi_size_high[BLOCK_64X64]); - const int blk_col_end = AOMMIN(xd->n8_w, mi_size_wide[BLOCK_64X64]); + const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n4_h); + const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n4_w); + const int blk_row_end = AOMMIN(xd->n4_h, mi_size_high[BLOCK_64X64]); + const int blk_col_end = AOMMIN(xd->n4_w, mi_size_wide[BLOCK_64X64]); const int tpl_sample_pos[3][2] = { { voffset, -2 }, { voffset, hoffset }, { voffset - 2, hoffset }, }; - const int allow_extension = (xd->n8_h >= mi_size_high[BLOCK_8X8]) && - (xd->n8_h < mi_size_high[BLOCK_64X64]) && - (xd->n8_w >= mi_size_wide[BLOCK_8X8]) && - (xd->n8_w < mi_size_wide[BLOCK_64X64]); - - int step_h = (xd->n8_h >= mi_size_high[BLOCK_64X64]) - ? mi_size_high[BLOCK_16X16] - : mi_size_high[BLOCK_8X8]; - int step_w = (xd->n8_w >= mi_size_wide[BLOCK_64X64]) - ? mi_size_wide[BLOCK_16X16] - : mi_size_wide[BLOCK_8X8]; + const int allow_extension = (xd->n4_h >= mi_size_high[BLOCK_8X8]) && + (xd->n4_h < mi_size_high[BLOCK_64X64]) && + (xd->n4_w >= mi_size_wide[BLOCK_8X8]) && + (xd->n4_w < mi_size_wide[BLOCK_64X64]); + + const int step_h = (xd->n4_h >= mi_size_high[BLOCK_64X64]) + ? mi_size_high[BLOCK_16X16] + : mi_size_high[BLOCK_8X8]; + const int step_w = (xd->n4_w >= mi_size_wide[BLOCK_64X64]) + ? mi_size_wide[BLOCK_16X16] + : mi_size_wide[BLOCK_8X8]; for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) { for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) { @@ -569,7 +624,7 @@ static void setup_ref_mv_list( max_col_offset, &processed_cols); } - uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0); + const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0); switch (nearest_match) { case 0: @@ -636,62 +691,24 @@ static void setup_ref_mv_list( int_mv ref_id[2][2], ref_diff[2][2]; int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 }; - int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w); + int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w); mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col); - int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h); + int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h); mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row); int mi_size = AOMMIN(mi_width, mi_height); for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) { const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; - const int candidate_bsize = candidate->sb_type; - - for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { - MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx]; - - for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) { - if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) { - ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx]; - ++ref_id_count[cmp_idx]; - } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) { - int_mv this_mv = candidate->mv[rf_idx]; - if (cm->ref_frame_sign_bias[can_rf] != - cm->ref_frame_sign_bias[rf[cmp_idx]]) { - this_mv.as_mv.row = -this_mv.as_mv.row; - this_mv.as_mv.col = -this_mv.as_mv.col; - } - ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv; - ++ref_diff_count[cmp_idx]; - } - } - } - idx += mi_size_wide[candidate_bsize]; + process_compound_ref_mv_candidate( + candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count); + idx += mi_size_wide[candidate->sb_type]; } for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) { const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; - const int candidate_bsize = candidate->sb_type; - - for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { - MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx]; - - for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) { - if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) { - ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx]; - ++ref_id_count[cmp_idx]; - } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) { - int_mv this_mv = candidate->mv[rf_idx]; - if (cm->ref_frame_sign_bias[can_rf] != - cm->ref_frame_sign_bias[rf[cmp_idx]]) { - this_mv.as_mv.row = -this_mv.as_mv.row; - this_mv.as_mv.col = -this_mv.as_mv.col; - } - ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv; - ++ref_diff_count[cmp_idx]; - } - } - } - idx += mi_size_high[candidate_bsize]; + process_compound_ref_mv_candidate( + candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count); + idx += mi_size_high[candidate->sb_type]; } // Build up the compound mv predictor @@ -743,87 +760,37 @@ static void setup_ref_mv_list( for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) { clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv, - xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd); + xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd); clamp_mv_ref(&ref_mv_stack[ref_frame][idx].comp_mv.as_mv, - xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd); + xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd); } } else { // Handle single reference frame extension - int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n8_w); + int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w); mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col); - int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n8_h); + int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h); mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row); int mi_size = AOMMIN(mi_width, mi_height); for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size && refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) { const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; - const int candidate_bsize = candidate->sb_type; - - // TODO(jingning): Refactor the following code. - for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { - if (candidate->ref_frame[rf_idx] > INTRA_FRAME) { - int_mv this_mv = candidate->mv[rf_idx]; - if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] != - cm->ref_frame_sign_bias[ref_frame]) { - this_mv.as_mv.row = -this_mv.as_mv.row; - this_mv.as_mv.col = -this_mv.as_mv.col; - } - int stack_idx; - for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) { - int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv; - if (this_mv.as_int == stack_mv.as_int) break; - } - - if (stack_idx == refmv_count[ref_frame]) { - ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv; - - // TODO(jingning): Set an arbitrary small number here. The weight - // doesn't matter as long as it is properly initialized. - ref_mv_stack[ref_frame][stack_idx].weight = 2; - ++refmv_count[ref_frame]; - } - } - } - idx += mi_size_wide[candidate_bsize]; + process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, + ref_mv_stack); + idx += mi_size_wide[candidate->sb_type]; } for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size && refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) { const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; - const int candidate_bsize = candidate->sb_type; - - // TODO(jingning): Refactor the following code. - for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { - if (candidate->ref_frame[rf_idx] > INTRA_FRAME) { - int_mv this_mv = candidate->mv[rf_idx]; - if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] != - cm->ref_frame_sign_bias[ref_frame]) { - this_mv.as_mv.row = -this_mv.as_mv.row; - this_mv.as_mv.col = -this_mv.as_mv.col; - } - int stack_idx; - for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) { - int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv; - if (this_mv.as_int == stack_mv.as_int) break; - } - - if (stack_idx == refmv_count[ref_frame]) { - ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv; - - // TODO(jingning): Set an arbitrary small number here. The weight - // doesn't matter as long as it is properly initialized. - ref_mv_stack[ref_frame][stack_idx].weight = 2; - ++refmv_count[ref_frame]; - } - } - } - idx += mi_size_high[candidate_bsize]; + process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, + ref_mv_stack); + idx += mi_size_high[candidate->sb_type]; } for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) { clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv, - xd->n8_w << MI_SIZE_LOG2, xd->n8_h << MI_SIZE_LOG2, xd); + xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd); } if (mv_ref_list != NULL) { @@ -936,8 +903,10 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row, const int col_offset = (mv.col >= 0) ? (mv.col >> (4 + MI_SIZE_LOG2)) : -((-mv.col) >> (4 + MI_SIZE_LOG2)); - int row = (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset; - int col = (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset; + const int row = + (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset; + const int col = + (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset; if (row < 0 || row >= (cm->mi_rows >> 1) || col < 0 || col >= (cm->mi_cols >> 1)) @@ -955,37 +924,44 @@ static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row, return 1; } -static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame, - int dir) { +// Note: motion_filed_projection finds motion vectors of current frame's +// reference frame, and projects them to current frame. To make it clear, +// let's call current frame's reference frame as start frame. +// Call Start frame's reference frames as reference frames. +// Call ref_offset as frame distances between start frame and its reference +// frames. +static int motion_field_projection(AV1_COMMON *cm, + MV_REFERENCE_FRAME start_frame, int dir) { TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; int ref_offset[REF_FRAMES] = { 0 }; (void)dir; - int ref_frame_idx = cm->frame_refs[FWD_RF_OFFSET(ref_frame)].idx; - if (ref_frame_idx < 0) return 0; + const int start_frame_idx = cm->frame_refs[FWD_RF_OFFSET(start_frame)].idx; + if (start_frame_idx < 0) return 0; - if (cm->buffer_pool->frame_bufs[ref_frame_idx].intra_only) return 0; + if (cm->buffer_pool->frame_bufs[start_frame_idx].intra_only) return 0; - if (cm->buffer_pool->frame_bufs[ref_frame_idx].mi_rows != cm->mi_rows || - cm->buffer_pool->frame_bufs[ref_frame_idx].mi_cols != cm->mi_cols) + if (cm->buffer_pool->frame_bufs[start_frame_idx].mi_rows != cm->mi_rows || + cm->buffer_pool->frame_bufs[start_frame_idx].mi_cols != cm->mi_cols) return 0; - int ref_frame_index = - cm->buffer_pool->frame_bufs[ref_frame_idx].cur_frame_offset; - unsigned int *ref_rf_idx = - &cm->buffer_pool->frame_bufs[ref_frame_idx].ref_frame_offset[0]; - int cur_frame_index = cm->cur_frame->cur_frame_offset; - int ref_to_cur = get_relative_dist(cm, ref_frame_index, cur_frame_index); + const int start_frame_offset = + cm->buffer_pool->frame_bufs[start_frame_idx].cur_frame_offset; + const unsigned int *const ref_frame_offsets = + &cm->buffer_pool->frame_bufs[start_frame_idx].ref_frame_offset[0]; + const int cur_frame_offset = cm->cur_frame->cur_frame_offset; + int start_to_current_frame_offset = + get_relative_dist(cm, start_frame_offset, cur_frame_offset); for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) { - ref_offset[rf] = - get_relative_dist(cm, ref_frame_index, ref_rf_idx[rf - LAST_FRAME]); + ref_offset[rf] = get_relative_dist(cm, start_frame_offset, + ref_frame_offsets[rf - LAST_FRAME]); } - if (dir == 2) ref_to_cur = -ref_to_cur; + if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset; - MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[ref_frame_idx].mvs; + MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[start_frame_idx].mvs; const int mvs_rows = (cm->mi_rows + 1) >> 1; const int mvs_cols = (cm->mi_cols + 1) >> 1; @@ -999,19 +975,20 @@ static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame, int mi_r, mi_c; const int ref_frame_offset = ref_offset[mv_ref->ref_frame]; - int pos_valid = abs(ref_frame_offset) <= MAX_FRAME_DISTANCE && - ref_frame_offset > 0 && - abs(ref_to_cur) <= MAX_FRAME_DISTANCE; + int pos_valid = + abs(ref_frame_offset) <= MAX_FRAME_DISTANCE && + ref_frame_offset > 0 && + abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE; if (pos_valid) { - get_mv_projection(&this_mv.as_mv, fwd_mv, ref_to_cur, - ref_frame_offset); + get_mv_projection(&this_mv.as_mv, fwd_mv, + start_to_current_frame_offset, ref_frame_offset); pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col, this_mv.as_mv, dir >> 1); } if (pos_valid) { - int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c; + const int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c; tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row; tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col; @@ -1167,14 +1144,14 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, if (up_available) { int mi_row_offset = -1; MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * xd->mi_stride]; - uint8_t n8_w = mi_size_wide[mbmi->sb_type]; + uint8_t n4_w = mi_size_wide[mbmi->sb_type]; - if (xd->n8_w <= n8_w) { + if (xd->n4_w <= n4_w) { // Handle "current block width <= above block width" case. - int col_offset = -mi_col % n8_w; + int col_offset = -mi_col % n4_w; if (col_offset < 0) do_tl = 0; - if (col_offset + n8_w > xd->n8_w) do_tr = 0; + if (col_offset + n4_w > xd->n4_w) do_tr = 0; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1); @@ -1185,11 +1162,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, } } else { // Handle "current block width > above block width" case. - for (i = 0; i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) { + for (i = 0; i < AOMMIN(xd->n4_w, cm->mi_cols - mi_col); i += mi_step) { int mi_col_offset = i; mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - n8_w = mi_size_wide[mbmi->sb_type]; - mi_step = AOMMIN(xd->n8_w, n8_w); + n4_w = mi_size_wide[mbmi->sb_type]; + mi_step = AOMMIN(xd->n4_w, n4_w); if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { @@ -1209,11 +1186,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, int mi_col_offset = -1; MB_MODE_INFO *mbmi = xd->mi[mi_col_offset]; - uint8_t n8_h = mi_size_high[mbmi->sb_type]; + uint8_t n4_h = mi_size_high[mbmi->sb_type]; - if (xd->n8_h <= n8_h) { + if (xd->n4_h <= n4_h) { // Handle "current block height <= above block height" case. - int row_offset = -mi_row % n8_h; + int row_offset = -mi_row % n4_h; if (row_offset < 0) do_tl = 0; @@ -1226,11 +1203,11 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, } } else { // Handle "current block height > above block height" case. - for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) { + for (i = 0; i < AOMMIN(xd->n4_h, cm->mi_rows - mi_row); i += mi_step) { int mi_row_offset = i; mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; - n8_h = mi_size_high[mbmi->sb_type]; - mi_step = AOMMIN(xd->n8_h, n8_h); + n4_h = mi_size_high[mbmi->sb_type]; + mi_step = AOMMIN(xd->n4_h, n4_h); if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { @@ -1264,18 +1241,18 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, // Top-right block if (do_tr && - has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n8_w, xd->n8_h))) { - POSITION trb_pos = { -1, xd->n8_w }; + has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n4_w, xd->n4_h))) { + POSITION trb_pos = { -1, xd->n4_w }; - if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &trb_pos)) { + if (is_inside(tile, mi_col, mi_row, &trb_pos)) { int mi_row_offset = -1; - int mi_col_offset = xd->n8_w; + int mi_col_offset = xd->n4_w; MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { - record_samples(mbmi, pts, pts_inref, 0, -1, xd->n8_w, 1); + record_samples(mbmi, pts, pts_inref, 0, -1, xd->n4_w, 1); np++; if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } @@ -1372,7 +1349,7 @@ static int compare_ref_frame_info(const void *arg_a, const void *arg_b) { static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx, REF_FRAME_INFO *ref_info) { - assert(frame_idx >= 0 && frame_idx <= INTER_REFS_PER_FRAME); + assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME); const int buf_idx = ref_info->buf_idx; diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h index f68c159e1..83f7a1ac0 100644 --- a/third_party/aom/av1/common/mvref_common.h +++ b/third_party/aom/av1/common/mvref_common.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_MVREF_COMMON_H_ -#define AV1_COMMON_MVREF_COMMON_H_ +#ifndef AOM_AV1_COMMON_MVREF_COMMON_H_ +#define AOM_AV1_COMMON_MVREF_COMMON_H_ #include "av1/common/onyxc_int.h" #include "av1/common/blockd.h" @@ -85,29 +85,17 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, // Checks that the given mi_row, mi_col and search point // are inside the borders of the tile. static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row, - int mi_rows, const POSITION *mi_pos) { - const int dependent_horz_tile_flag = 0; - if (dependent_horz_tile_flag && !tile->tg_horz_boundary) { - return !(mi_row + mi_pos->row < 0 || - mi_col + mi_pos->col < tile->mi_col_start || - mi_row + mi_pos->row >= mi_rows || - mi_col + mi_pos->col >= tile->mi_col_end); - } else { - return !(mi_row + mi_pos->row < tile->mi_row_start || - mi_col + mi_pos->col < tile->mi_col_start || - mi_row + mi_pos->row >= tile->mi_row_end || - mi_col + mi_pos->col >= tile->mi_col_end); - } + const POSITION *mi_pos) { + return !(mi_row + mi_pos->row < tile->mi_row_start || + mi_col + mi_pos->col < tile->mi_col_start || + mi_row + mi_pos->row >= tile->mi_row_end || + mi_col + mi_pos->col >= tile->mi_col_end); } static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row, - int mi_rows, int row_offset) { - const int dependent_horz_tile_flag = 0; - if (dependent_horz_tile_flag && !tile->tg_horz_boundary) - return clamp(row_offset, -mi_row, mi_rows - mi_row - 1); - else - return clamp(row_offset, tile->mi_row_start - mi_row, - tile->mi_row_end - mi_row - 1); + int row_offset) { + return clamp(row_offset, tile->mi_row_start - mi_row, + tile->mi_row_end - mi_row - 1); } static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col, @@ -263,8 +251,9 @@ static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) { } } -void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi, - int mi_row, int mi_col, int x_mis, int y_mis); +void av1_copy_frame_mvs(const AV1_COMMON *const cm, + const MB_MODE_INFO *const mi, int mi_row, int mi_col, + int x_mis, int y_mis); void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, @@ -286,7 +275,6 @@ int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, #define INTRABC_DELAY_PIXELS 256 // Delay of 256 pixels #define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64) -#define USE_WAVE_FRONT 1 // Use only top left area of frame for reference. static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile, int mib_size, int mi_row, int mi_col) { @@ -356,13 +344,12 @@ static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm, const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col; if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0; -#if USE_WAVE_FRONT + // Wavefront constraint: use only top left area of frame for reference. const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64); const int wf_offset = gradient * (active_sb_row - src_sb_row); if (src_sb_row > active_sb_row || src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset) return 0; -#endif return 1; } @@ -371,4 +358,4 @@ static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm, } // extern "C" #endif -#endif // AV1_COMMON_MVREF_COMMON_H_ +#endif // AOM_AV1_COMMON_MVREF_COMMON_H_ diff --git a/third_party/aom/av1/common/obmc.h b/third_party/aom/av1/common/obmc.h index 3918c82c6..1c90cd93f 100644 --- a/third_party/aom/av1/common/obmc.h +++ b/third_party/aom/av1/common/obmc.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_OBMC_H_ -#define AV1_COMMON_OBMC_H_ +#ifndef AOM_AV1_COMMON_OBMC_H_ +#define AOM_AV1_COMMON_OBMC_H_ typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_pos, uint8_t nb_mi_size, @@ -30,7 +30,7 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm, // prev_row_mi points into the mi array, starting at the beginning of the // previous row. MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride; - const int end_col = AOMMIN(mi_col + xd->n8_w, cm->mi_cols); + const int end_col = AOMMIN(mi_col + xd->n4_w, cm->mi_cols); uint8_t mi_step; for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max; above_mi_col += mi_step) { @@ -49,7 +49,7 @@ static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm, } if (is_neighbor_overlappable(*above_mi)) { ++nb_count; - fun(xd, above_mi_col - mi_col, AOMMIN(xd->n8_w, mi_step), *above_mi, + fun(xd, above_mi_col - mi_col, AOMMIN(xd->n4_w, mi_step), *above_mi, fun_ctxt, num_planes); } } @@ -68,7 +68,7 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm, // prev_col_mi points into the mi array, starting at the top of the // previous column MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride; - const int end_row = AOMMIN(mi_row + xd->n8_h, cm->mi_rows); + const int end_row = AOMMIN(mi_row + xd->n4_h, cm->mi_rows); uint8_t mi_step; for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max; left_mi_row += mi_step) { @@ -82,10 +82,10 @@ static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm, } if (is_neighbor_overlappable(*left_mi)) { ++nb_count; - fun(xd, left_mi_row - mi_row, AOMMIN(xd->n8_h, mi_step), *left_mi, + fun(xd, left_mi_row - mi_row, AOMMIN(xd->n4_h, mi_step), *left_mi, fun_ctxt, num_planes); } } } -#endif // AV1_COMMON_OBMC_H_ +#endif // AOM_AV1_COMMON_OBMC_H_ diff --git a/third_party/aom/av1/common/obu_util.c b/third_party/aom/av1/common/obu_util.c new file mode 100644 index 000000000..823b700b1 --- /dev/null +++ b/third_party/aom/av1/common/obu_util.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "av1/common/obu_util.h" + +#include "aom_dsp/bitreader_buffer.h" + +// Returns 1 when OBU type is valid, and 0 otherwise. +static int valid_obu_type(int obu_type) { + int valid_type = 0; + switch (obu_type) { + case OBU_SEQUENCE_HEADER: + case OBU_TEMPORAL_DELIMITER: + case OBU_FRAME_HEADER: + case OBU_TILE_GROUP: + case OBU_METADATA: + case OBU_FRAME: + case OBU_REDUNDANT_FRAME_HEADER: + case OBU_TILE_LIST: + case OBU_PADDING: valid_type = 1; break; + default: break; + } + return valid_type; +} + +static aom_codec_err_t read_obu_size(const uint8_t *data, + size_t bytes_available, + size_t *const obu_size, + size_t *const length_field_size) { + uint64_t u_obu_size = 0; + if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) != + 0) { + return AOM_CODEC_CORRUPT_FRAME; + } + + if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME; + *obu_size = (size_t)u_obu_size; + return AOM_CODEC_OK; +} + +// Parses OBU header and stores values in 'header'. +static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb, + int is_annexb, ObuHeader *header) { + if (!rb || !header) return AOM_CODEC_INVALID_PARAM; + + const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer; + if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME; + + header->size = 1; + + if (aom_rb_read_bit(rb) != 0) { + // Forbidden bit. Must not be set. + return AOM_CODEC_CORRUPT_FRAME; + } + + header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4); + + if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME; + + header->has_extension = aom_rb_read_bit(rb); + header->has_size_field = aom_rb_read_bit(rb); + + if (!header->has_size_field && !is_annexb) { + // section 5 obu streams must have obu_size field set. + return AOM_CODEC_UNSUP_BITSTREAM; + } + + if (aom_rb_read_bit(rb) != 0) { + // obu_reserved_1bit must be set to 0. + return AOM_CODEC_CORRUPT_FRAME; + } + + if (header->has_extension) { + if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME; + + header->size += 1; + header->temporal_layer_id = aom_rb_read_literal(rb, 3); + header->spatial_layer_id = aom_rb_read_literal(rb, 2); + if (aom_rb_read_literal(rb, 3) != 0) { + // extension_header_reserved_3bits must be set to 0. + return AOM_CODEC_CORRUPT_FRAME; + } + } + + return AOM_CODEC_OK; +} + +aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length, + size_t *consumed, ObuHeader *header, + int is_annexb) { + if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM; + + // TODO(tomfinegan): Set the error handler here and throughout this file, and + // confirm parsing work done via aom_read_bit_buffer is successful. + struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL, + NULL }; + aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header); + if (parse_result == AOM_CODEC_OK) *consumed = header->size; + return parse_result; +} + +aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, + size_t bytes_available, + int is_annexb, + ObuHeader *obu_header, + size_t *const payload_size, + size_t *const bytes_read) { + size_t length_field_size = 0, obu_size = 0; + aom_codec_err_t status; + + if (is_annexb) { + // Size field comes before the OBU header, and includes the OBU header + status = + read_obu_size(data, bytes_available, &obu_size, &length_field_size); + + if (status != AOM_CODEC_OK) return status; + } + + struct aom_read_bit_buffer rb = { data + length_field_size, + data + bytes_available, 0, NULL, NULL }; + + status = read_obu_header(&rb, is_annexb, obu_header); + if (status != AOM_CODEC_OK) return status; + + if (is_annexb) { + // Derive the payload size from the data we've already read + if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME; + + *payload_size = obu_size - obu_header->size; + } else { + // Size field comes after the OBU header, and is just the payload size + status = read_obu_size(data + obu_header->size, + bytes_available - obu_header->size, payload_size, + &length_field_size); + if (status != AOM_CODEC_OK) return status; + } + + *bytes_read = length_field_size + obu_header->size; + return AOM_CODEC_OK; +} diff --git a/third_party/aom/av1/common/obu_util.h b/third_party/aom/av1/common/obu_util.h new file mode 100644 index 000000000..7c56904c8 --- /dev/null +++ b/third_party/aom/av1/common/obu_util.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_OBU_UTIL_H_ +#define AOM_AV1_COMMON_OBU_UTIL_H_ + +#include "aom/aom_codec.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + size_t size; // Size (1 or 2 bytes) of the OBU header (including the + // optional OBU extension header) in the bitstream. + OBU_TYPE type; + int has_size_field; + int has_extension; + // The following fields come from the OBU extension header and therefore are + // only used if has_extension is true. + int temporal_layer_id; + int spatial_layer_id; +} ObuHeader; + +aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length, + size_t *consumed, ObuHeader *header, + int is_annexb); + +aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, + size_t bytes_available, + int is_annexb, + ObuHeader *obu_header, + size_t *const payload_size, + size_t *const bytes_read); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_OBU_UTIL_H_ diff --git a/third_party/aom/av1/common/odintrin.h b/third_party/aom/av1/common/odintrin.h index e87c5a0bf..e1db0f44d 100644 --- a/third_party/aom/av1/common/odintrin.h +++ b/third_party/aom/av1/common/odintrin.h @@ -11,8 +11,8 @@ /* clang-format off */ -#ifndef AV1_COMMON_ODINTRIN_H_ -#define AV1_COMMON_ODINTRIN_H_ +#ifndef AOM_AV1_COMMON_ODINTRIN_H_ +#define AOM_AV1_COMMON_ODINTRIN_H_ #include #include @@ -46,9 +46,9 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2]; #define OD_MAXI AOMMAX #define OD_CLAMPI(min, val, max) (OD_MAXI(min, OD_MINI(val, max))) -#define OD_CLZ0 (1) -#define OD_CLZ(x) (-get_msb(x)) -#define OD_ILOG_NZ(x) (OD_CLZ0 - OD_CLZ(x)) +/*Integer logarithm (base 2) of a nonzero unsigned 32-bit integer. + OD_ILOG_NZ(x) = (int)floor(log2(x)) + 1.*/ +#define OD_ILOG_NZ(x) (1 + get_msb(x)) /*Enable special features for gcc and compatible compilers.*/ #if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) @@ -93,4 +93,4 @@ extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2]; } // extern "C" #endif -#endif // AV1_COMMON_ODINTRIN_H_ +#endif // AOM_AV1_COMMON_ODINTRIN_H_ diff --git a/third_party/aom/av1/common/onyxc_int.h b/third_party/aom/av1/common/onyxc_int.h index 6b1bf2d74..ff011c89e 100644 --- a/third_party/aom/av1/common/onyxc_int.h +++ b/third_party/aom/av1/common/onyxc_int.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_ONYXC_INT_H_ -#define AV1_COMMON_ONYXC_INT_H_ +#ifndef AOM_AV1_COMMON_ONYXC_INT_H_ +#define AOM_AV1_COMMON_ONYXC_INT_H_ #include "config/aom_config.h" #include "config/av1_rtcd.h" @@ -480,6 +480,7 @@ typedef struct AV1Common { int byte_alignment; int skip_loop_filter; + int skip_film_grain; // Private data associated with the frame buffer callbacks. void *cb_priv; @@ -823,18 +824,18 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, xd->chroma_left_mbmi = chroma_left_mi; } - xd->n8_h = bh; - xd->n8_w = bw; + xd->n4_h = bh; + xd->n4_w = bw; xd->is_sec_rect = 0; - if (xd->n8_w < xd->n8_h) { + if (xd->n4_w < xd->n4_h) { // Only mark is_sec_rect as 1 for the last block. // For PARTITION_VERT_4, it would be (0, 0, 0, 1); // For other partitions, it would be (0, 1). - if (!((mi_col + xd->n8_w) & (xd->n8_h - 1))) xd->is_sec_rect = 1; + if (!((mi_col + xd->n4_w) & (xd->n4_h - 1))) xd->is_sec_rect = 1; } - if (xd->n8_w > xd->n8_h) - if (mi_row & (xd->n8_w - 1)) xd->is_sec_rect = 1; + if (xd->n4_w > xd->n4_h) + if (mi_row & (xd->n4_w - 1)) xd->is_sec_rect = 1; } static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx, @@ -1115,18 +1116,18 @@ static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) { for (i = 0; i < len; ++i) txfm_ctx[i] = txs; } -static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n8_w, int n8_h, int skip, +static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip, const MACROBLOCKD *xd) { uint8_t bw = tx_size_wide[tx_size]; uint8_t bh = tx_size_high[tx_size]; if (skip) { - bw = n8_w * MI_SIZE; - bh = n8_h * MI_SIZE; + bw = n4_w * MI_SIZE; + bh = n4_h * MI_SIZE; } - set_txfm_ctx(xd->above_txfm_context, bw, n8_w); - set_txfm_ctx(xd->left_txfm_context, bh, n8_h); + set_txfm_ctx(xd->above_txfm_context, bw, n4_w); + set_txfm_ctx(xd->left_txfm_context, bh, n4_h); } static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx, @@ -1338,4 +1339,4 @@ static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) { } // extern "C" #endif -#endif // AV1_COMMON_ONYXC_INT_H_ +#endif // AOM_AV1_COMMON_ONYXC_INT_H_ diff --git a/third_party/aom/av1/common/ppc/cfl_ppc.c b/third_party/aom/av1/common/ppc/cfl_ppc.c index 58933a7b3..026a07809 100644 --- a/third_party/aom/av1/common/ppc/cfl_ppc.c +++ b/third_party/aom/av1/common/ppc/cfl_ppc.c @@ -24,19 +24,21 @@ #define CFL_LINE_2 128 #define CFL_LINE_3 192 -typedef vector int8_t int8x16_t; -typedef vector uint8_t uint8x16_t; -typedef vector int16_t int16x8_t; -typedef vector uint16_t uint16x8_t; -typedef vector int32_t int32x4_t; -typedef vector uint32_t uint32x4_t; -typedef vector uint64_t uint64x2_t; +typedef vector signed char int8x16_t; // NOLINT(runtime/int) +typedef vector unsigned char uint8x16_t; // NOLINT(runtime/int) +typedef vector signed short int16x8_t; // NOLINT(runtime/int) +typedef vector unsigned short uint16x8_t; // NOLINT(runtime/int) +typedef vector signed int int32x4_t; // NOLINT(runtime/int) +typedef vector unsigned int uint32x4_t; // NOLINT(runtime/int) +typedef vector unsigned long long uint64x2_t; // NOLINT(runtime/int) -static INLINE void subtract_average_vsx(int16_t *pred_buf, int width, - int height, int round_offset, +static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst, + int width, int height, int round_offset, int num_pel_log2) { - const int16_t *end = pred_buf + height * CFL_BUF_LINE; - const int16_t *sum_buf = pred_buf; + // int16_t *dst = dst_ptr; + const int16_t *dst_end = dst + height * CFL_BUF_LINE; + const int16_t *sum_buf = (const int16_t *)src_ptr; + const int16_t *end = sum_buf + height * CFL_BUF_LINE; const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2); const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; @@ -71,43 +73,40 @@ static INLINE void subtract_average_vsx(int16_t *pred_buf, int width, const int32x4_t avg = vec_sr(sum_32x4, div_shift); const int16x8_t vec_avg = vec_pack(avg, avg); do { - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, pred_buf), vec_avg), OFF_0, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, pred_buf), vec_avg), - OFF_0 + CFL_BUF_LINE_BYTES, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, pred_buf), vec_avg), - OFF_0 + CFL_LINE_2, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, pred_buf), vec_avg), - OFF_0 + CFL_LINE_3, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, dst), vec_avg), OFF_0, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, dst), vec_avg), + OFF_0 + CFL_BUF_LINE_BYTES, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, dst), vec_avg), + OFF_0 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, dst), vec_avg), + OFF_0 + CFL_LINE_3, dst); if (width >= 16) { - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, pred_buf), vec_avg), OFF_1, - pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, pred_buf), vec_avg), - OFF_1 + CFL_LINE_1, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, pred_buf), vec_avg), - OFF_1 + CFL_LINE_2, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, pred_buf), vec_avg), - OFF_1 + CFL_LINE_3, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, dst), vec_avg), OFF_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, dst), vec_avg), + OFF_1 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, dst), vec_avg), + OFF_1 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, dst), vec_avg), + OFF_1 + CFL_LINE_3, dst); } if (width == 32) { - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, pred_buf), vec_avg), OFF_2, - pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, pred_buf), vec_avg), - OFF_2 + CFL_LINE_1, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, pred_buf), vec_avg), - OFF_2 + CFL_LINE_2, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, pred_buf), vec_avg), - OFF_2 + CFL_LINE_3, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, dst), vec_avg), OFF_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, dst), vec_avg), + OFF_2 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, dst), vec_avg), + OFF_2 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, dst), vec_avg), + OFF_2 + CFL_LINE_3, dst); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, pred_buf), vec_avg), OFF_3, - pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, pred_buf), vec_avg), - OFF_3 + CFL_LINE_1, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, pred_buf), vec_avg), - OFF_3 + CFL_LINE_2, pred_buf); - vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, pred_buf), vec_avg), - OFF_3 + CFL_LINE_3, pred_buf); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, dst), vec_avg), OFF_3, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, dst), vec_avg), + OFF_3 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, dst), vec_avg), + OFF_3 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, dst), vec_avg), + OFF_3 + CFL_LINE_3, dst); } - } while ((pred_buf += CFL_BUF_LINE * 4) < end); + } while ((dst += CFL_BUF_LINE * 4) < dst_end); } // Declare wrappers for VSX sizes diff --git a/third_party/aom/av1/common/pred_common.c b/third_party/aom/av1/common/pred_common.c index d77739d85..5952441d1 100644 --- a/third_party/aom/av1/common/pred_common.c +++ b/third_party/aom/av1/common/pred_common.c @@ -31,8 +31,8 @@ int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) { const MB_MODE_INFO *const mbmi = xd->mi[0]; const int ctx_offset = (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET; - MV_REFERENCE_FRAME ref_frame = - (dir < 2) ? mbmi->ref_frame[0] : mbmi->ref_frame[1]; + assert(dir == 0 || dir == 1); + const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; // Note: // The mode info data structure has a one element border above and to the // left of the entries corresponding to real macroblocks. diff --git a/third_party/aom/av1/common/pred_common.h b/third_party/aom/av1/common/pred_common.h index 6a835c467..6dba2322d 100644 --- a/third_party/aom/av1/common/pred_common.h +++ b/third_party/aom/av1/common/pred_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_PRED_COMMON_H_ -#define AV1_COMMON_PRED_COMMON_H_ +#ifndef AOM_AV1_COMMON_PRED_COMMON_H_ +#define AOM_AV1_COMMON_PRED_COMMON_H_ #include "av1/common/blockd.h" #include "av1/common/mvref_common.h" @@ -357,4 +357,4 @@ static INLINE int get_tx_size_context(const MACROBLOCKD *xd) { } // extern "C" #endif -#endif // AV1_COMMON_PRED_COMMON_H_ +#endif // AOM_AV1_COMMON_PRED_COMMON_H_ diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h index ca199e94c..d1f52a660 100644 --- a/third_party/aom/av1/common/quant_common.h +++ b/third_party/aom/av1/common/quant_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_QUANT_COMMON_H_ -#define AV1_COMMON_QUANT_COMMON_H_ +#ifndef AOM_AV1_COMMON_QUANT_COMMON_H_ +#define AOM_AV1_COMMON_QUANT_COMMON_H_ #include "aom/aom_codec.h" #include "av1/common/seg_common.h" @@ -60,4 +60,4 @@ const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp, } // extern "C" #endif -#endif // AV1_COMMON_QUANT_COMMON_H_ +#endif // AOM_AV1_COMMON_QUANT_COMMON_H_ diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c index b9f0b57f3..3203efce4 100644 --- a/third_party/aom/av1/common/reconinter.c +++ b/third_party/aom/av1/common/reconinter.c @@ -44,10 +44,9 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi, if (build_for_obmc) return 0; - if (warp_types->local_warp_allowed && !mbmi->wm_params[0].invalid) { + if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) { if (final_warp_params != NULL) - memcpy(final_warp_params, &mbmi->wm_params[0], - sizeof(*final_warp_params)); + memcpy(final_warp_params, &mbmi->wm_params, sizeof(*final_warp_params)); return 1; } else if (warp_types->global_warp_allowed && !gm_params->invalid) { if (final_warp_params != NULL) @@ -78,6 +77,9 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], build_for_obmc, subpel_params->xs, subpel_params->ys, &final_warp_params)); + const int is_intrabc = mi->use_intrabc; + assert(IMPLIES(is_intrabc, !do_warp)); + if (do_warp && xd->cur_frame_force_integer_mv == 0) { const struct macroblockd_plane *const pd = &xd->plane[plane]; const struct buf_2d *const pre_buf = &pd->pre[ref]; @@ -88,10 +90,11 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, pd->subsampling_x, pd->subsampling_y, conv_params); } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, - w, h, conv_params, interp_filters, xd->bd); + w, h, conv_params, interp_filters, is_intrabc, + xd->bd); } else { inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, w, h, - conv_params, interp_filters); + conv_params, interp_filters, is_intrabc); } } @@ -574,37 +577,6 @@ static void build_masked_compound_no_round( h, subw, subh, conv_params); } -static void build_masked_compound( - uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride, - const uint8_t *src1, int src1_stride, - const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, - int w) { - // Derive subsampling from h and w passed in. May be refactored to - // pass in subsampling factors directly. - const int subh = (2 << mi_size_high_log2[sb_type]) == h; - const int subw = (2 << mi_size_wide_log2[sb_type]) == w; - const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); - aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, block_size_wide[sb_type], w, h, subw, subh); -} - -static void build_masked_compound_highbd( - uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride, - const uint8_t *src1_8, int src1_stride, - const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, - int w, int bd) { - // Derive subsampling from h and w passed in. May be refactored to - // pass in subsampling factors directly. - const int subh = (2 << mi_size_high_log2[sb_type]) == h; - const int subw = (2 << mi_size_wide_log2[sb_type]) == w; - const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); - // const uint8_t *mask = - // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type); - aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8, - src1_stride, mask, block_size_wide[sb_type], w, h, - subw, subh, bd); -} - void av1_make_masked_inter_predictor( const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride, const SubpelParams *subpel_params, const struct scale_factors *sf, int w, @@ -653,63 +625,6 @@ void av1_make_masked_inter_predictor( mi->sb_type, h, w, conv_params, xd); } -// TODO(sarahparker) av1_highbd_build_inter_predictor and -// av1_build_inter_predictor should be combined with -// av1_make_inter_predictor -void av1_highbd_build_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, - InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col, - int p_row, int plane, enum mv_precision precision, int x, int y, - const MACROBLOCKD *xd, int can_use_previous) { - const int is_q4 = precision == MV_PRECISION_Q4; - const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, - is_q4 ? src_mv->col : src_mv->col * 2 }; - MV32 mv = av1_scale_mv(&mv_q4, x, y, sf); - mv.col += SCALE_EXTRA_OFF; - mv.row += SCALE_EXTRA_OFF; - const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, - mv.col & SCALE_SUBPEL_MASK, - mv.row & SCALE_SUBPEL_MASK }; - ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd); - - src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride + - (mv.col >> SCALE_SUBPEL_BITS); - - av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf, - w, h, &conv_params, interp_filters, warp_types, - p_col, p_row, plane, ref, xd->mi[0], 0, xd, - can_use_previous); -} - -void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, const MV *src_mv, - const struct scale_factors *sf, int w, int h, - ConvolveParams *conv_params, - InterpFilters interp_filters, - const WarpTypesAllowed *warp_types, int p_col, - int p_row, int plane, int ref, - enum mv_precision precision, int x, int y, - const MACROBLOCKD *xd, int can_use_previous) { - const int is_q4 = precision == MV_PRECISION_Q4; - const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, - is_q4 ? src_mv->col : src_mv->col * 2 }; - MV32 mv = av1_scale_mv(&mv_q4, x, y, sf); - mv.col += SCALE_EXTRA_OFF; - mv.row += SCALE_EXTRA_OFF; - - const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, - mv.col & SCALE_SUBPEL_MASK, - mv.row & SCALE_SUBPEL_MASK }; - src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride + - (mv.col >> SCALE_SUBPEL_BITS); - - av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf, - w, h, conv_params, interp_filters, warp_types, p_col, - p_row, plane, ref, xd->mi[0], 0, xd, - can_use_previous); -} - void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, int order_idx, int *fwd_offset, int *bck_offset, int *use_jnt_comp_avg, int is_compound) { @@ -759,279 +674,6 @@ void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order]; } -static INLINE void calc_subpel_params( - MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv, - int plane, const int pre_x, const int pre_y, int x, int y, - struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params, - int bw, int bh) { - struct macroblockd_plane *const pd = &xd->plane[plane]; - const int is_scaled = av1_is_scaled(sf); - if (is_scaled) { - int ssx = pd->subsampling_x; - int ssy = pd->subsampling_y; - int orig_pos_y = (pre_y + y) << SUBPEL_BITS; - orig_pos_y += mv.row * (1 << (1 - ssy)); - int orig_pos_x = (pre_x + x) << SUBPEL_BITS; - orig_pos_x += mv.col * (1 << (1 - ssx)); - int pos_y = sf->scale_value_y(orig_pos_y, sf); - int pos_x = sf->scale_value_x(orig_pos_x, sf); - pos_x += SCALE_EXTRA_OFF; - pos_y += SCALE_EXTRA_OFF; - - const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); - const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); - const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; - pos_y = clamp(pos_y, top, bottom); - pos_x = clamp(pos_x, left, right); - - *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + - (pos_x >> SCALE_SUBPEL_BITS); - subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK; - subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK; - subpel_params->xs = sf->x_step_q4; - subpel_params->ys = sf->y_step_q4; - } else { - const MV mv_q4 = clamp_mv_to_umv_border_sb( - xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); - subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS; - subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; - subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; - *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride + - (x + (mv_q4.col >> SUBPEL_BITS)); - } -} - -static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, - int plane, const MB_MODE_INFO *mi, - int build_for_obmc, int bw, int bh, - int mi_x, int mi_y) { - struct macroblockd_plane *const pd = &xd->plane[plane]; - int is_compound = has_second_ref(mi); - int ref; - const int is_intrabc = is_intrabc_block(mi); - assert(IMPLIES(is_intrabc, !is_compound)); - int is_global[2] = { 0, 0 }; - for (ref = 0; ref < 1 + is_compound; ++ref) { - const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; - is_global[ref] = is_global_mv_block(mi, wm->wmtype); - } - - const BLOCK_SIZE bsize = mi->sb_type; - const int ss_x = pd->subsampling_x; - const int ss_y = pd->subsampling_y; - int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) || - (block_size_high[bsize] < 8 && ss_y); - - if (is_intrabc) sub8x8_inter = 0; - - // For sub8x8 chroma blocks, we may be covering more than one luma block's - // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for - // the top-left corner of the prediction source - the correct top-left corner - // is at (pre_x, pre_y). - const int row_start = - (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0; - const int col_start = - (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0; - const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; - const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; - - sub8x8_inter = sub8x8_inter && !build_for_obmc; - if (sub8x8_inter) { - for (int row = row_start; row <= 0 && sub8x8_inter; ++row) { - for (int col = col_start; col <= 0; ++col) { - const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; - if (!is_inter_block(this_mbmi)) sub8x8_inter = 0; - if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0; - } - } - } - - if (sub8x8_inter) { - // block size - const int b4_w = block_size_wide[bsize] >> ss_x; - const int b4_h = block_size_high[bsize] >> ss_y; - const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y); - const int b8_w = block_size_wide[plane_bsize] >> ss_x; - const int b8_h = block_size_high[plane_bsize] >> ss_y; - assert(!is_compound); - - const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] }; - - int row = row_start; - for (int y = 0; y < b8_h; y += b4_h) { - int col = col_start; - for (int x = 0; x < b8_w; x += b4_w) { - MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; - is_compound = has_second_ref(this_mbmi); - DECLARE_ALIGNED(32, CONV_BUF_TYPE, tmp_dst[8 * 8]); - int tmp_dst_stride = 8; - assert(bw < 8 || bh < 8); - ConvolveParams conv_params = get_conv_params_no_round( - 0, 0, plane, tmp_dst, tmp_dst_stride, is_compound, xd->bd); - conv_params.use_jnt_comp_avg = 0; - struct buf_2d *const dst_buf = &pd->dst; - uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x; - - ref = 0; - const RefBuffer *ref_buf = - &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME]; - - pd->pre[ref].buf0 = - (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer; - pd->pre[ref].buf = - pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y, - ref_buf->buf->uv_stride, - &ref_buf->sf); - pd->pre[ref].width = ref_buf->buf->uv_crop_width; - pd->pre[ref].height = ref_buf->buf->uv_crop_height; - pd->pre[ref].stride = ref_buf->buf->uv_stride; - - const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &ref_buf->sf; - struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; - - const MV mv = this_mbmi->mv[ref].as_mv; - - uint8_t *pre; - SubpelParams subpel_params; - WarpTypesAllowed warp_types; - warp_types.global_warp_allowed = is_global[ref]; - warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL; - - calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre, - &subpel_params, bw, bh); - - conv_params.ref = ref; - conv_params.do_average = ref; - if (is_masked_compound_type(mi->interinter_comp.type)) { - // masked compound type has its own average mechanism - conv_params.do_average = 0; - } - - av1_make_inter_predictor( - pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, - b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types, - (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y, - plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion); - - ++col; - } - ++row; - } - - for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref]; - return; - } - - { - DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]); - ConvolveParams conv_params = get_conv_params_no_round( - 0, 0, plane, tmp_dst, MAX_SB_SIZE, is_compound, xd->bd); - av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset, - &conv_params.bck_offset, - &conv_params.use_jnt_comp_avg, is_compound); - - struct buf_2d *const dst_buf = &pd->dst; - uint8_t *const dst = dst_buf->buf; - for (ref = 0; ref < 1 + is_compound; ++ref) { - const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf; - struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; - const MV mv = mi->mv[ref].as_mv; - - uint8_t *pre; - SubpelParams subpel_params; - calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre, - &subpel_params, bw, bh); - - WarpTypesAllowed warp_types; - warp_types.global_warp_allowed = is_global[ref]; - warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; - conv_params.ref = ref; - - if (ref && is_masked_compound_type(mi->interinter_comp.type)) { - // masked compound type has its own average mechanism - conv_params.do_average = 0; - av1_make_masked_inter_predictor( - pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw, - bh, &conv_params, mi->interp_filters, plane, &warp_types, - mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd, - cm->allow_warped_motion); - } else { - conv_params.do_average = ref; - av1_make_inter_predictor( - pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw, - bh, &conv_params, mi->interp_filters, &warp_types, - mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref, - mi, build_for_obmc, xd, cm->allow_warped_motion); - } - } - } -} - -static void build_inter_predictors_for_planes(const AV1_COMMON *cm, - MACROBLOCKD *xd, BLOCK_SIZE bsize, - int mi_row, int mi_col, - int plane_from, int plane_to) { - int plane; - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - for (plane = plane_from; plane <= plane_to; ++plane) { - const struct macroblockd_plane *pd = &xd->plane[plane]; - const int bw = pd->width; - const int bh = pd->height; - - if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, - pd->subsampling_y)) - continue; - - build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y); - } -} - -void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize) { - build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0); - - if (is_interintra_pred(xd->mi[0])) { - BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL }, - { xd->plane[0].dst.stride, 0, 0 } }; - if (!ctx) ctx = &default_ctx; - av1_build_interintra_predictors_sbp(cm, xd, xd->plane[0].dst.buf, - xd->plane[0].dst.stride, ctx, 0, bsize); - } -} - -void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize) { - build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1, - MAX_MB_PLANE - 1); - - if (is_interintra_pred(xd->mi[0])) { - BUFFER_SET default_ctx = { - { NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf }, - { 0, xd->plane[1].dst.stride, xd->plane[2].dst.stride } - }; - if (!ctx) ctx = &default_ctx; - av1_build_interintra_predictors_sbuv( - cm, xd, xd->plane[1].dst.buf, xd->plane[2].dst.buf, - xd->plane[1].dst.stride, xd->plane[2].dst.stride, ctx, bsize); - } -} - -void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize) { - const int num_planes = av1_num_planes(cm); - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); - if (num_planes > 1) - av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize); -} - void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, const int plane_start, const int plane_end) { @@ -1292,63 +934,7 @@ void av1_setup_build_prediction_by_above_pred( xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col); xd->mb_to_right_edge = ctxt->mb_to_far_edge + - (xd->n8_w - rel_mi_col - above_mi_width) * MI_SIZE * 8; -} - -static INLINE void build_prediction_by_above_pred( - MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, - MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) { - struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; - const int above_mi_col = ctxt->mi_col + rel_mi_col; - int mi_x, mi_y; - MB_MODE_INFO backup_mbmi = *above_mbmi; - - av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width, - above_mbmi, ctxt, num_planes); - mi_x = above_mi_col << MI_SIZE_LOG2; - mi_y = ctxt->mi_row << MI_SIZE_LOG2; - - const BLOCK_SIZE bsize = xd->mi[0]->sb_type; - - for (int j = 0; j < num_planes; ++j) { - const struct macroblockd_plane *pd = &xd->plane[j]; - int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x; - int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4, - block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1)); - - if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; - build_inter_predictors(ctxt->cm, xd, j, above_mbmi, 1, bw, bh, mi_x, mi_y); - } - *above_mbmi = backup_mbmi; -} - -void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, - uint8_t *tmp_buf[MAX_MB_PLANE], - int tmp_width[MAX_MB_PLANE], - int tmp_height[MAX_MB_PLANE], - int tmp_stride[MAX_MB_PLANE]) { - if (!xd->up_available) return; - - // Adjust mb_to_bottom_edge to have the correct value for the OBMC - // prediction block. This is half the height of the original block, - // except for 128-wide blocks, where we only use a height of 32. - int this_height = xd->n8_h * MI_SIZE; - int pred_height = AOMMIN(this_height / 2, 32); - xd->mb_to_bottom_edge += (this_height - pred_height) * 8; - - struct build_prediction_ctxt ctxt = { cm, mi_row, - mi_col, tmp_buf, - tmp_width, tmp_height, - tmp_stride, xd->mb_to_right_edge }; - BLOCK_SIZE bsize = xd->mi[0]->sb_type; - foreach_overlappable_nb_above(cm, xd, mi_col, - max_neighbor_obmc[mi_size_wide_log2[bsize]], - build_prediction_by_above_pred, &ctxt); - - xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); - xd->mb_to_right_edge = ctxt.mb_to_far_edge; - xd->mb_to_bottom_edge -= (this_height - pred_height) * 8; + (xd->n4_w - rel_mi_col - above_mi_width) * MI_SIZE * 8; } void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, @@ -1386,101 +972,7 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row); xd->mb_to_bottom_edge = ctxt->mb_to_far_edge + - (xd->n8_h - rel_mi_row - left_mi_height) * MI_SIZE * 8; -} - -static INLINE void build_prediction_by_left_pred( - MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height, - MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) { - struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; - const int left_mi_row = ctxt->mi_row + rel_mi_row; - int mi_x, mi_y; - MB_MODE_INFO backup_mbmi = *left_mbmi; - - av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height, - left_mbmi, ctxt, num_planes); - mi_x = ctxt->mi_col << MI_SIZE_LOG2; - mi_y = left_mi_row << MI_SIZE_LOG2; - const BLOCK_SIZE bsize = xd->mi[0]->sb_type; - - for (int j = 0; j < num_planes; ++j) { - const struct macroblockd_plane *pd = &xd->plane[j]; - int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4, - block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1)); - int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y; - - if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; - build_inter_predictors(ctxt->cm, xd, j, left_mbmi, 1, bw, bh, mi_x, mi_y); - } - *left_mbmi = backup_mbmi; -} - -void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, - uint8_t *tmp_buf[MAX_MB_PLANE], - int tmp_width[MAX_MB_PLANE], - int tmp_height[MAX_MB_PLANE], - int tmp_stride[MAX_MB_PLANE]) { - if (!xd->left_available) return; - - // Adjust mb_to_right_edge to have the correct value for the OBMC - // prediction block. This is half the width of the original block, - // except for 128-wide blocks, where we only use a width of 32. - int this_width = xd->n8_w * MI_SIZE; - int pred_width = AOMMIN(this_width / 2, 32); - xd->mb_to_right_edge += (this_width - pred_width) * 8; - - struct build_prediction_ctxt ctxt = { cm, mi_row, - mi_col, tmp_buf, - tmp_width, tmp_height, - tmp_stride, xd->mb_to_bottom_edge }; - BLOCK_SIZE bsize = xd->mi[0]->sb_type; - foreach_overlappable_nb_left(cm, xd, mi_row, - max_neighbor_obmc[mi_size_high_log2[bsize]], - build_prediction_by_left_pred, &ctxt); - - xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); - xd->mb_to_right_edge -= (this_width - pred_width) * 8; - xd->mb_to_bottom_edge = ctxt.mb_to_far_edge; -} - -void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col) { - const int num_planes = av1_num_planes(cm); - DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); - uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; - int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - int len = sizeof(uint16_t); - dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1); - dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len); - dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len); - dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2); - dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len); - dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len); - } else { - dst_buf1[0] = tmp_buf1; - dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE; - dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2; - dst_buf2[0] = tmp_buf2; - dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE; - dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2; - } - av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1, - dst_width1, dst_height1, dst_stride1); - av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2, - dst_width2, dst_height2, dst_stride2); - av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm), - mi_row, mi_col, 0, num_planes); - av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1, - dst_buf2, dst_stride2); + (xd->n4_h - rel_mi_row - left_mi_height) * MI_SIZE * 8; } /* clang-format off */ @@ -1668,127 +1160,3 @@ void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize); av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize); } - -void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t *ypred, uint8_t *upred, - uint8_t *vpred, int ystride, int ustride, - int vstride, BUFFER_SET *ctx, - BLOCK_SIZE bsize) { - av1_build_interintra_predictors_sbp(cm, xd, ypred, ystride, ctx, 0, bsize); - av1_build_interintra_predictors_sbuv(cm, xd, upred, vpred, ustride, vstride, - ctx, bsize); -} - -// Builds the inter-predictor for the single ref case -// for use in the encoder to search the wedges efficiently. -static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane, - int bw, int bh, int x, int y, - int w, int h, int mi_x, int mi_y, - int ref, uint8_t *const ext_dst, - int ext_dst_stride, - int can_use_previous) { - struct macroblockd_plane *const pd = &xd->plane[plane]; - const MB_MODE_INFO *mi = xd->mi[0]; - - const struct scale_factors *const sf = &xd->block_refs[ref]->sf; - struct buf_2d *const pre_buf = &pd->pre[ref]; - uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x; - const MV mv = mi->mv[ref].as_mv; - - ConvolveParams conv_params = get_conv_params(ref, 0, plane, xd->bd); - WarpTypesAllowed warp_types; - const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; - warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype); - warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; - const int pre_x = (mi_x) >> pd->subsampling_x; - const int pre_y = (mi_y) >> pd->subsampling_y; - uint8_t *pre; - SubpelParams subpel_params; - calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre, - &subpel_params, bw, bh); - - av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride, - &subpel_params, sf, w, h, &conv_params, - mi->interp_filters, &warp_types, pre_x + x, - pre_y + y, plane, ref, mi, 0, xd, can_use_previous); -} - -void av1_build_inter_predictors_for_planes_single_buf( - MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row, - int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3], - int can_use_previous) { - int plane; - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - for (plane = plane_from; plane <= plane_to; ++plane) { - const BLOCK_SIZE plane_bsize = get_plane_block_size( - bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); - const int bw = block_size_wide[plane_bsize]; - const int bh = block_size_high[plane_bsize]; - build_inter_predictors_single_buf(xd, plane, bw, bh, 0, 0, bw, bh, mi_x, - mi_y, ref, ext_dst[plane], - ext_dst_stride[plane], can_use_previous); - } -} - -static void build_wedge_inter_predictor_from_buf( - MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0, - int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) { - MB_MODE_INFO *const mbmi = xd->mi[0]; - const int is_compound = has_second_ref(mbmi); - MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; - struct buf_2d *const dst_buf = &pd->dst; - uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; - mbmi->interinter_comp.seg_mask = xd->seg_mask; - const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp; - - if (is_compound && is_masked_compound_type(comp_data->type)) { - if (!plane && comp_data->type == COMPOUND_DIFFWTD) { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - av1_build_compound_diffwtd_mask_highbd( - comp_data->seg_mask, comp_data->mask_type, - CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, - CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd); - else - av1_build_compound_diffwtd_mask( - comp_data->seg_mask, comp_data->mask_type, ext_dst0, - ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w); - } - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - build_masked_compound_highbd( - dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, - CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, - mbmi->sb_type, h, w, xd->bd); - else - build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, - ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type, - h, w); - } else { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, - dst, dst_buf->stride, NULL, 0, NULL, 0, w, h, - xd->bd); - else - aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL, - 0, NULL, 0, w, h); - } -} - -void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, - int plane_from, int plane_to, - uint8_t *ext_dst0[3], - int ext_dst_stride0[3], - uint8_t *ext_dst1[3], - int ext_dst_stride1[3]) { - int plane; - for (plane = plane_from; plane <= plane_to; ++plane) { - const BLOCK_SIZE plane_bsize = get_plane_block_size( - bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); - const int bw = block_size_wide[plane_bsize]; - const int bh = block_size_high[plane_bsize]; - build_wedge_inter_predictor_from_buf( - xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane], - ext_dst1[plane], ext_dst_stride1[plane]); - } -} diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h index 6a3def270..db86c777e 100644 --- a/third_party/aom/av1/common/reconinter.h +++ b/third_party/aom/av1/common/reconinter.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_RECONINTER_H_ -#define AV1_COMMON_RECONINTER_H_ +#ifndef AOM_AV1_COMMON_RECONINTER_H_ +#define AOM_AV1_COMMON_RECONINTER_H_ #include "av1/common/filter.h" #include "av1/common/onyxc_int.h" @@ -113,40 +113,48 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride, const SubpelParams *subpel_params, const struct scale_factors *sf, int w, int h, ConvolveParams *conv_params, - InterpFilters interp_filters) { + InterpFilters interp_filters, + int is_intrabc) { assert(conv_params->do_average == 0 || conv_params->do_average == 1); assert(sf); - if (has_scale(subpel_params->xs, subpel_params->ys)) { + const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); + assert(IMPLIES(is_intrabc, !is_scaled)); + if (is_scaled) { av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y, - subpel_params->ys, 1, conv_params, sf); + subpel_params->ys, 1, conv_params, sf, is_intrabc); } else { SubpelParams sp = *subpel_params; revert_scale_extra_bits(&sp); av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x, sp.xs, sp.subpel_y, - sp.ys, 0, conv_params, sf); + sp.ys, 0, conv_params, sf, is_intrabc); } } -static INLINE void highbd_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const SubpelParams *subpel_params, const struct scale_factors *sf, int w, - int h, ConvolveParams *conv_params, InterpFilters interp_filters, int bd) { +static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const SubpelParams *subpel_params, + const struct scale_factors *sf, int w, + int h, ConvolveParams *conv_params, + InterpFilters interp_filters, + int is_intrabc, int bd) { assert(conv_params->do_average == 0 || conv_params->do_average == 1); assert(sf); - if (has_scale(subpel_params->xs, subpel_params->ys)) { - av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, - interp_filters, subpel_params->subpel_x, - subpel_params->xs, subpel_params->subpel_y, - subpel_params->ys, 1, conv_params, sf, bd); + const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); + assert(IMPLIES(is_intrabc, !is_scaled)); + if (is_scaled) { + av1_highbd_convolve_2d_facade( + src, src_stride, dst, dst_stride, w, h, interp_filters, + subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y, + subpel_params->ys, 1, conv_params, sf, is_intrabc, bd); } else { SubpelParams sp = *subpel_params; revert_scale_extra_bits(&sp); - av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, - interp_filters, sp.subpel_x, sp.xs, - sp.subpel_y, sp.ys, 0, conv_params, sf, bd); + av1_highbd_convolve_2d_facade( + src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x, + sp.xs, sp.subpel_y, sp.ys, 0, conv_params, sf, is_intrabc, bd); } } @@ -237,35 +245,6 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, return clamped_mv; } -void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize); - -void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize); - -void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize); - -void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, const MV *src_mv, - const struct scale_factors *sf, int w, int h, - ConvolveParams *conv_params, - InterpFilters interp_filters, - const WarpTypesAllowed *warp_types, int p_col, - int p_row, int plane, int ref, - enum mv_precision precision, int x, int y, - const MACROBLOCKD *xd, int can_use_previous); - -void av1_highbd_build_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg, - InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col, - int p_row, int plane, enum mv_precision precision, int x, int y, - const MACROBLOCKD *xd, int can_use_previous); - static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, const struct scale_factors *sf) { const int x = @@ -303,32 +282,6 @@ void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, const struct scale_factors *sf, const int num_planes); -// Detect if the block have sub-pixel level motion vectors -// per component. -#define CHECK_SUBPEL 0 -static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi, - const MACROBLOCKD *const xd, - int dir) { -#if CHECK_SUBPEL - const BLOCK_SIZE bsize = mbmi->sb_type; - int plane; - int ref = (dir >> 1); - - if (dir & 0x01) { - if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1; - } else { - if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1; - } - - return 0; -#else - (void)mbmi; - (void)xd; - (void)dir; - return 1; -#endif -} - static INLINE void set_default_interp_filters( MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) { mbmi->interp_filters = @@ -343,21 +296,6 @@ static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) { return 1; } -static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) { - MB_MODE_INFO *const mi = xd->mi[0]; - const int is_compound = has_second_ref(mi); - int ref; - for (ref = 0; ref < 1 + is_compound; ++ref) { - int row_col; - for (row_col = 0; row_col < 2; ++row_col) { - const int dir = (ref << 1) + row_col; - if (has_subpel_mv_component(mi, xd, dir)) { - return 1; - } - } - } - return 0; -} void av1_setup_build_prediction_by_above_pred( MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, @@ -367,18 +305,6 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, MB_MODE_INFO *left_mbmi, struct build_prediction_ctxt *ctxt, const int num_planes); -void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, - uint8_t *tmp_buf[MAX_MB_PLANE], - int tmp_width[MAX_MB_PLANE], - int tmp_height[MAX_MB_PLANE], - int tmp_stride[MAX_MB_PLANE]); -void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, - uint8_t *tmp_buf[MAX_MB_PLANE], - int tmp_width[MAX_MB_PLANE], - int tmp_height[MAX_MB_PLANE], - int tmp_stride[MAX_MB_PLANE]); void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, uint8_t *above[MAX_MB_PLANE], @@ -389,8 +315,6 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, const uint8_t *av1_get_obmc_mask(int length); void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col); -void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col); #define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1) #define MASK_MASTER_STRIDE (MASK_MASTER_SIZE) @@ -406,12 +330,6 @@ static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index, const uint8_t *av1_get_compound_type_mask( const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type); -void av1_build_interintra_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, - uint8_t *ypred, uint8_t *upred, - uint8_t *vpred, int ystride, int ustride, - int vstride, BUFFER_SET *ctx, - BLOCK_SIZE bsize); - // build interintra_predictors for one plane void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *pred, int stride, @@ -431,18 +349,6 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, const uint8_t *inter_pred, int inter_stride, const uint8_t *intra_pred, int intra_stride); -// Encoder only -void av1_build_inter_predictors_for_planes_single_buf( - MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row, - int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3], - int can_use_previous); -void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, - int plane_from, int plane_to, - uint8_t *ext_dst0[3], - int ext_dst_stride0[3], - uint8_t *ext_dst1[3], - int ext_dst_stride1[3]); - void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, int order_idx, int *fwd_offset, int *bck_offset, int *use_jnt_comp_avg, int is_compound); @@ -456,4 +362,4 @@ int av1_allow_warp(const MB_MODE_INFO *const mbmi, } // extern "C" #endif -#endif // AV1_COMMON_RECONINTER_H_ +#endif // AOM_AV1_COMMON_RECONINTER_H_ diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h index 57638f24e..07853aba0 100644 --- a/third_party/aom/av1/common/reconintra.h +++ b/third_party/aom/av1/common/reconintra.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_RECONINTRA_H_ -#define AV1_COMMON_RECONINTRA_H_ +#ifndef AOM_AV1_COMMON_RECONINTRA_H_ +#define AOM_AV1_COMMON_RECONINTRA_H_ #include @@ -116,4 +116,4 @@ static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta, #ifdef __cplusplus } // extern "C" #endif -#endif // AV1_COMMON_RECONINTRA_H_ +#endif // AOM_AV1_COMMON_RECONINTRA_H_ diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c index 93d62292a..d61a20aa2 100644 --- a/third_party/aom/av1/common/resize.c +++ b/third_party/aom/av1/common/resize.c @@ -170,42 +170,6 @@ static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = { { -1, 3, -9, 17, 112, 10, -7, 3 }, { -1, 3, -8, 15, 112, 12, -7, 2 }, }; -// Filters for interpolation (full-band) - no filtering for integer pixels -static const InterpKernel filteredinterp_filters1000[(1 << RS_SUBPEL_BITS)] = { - { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 }, - { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 }, - { 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 }, - { -1, 2, -8, 125, 13, -5, 2, 0 }, { -1, 3, -9, 124, 15, -6, 2, 0 }, - { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 3, -11, 122, 20, -7, 3, -1 }, - { -1, 4, -12, 121, 22, -8, 3, -1 }, { -1, 4, -13, 120, 25, -9, 3, -1 }, - { -1, 4, -14, 118, 28, -9, 3, -1 }, { -1, 4, -15, 117, 30, -10, 4, -1 }, - { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 }, - { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 }, - { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 }, - { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 }, - { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 }, - { -1, 6, -20, 97, 58, -17, 6, -1 }, { -1, 6, -20, 95, 61, -18, 6, -1 }, - { -2, 7, -20, 93, 64, -18, 6, -2 }, { -2, 7, -20, 91, 66, -19, 6, -1 }, - { -2, 7, -20, 88, 69, -19, 6, -1 }, { -2, 7, -20, 86, 71, -19, 6, -1 }, - { -2, 7, -20, 84, 74, -20, 7, -2 }, { -2, 7, -20, 81, 76, -20, 7, -1 }, - { -2, 7, -20, 79, 79, -20, 7, -2 }, { -1, 7, -20, 76, 81, -20, 7, -2 }, - { -2, 7, -20, 74, 84, -20, 7, -2 }, { -1, 6, -19, 71, 86, -20, 7, -2 }, - { -1, 6, -19, 69, 88, -20, 7, -2 }, { -1, 6, -19, 66, 91, -20, 7, -2 }, - { -2, 6, -18, 64, 93, -20, 7, -2 }, { -1, 6, -18, 61, 95, -20, 6, -1 }, - { -1, 6, -17, 58, 97, -20, 6, -1 }, { -1, 6, -17, 56, 99, -20, 6, -1 }, - { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 }, - { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 }, - { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 }, - { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 }, - { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 }, - { -1, 3, -9, 28, 118, -14, 4, -1 }, { -1, 3, -9, 25, 120, -13, 4, -1 }, - { -1, 3, -8, 22, 121, -12, 4, -1 }, { -1, 3, -7, 20, 122, -11, 3, -1 }, - { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 2, -6, 15, 124, -9, 3, -1 }, - { 0, 2, -5, 13, 125, -8, 2, -1 }, { 0, 1, -4, 11, 125, -7, 2, 0 }, - { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -3, 6, 127, -4, 1, 0 }, - { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 }, -}; - const int16_t av1_resize_filter_normative[( 1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = { #if UPSCALE_NORMATIVE_TAPS == 8 @@ -246,6 +210,9 @@ const int16_t av1_resize_filter_normative[( #endif // UPSCALE_NORMATIVE_TAPS == 8 }; +// Filters for interpolation (full-band) - no filtering for integer pixels +#define filteredinterp_filters1000 av1_resize_filter_normative + // Filters for factor of 2 downsampling. static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h index feec3a90e..9a59a8d63 100644 --- a/third_party/aom/av1/common/resize.h +++ b/third_party/aom/av1/common/resize.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_ENCODER_RESIZE_H_ -#define AV1_ENCODER_RESIZE_H_ +#ifndef AOM_AV1_COMMON_RESIZE_H_ +#define AOM_AV1_COMMON_RESIZE_H_ #include #include "aom/aom_integer.h" @@ -109,4 +109,4 @@ int32_t av1_get_upscale_convolve_step(int in_length, int out_length); } // extern "C" #endif -#endif // AV1_ENCODER_RESIZE_H_ +#endif // AOM_AV1_COMMON_RESIZE_H_ diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c index 632967957..d276a915b 100644 --- a/third_party/aom/av1/common/restoration.c +++ b/third_party/aom/av1/common/restoration.c @@ -661,9 +661,10 @@ const int32_t one_by_x[MAX_NELEM] = { 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, }; -static void selfguided_restoration_fast_internal( - int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst, - int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { +static void calculate_intermediate_result(int32_t *dgd, int width, int height, + int dgd_stride, int bit_depth, + int sgr_params_idx, int radius_idx, + int pass, int32_t *A, int32_t *B) { const sgr_params_type *const params = &sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; @@ -673,10 +674,7 @@ static void selfguided_restoration_fast_internal( // We also align the stride to a multiple of 16 bytes, for consistency // with the SIMD version of this function. int buf_stride = ((width_ext + 3) & ~3) + 16; - int32_t A_[RESTORATION_PROC_UNIT_PELS]; - int32_t B_[RESTORATION_PROC_UNIT_PELS]; - int32_t *A = A_; - int32_t *B = B_; + const int step = pass == 0 ? 1 : 2; int i, j; assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); @@ -691,7 +689,7 @@ static void selfguided_restoration_fast_internal( B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie, // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[]. - for (i = -1; i < height + 1; i += 2) { + for (i = -1; i < height + 1; i += step) { for (j = -1; j < width + 1; ++j) { const int k = i * buf_stride + j; const int n = (2 * r + 1) * (2 * r + 1); @@ -754,7 +752,31 @@ static void selfguided_restoration_fast_internal( SGRPROJ_RECIP_BITS); } } +} + +static void selfguided_restoration_fast_internal( + int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes, for consistency + // with the SIMD version of this function. + int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; + int i, j; + calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth, + sgr_params_idx, radius_idx, 1, A, B); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + // Use the A[] and B[] arrays to calculate the filtered image + (void)r; assert(r == 2); for (i = 0; i < height; ++i) { if (!(i & 1)) { // even row @@ -796,10 +818,7 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height, int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; - const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; - const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; // Adjusting the stride of A and B here appears to avoid bad cache effects, // leading to a significant speed improvement. // We also align the stride to a multiple of 16 bytes, for consistency @@ -810,82 +829,11 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height, int32_t *A = A_; int32_t *B = B_; int i, j; - - assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); - assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && - "Need SGRPROJ_BORDER_* >= r+1"); - - boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, - width_ext, height_ext, dgd_stride, r, 0, B, buf_stride); - boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, - width_ext, height_ext, dgd_stride, r, 1, A, buf_stride); + calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth, + sgr_params_idx, radius_idx, 0, A, B); A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; - // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie, - // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[]. - for (i = -1; i < height + 1; ++i) { - for (j = -1; j < width + 1; ++j) { - const int k = i * buf_stride + j; - const int n = (2 * r + 1) * (2 * r + 1); - - // a < 2^16 * n < 2^22 regardless of bit depth - uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8)); - // b < 2^8 * n < 2^14 regardless of bit depth - uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8); - - // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28, - // and p itself satisfies p < 2^14 * n^2 < 2^26. - // This bound on p is due to: - // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances - // - // Note: Sometimes, in high bit depth, we can end up with a*n < b*b. - // This is an artefact of rounding, and can only happen if all pixels - // are (almost) identical, so in this case we saturate to p=0. - uint32_t p = (a * n < b * b) ? 0 : a * n - b * b; - - const uint32_t s = params->s[radius_idx]; - - // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32 - // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12 - // (this holds even after accounting for the rounding in s) - const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS); - - // Note: We have to be quite careful about the value of A[k]. - // This is used as a blend factor between individual pixel values and the - // local mean. So it logically has a range of [0, 256], including both - // endpoints. - // - // This is a pain for hardware, as we'd like something which can be stored - // in exactly 8 bits. - // Further, in the calculation of B[k] below, if z == 0 and r == 2, - // then A[k] "should be" 0. But then we can end up setting B[k] to a value - // slightly above 2^(8 + bit depth), due to rounding in the value of - // one_by_x[25-1]. - // - // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0. - // This fixes the above issues (256 - A[k] fits in a uint8, and we can't - // overflow), without significantly affecting the final result: z == 0 - // implies that the image is essentially "flat", so the local mean and - // individual pixel values are very similar. - // - // Note that saturating on the other side, ie. requring A[k] <= 255, - // would be a bad idea, as that corresponds to the case where the image - // is very variable, when we want to preserve the local pixel value as - // much as possible. - A[k] = x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256] - // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n, - // one_by_x[n - 1] = round(2^12 / n) - // => the product here is < 2^(20 + bit_depth) <= 2^32, - // and B[k] is set to a value < 2^(8 + bit depth) - // This holds even with the rounding in one_by_x and in the overall - // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8. - B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) * - (uint32_t)B[k] * - (uint32_t)one_by_x[n - 1], - SGRPROJ_RECIP_BITS); - } - } // Use the A[] and B[] arrays to calculate the filtered image for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { @@ -911,10 +859,10 @@ static void selfguided_restoration_internal(int32_t *dgd, int width, int height, } } -void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, - int dgd_stride, int32_t *flt0, int32_t *flt1, - int flt_stride, int sgr_params_idx, - int bit_depth, int highbd) { +int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, + int dgd_stride, int32_t *flt0, int32_t *flt1, + int flt_stride, int sgr_params_idx, + int bit_depth, int highbd) { int32_t dgd32_[RESTORATION_PROC_UNIT_PELS]; const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ; int32_t *dgd32 = @@ -948,6 +896,7 @@ void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, if (params->r[1] > 0) selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1, flt_stride, bit_depth, sgr_params_idx, 1); + return 0; } void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height, @@ -959,8 +908,10 @@ void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height, int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); - av1_selfguided_restoration_c(dat8, width, height, stride, flt0, flt1, width, - eps, bit_depth, highbd); + const int ret = av1_selfguided_restoration_c( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + (void)ret; + assert(!ret); const sgr_params_type *const params = &sgr_params[eps]; int xq[2]; decode_xq(xqd, xq, params); diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h index aec37d834..d834f9270 100644 --- a/third_party/aom/av1/common/restoration.h +++ b/third_party/aom/av1/common/restoration.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_RESTORATION_H_ -#define AV1_COMMON_RESTORATION_H_ +#ifndef AOM_AV1_COMMON_RESTORATION_H_ +#define AOM_AV1_COMMON_RESTORATION_H_ #include "aom_ports/mem.h" #include "config/aom_config.h" @@ -120,6 +120,7 @@ extern "C" { // If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for // chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN. #define WIENER_WIN_CHROMA (WIENER_WIN - 2) +#define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA)) #define WIENER_FILT_PREC_BITS 7 #define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS) @@ -373,4 +374,4 @@ void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c, } // extern "C" #endif -#endif // AV1_COMMON_RESTORATION_H_ +#endif // AOM_AV1_COMMON_RESTORATION_H_ diff --git a/third_party/aom/av1/common/scale.h b/third_party/aom/av1/common/scale.h index 5f02fdb81..748e958c3 100644 --- a/third_party/aom/av1/common/scale.h +++ b/third_party/aom/av1/common/scale.h @@ -9,12 +9,11 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_SCALE_H_ -#define AV1_COMMON_SCALE_H_ +#ifndef AOM_AV1_COMMON_SCALE_H_ +#define AOM_AV1_COMMON_SCALE_H_ #include "av1/common/convolve.h" #include "av1/common/mv.h" -#include "aom_dsp/aom_convolve.h" #ifdef __cplusplus extern "C" { @@ -65,4 +64,4 @@ static INLINE int valid_ref_frame_size(int ref_width, int ref_height, } // extern "C" #endif -#endif // AV1_COMMON_SCALE_H_ +#endif // AOM_AV1_COMMON_SCALE_H_ diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h index d206586b5..233dc0efa 100644 --- a/third_party/aom/av1/common/scan.h +++ b/third_party/aom/av1/common/scan.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_SCAN_H_ -#define AV1_COMMON_SCAN_H_ +#ifndef AOM_AV1_COMMON_SCAN_H_ +#define AOM_AV1_COMMON_SCAN_H_ #include "aom/aom_integer.h" #include "aom_ports/mem.h" @@ -52,4 +52,4 @@ static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) { } // extern "C" #endif -#endif // AV1_COMMON_SCAN_H_ +#endif // AOM_AV1_COMMON_SCAN_H_ diff --git a/third_party/aom/av1/common/seg_common.h b/third_party/aom/av1/common/seg_common.h index c851d65fd..8c35bba86 100644 --- a/third_party/aom/av1/common/seg_common.h +++ b/third_party/aom/av1/common/seg_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_SEG_COMMON_H_ -#define AV1_COMMON_SEG_COMMON_H_ +#ifndef AOM_AV1_COMMON_SEG_COMMON_H_ +#define AOM_AV1_COMMON_SEG_COMMON_H_ #include "aom_dsp/prob.h" @@ -101,4 +101,4 @@ static INLINE int get_segdata(const struct segmentation *seg, int segment_id, } // extern "C" #endif -#endif // AV1_COMMON_SEG_COMMON_H_ +#endif // AOM_AV1_COMMON_SEG_COMMON_H_ diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c index f9b734b8c..8df4c9a09 100644 --- a/third_party/aom/av1/common/thread_common.c +++ b/third_party/aom/av1/common/thread_common.c @@ -304,8 +304,9 @@ static INLINE void thread_loop_filter_rows( } // Row-based multi-threaded loopfilter hook -static int loop_filter_row_worker(AV1LfSync *const lf_sync, - LFWorkerData *const lf_data) { +static int loop_filter_row_worker(void *arg1, void *arg2) { + AV1LfSync *const lf_sync = (AV1LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd, lf_sync); return 1; @@ -342,7 +343,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, AVxWorker *const worker = &workers[i]; LFWorkerData *const lf_data = &lf_sync->lfdata[i]; - worker->hook = (AVxWorkerHook)loop_filter_row_worker; + worker->hook = loop_filter_row_worker; worker->data1 = lf_sync; worker->data2 = lf_data; @@ -649,8 +650,9 @@ AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) { } // Implement row loop restoration for each thread. -static int loop_restoration_row_worker(AV1LrSync *const lr_sync, - LRWorkerData *lrworkerdata) { +static int loop_restoration_row_worker(void *arg1, void *arg2) { + AV1LrSync *const lr_sync = (AV1LrSync *)arg1; + LRWorkerData *lrworkerdata = (LRWorkerData *)arg2; AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt; FilterFrameCtxt *ctxt = lr_ctxt->ctxt; int lr_unit_row; @@ -714,10 +716,12 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt, int num_rows_lr = 0; for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + const AV1PixelRect tile_rect = ctxt[plane].tile_rect; const int max_tile_h = tile_rect.bottom - tile_rect.top; - const int unit_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64; + const int unit_size = cm->rst_info[plane].restoration_unit_size; num_rows_lr = AOMMAX(num_rows_lr, av1_lr_count_units_in_tile(unit_size, max_tile_h)); @@ -746,7 +750,7 @@ static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt, for (i = 0; i < num_workers; ++i) { AVxWorker *const worker = &workers[i]; lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt; - worker->hook = (AVxWorkerHook)loop_restoration_row_worker; + worker->hook = loop_restoration_row_worker; worker->data1 = lr_sync; worker->data2 = &lr_sync->lrworkerdata[i]; diff --git a/third_party/aom/av1/common/thread_common.h b/third_party/aom/av1/common/thread_common.h index 4b0d5d2b8..23d61d72a 100644 --- a/third_party/aom/av1/common/thread_common.h +++ b/third_party/aom/av1/common/thread_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_LOOPFILTER_THREAD_H_ -#define AV1_COMMON_LOOPFILTER_THREAD_H_ +#ifndef AOM_AV1_COMMON_THREAD_COMMON_H_ +#define AOM_AV1_COMMON_THREAD_COMMON_H_ #include "config/aom_config.h" @@ -116,4 +116,4 @@ void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers); } // extern "C" #endif -#endif // AV1_COMMON_LOOPFILTER_THREAD_H_ +#endif // AOM_AV1_COMMON_THREAD_COMMON_H_ diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c index 026c904b6..1b413487f 100644 --- a/third_party/aom/av1/common/tile_common.c +++ b/third_party/aom/av1/common/tile_common.c @@ -127,6 +127,22 @@ void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) { assert(tile->mi_col_end > tile->mi_col_start); } +int av1_get_sb_rows_in_tile(AV1_COMMON *cm, TileInfo tile) { + int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO( + tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2); + int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2; + + return sb_rows; +} + +int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) { + int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO( + tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2); + int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2; + + return sb_cols; +} + int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) { // Round the frame up to a whole number of max superblocks mi_frame_size = ALIGN_POWER_OF_TWO(mi_frame_size, MAX_MIB_SIZE_LOG2); diff --git a/third_party/aom/av1/common/tile_common.h b/third_party/aom/av1/common/tile_common.h index be037fb17..c03553dc6 100644 --- a/third_party/aom/av1/common/tile_common.h +++ b/third_party/aom/av1/common/tile_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_TILE_COMMON_H_ -#define AV1_COMMON_TILE_COMMON_H_ +#ifndef AOM_AV1_COMMON_TILE_COMMON_H_ +#define AOM_AV1_COMMON_TILE_COMMON_H_ #ifdef __cplusplus extern "C" { @@ -44,6 +44,9 @@ void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, // tiles horizontally or vertically in the frame. int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles); +int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile); +int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile); + typedef struct { int left, top, right, bottom; } AV1PixelRect; @@ -66,4 +69,4 @@ void av1_calculate_tile_rows(struct AV1Common *const cm); } // extern "C" #endif -#endif // AV1_COMMON_TILE_COMMON_H_ +#endif // AOM_AV1_COMMON_TILE_COMMON_H_ diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h index 1749baa57..06939ae43 100644 --- a/third_party/aom/av1/common/timing.h +++ b/third_party/aom/av1/common/timing.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_TIMING_H_ -#define AOM_TIMING_H_ +#ifndef AOM_AV1_COMMON_TIMING_H_ +#define AOM_AV1_COMMON_TIMING_H_ #include "aom/aom_integer.h" #include "av1/common/enums.h" @@ -56,4 +56,4 @@ void set_resource_availability_parameters( int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, int seq_tier); -#endif // AOM_TIMING_H_ +#endif // AOM_AV1_COMMON_TIMING_H_ diff --git a/third_party/aom/av1/common/token_cdfs.h b/third_party/aom/av1/common/token_cdfs.h index 9a6b454ac..53e956450 100644 --- a/third_party/aom/av1/common/token_cdfs.h +++ b/third_party/aom/av1/common/token_cdfs.h @@ -9,6 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#ifndef AOM_AV1_COMMON_TOKEN_CDFS_H_ +#define AOM_AV1_COMMON_TOKEN_CDFS_H_ + #include "config/aom_config.h" #include "av1/common/entropy.h" @@ -3548,3 +3551,5 @@ static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) } } } } }; + +#endif // AOM_AV1_COMMON_TOKEN_CDFS_H_ diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h index f0ab79d0f..1dda51f8b 100644 --- a/third_party/aom/av1/common/txb_common.h +++ b/third_party/aom/av1/common/txb_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_TXB_COMMON_H_ -#define AV1_COMMON_TXB_COMMON_H_ +#ifndef AOM_AV1_COMMON_TXB_COMMON_H_ +#define AOM_AV1_COMMON_TXB_COMMON_H_ extern const int16_t k_eob_group_start[12]; extern const int16_t k_eob_offset_bits[12]; @@ -34,24 +34,6 @@ static const int base_level_count_to_index[13] = { 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, }; -// Note: TX_PAD_2D is dependent to this offset table. -static const int base_ref_offset[BASE_CONTEXT_POSITION_NUM][2] = { - /* clang-format off*/ - { -2, 0 }, { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -2 }, { 0, -1 }, { 0, 1 }, - { 0, 2 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 2, 0 } - /* clang-format on*/ -}; - -#define CONTEXT_MAG_POSITION_NUM 3 -static const int mag_ref_offset_with_txclass[3][CONTEXT_MAG_POSITION_NUM][2] = { - { { 0, 1 }, { 1, 0 }, { 1, 1 } }, - { { 0, 1 }, { 1, 0 }, { 0, 2 } }, - { { 0, 1 }, { 1, 0 }, { 2, 0 } } -}; -static const int mag_ref_offset[CONTEXT_MAG_POSITION_NUM][2] = { - { 0, 1 }, { 1, 0 }, { 1, 1 } -}; - static const TX_CLASS tx_type_to_class[TX_TYPES] = { TX_CLASS_2D, // DCT_DCT TX_CLASS_2D, // ADST_DCT @@ -71,61 +53,6 @@ static const TX_CLASS tx_type_to_class[TX_TYPES] = { TX_CLASS_HORIZ, // H_FLIPADST }; -static const int8_t eob_to_pos_small[33] = { - 0, 1, 2, // 0-2 - 3, 3, // 3-4 - 4, 4, 4, 4, // 5-8 - 5, 5, 5, 5, 5, 5, 5, 5, // 9-16 - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32 -}; - -static const int8_t eob_to_pos_large[17] = { - 6, // place holder - 7, // 33-64 - 8, 8, // 65-128 - 9, 9, 9, 9, // 129-256 - 10, 10, 10, 10, 10, 10, 10, 10, // 257-512 - 11 // 513- -}; - -static INLINE int get_eob_pos_token(const int eob, int *const extra) { - int t; - - if (eob < 33) { - t = eob_to_pos_small[eob]; - } else { - const int e = AOMMIN((eob - 1) >> 5, 16); - t = eob_to_pos_large[e]; - } - - *extra = eob - k_eob_group_start[t]; - - return t; -} - -static INLINE int av1_get_eob_pos_ctx(const TX_TYPE tx_type, - const int eob_token) { - static const int8_t tx_type_to_offset[TX_TYPES] = { - -1, // DCT_DCT - -1, // ADST_DCT - -1, // DCT_ADST - -1, // ADST_ADST - -1, // FLIPADST_DCT - -1, // DCT_FLIPADST - -1, // FLIPADST_FLIPADST - -1, // ADST_FLIPADST - -1, // FLIPADST_ADST - -1, // IDTX - 10, // V_DCT - 10, // H_DCT - 10, // V_ADST - 10, // H_ADST - 10, // V_FLIPADST - 10, // H_FLIPADST - }; - return eob_token + tx_type_to_offset[tx_type]; -} - static INLINE int get_txb_bwl(TX_SIZE tx_size) { tx_size = av1_get_adjusted_tx_size(tx_size); return tx_size_wide_log2[tx_size]; @@ -141,36 +68,6 @@ static INLINE int get_txb_high(TX_SIZE tx_size) { return tx_size_high[tx_size]; } -static INLINE void get_base_count_mag(int *mag, int *count, - const tran_low_t *tcoeffs, int bwl, - int height, int row, int col) { - mag[0] = 0; - mag[1] = 0; - for (int i = 0; i < NUM_BASE_LEVELS; ++i) count[i] = 0; - for (int idx = 0; idx < BASE_CONTEXT_POSITION_NUM; ++idx) { - const int ref_row = row + base_ref_offset[idx][0]; - const int ref_col = col + base_ref_offset[idx][1]; - if (ref_row < 0 || ref_col < 0 || ref_row >= height || - ref_col >= (1 << bwl)) - continue; - const int pos = (ref_row << bwl) + ref_col; - tran_low_t abs_coeff = abs(tcoeffs[pos]); - // count - for (int i = 0; i < NUM_BASE_LEVELS; ++i) { - count[i] += abs_coeff > i; - } - // mag - if (base_ref_offset[idx][0] >= 0 && base_ref_offset[idx][1] >= 0) { - if (abs_coeff > mag[0]) { - mag[0] = abs_coeff; - mag[1] = 1; - } else if (abs_coeff == mag[0]) { - ++mag[1]; - } - } - } -} - static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int width) { return levels_buf + TX_PAD_TOP * (width + TX_PAD_HOR); } @@ -179,30 +76,6 @@ static INLINE int get_padded_idx(const int idx, const int bwl) { return idx + ((idx >> bwl) << TX_PAD_HOR_LOG2); } -static INLINE int get_level_count(const uint8_t *const levels, const int stride, - const int row, const int col, const int level, - const int (*nb_offset)[2], const int nb_num) { - int count = 0; - - for (int idx = 0; idx < nb_num; ++idx) { - const int ref_row = row + nb_offset[idx][0]; - const int ref_col = col + nb_offset[idx][1]; - const int pos = ref_row * stride + ref_col; - count += levels[pos] > level; - } - return count; -} - -static INLINE void get_level_mag(const uint8_t *const levels, const int stride, - const int row, const int col, int *const mag) { - for (int idx = 0; idx < CONTEXT_MAG_POSITION_NUM; ++idx) { - const int ref_row = row + mag_ref_offset[idx][0]; - const int ref_col = col + mag_ref_offset[idx][1]; - const int pos = ref_row * stride + ref_col; - mag[idx] = levels[pos]; - } -} - static INLINE int get_base_ctx_from_count_mag(int row, int col, int count, int sig_mag) { const int ctx = base_level_count_to_index[count]; @@ -267,84 +140,6 @@ static INLINE int get_base_ctx_from_count_mag(int row, int col, int count, return ctx_idx; } -static INLINE int get_base_ctx(const uint8_t *const levels, - const int c, // raster order - const int bwl, const int level_minus_1, - const int count) { - const int row = c >> bwl; - const int col = c - (row << bwl); - const int stride = (1 << bwl) + TX_PAD_HOR; - int mag_count = 0; - int nb_mag[3] = { 0 }; - - get_level_mag(levels, stride, row, col, nb_mag); - - for (int idx = 0; idx < 3; ++idx) - mag_count += nb_mag[idx] > (level_minus_1 + 1); - const int ctx_idx = - get_base_ctx_from_count_mag(row, col, count, AOMMIN(2, mag_count)); - return ctx_idx; -} - -#define BR_CONTEXT_POSITION_NUM 8 // Base range coefficient context -// Note: TX_PAD_2D is dependent to this offset table. -static const int br_ref_offset[BR_CONTEXT_POSITION_NUM][2] = { - /* clang-format off*/ - { -1, -1 }, { -1, 0 }, { -1, 1 }, { 0, -1 }, - { 0, 1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, - /* clang-format on*/ -}; - -static const int br_level_map[9] = { - 0, 0, 1, 1, 2, 2, 3, 3, 3, -}; - -// Note: If BR_MAG_OFFSET changes, the calculation of offset in -// get_br_ctx_from_count_mag() must be updated. -#define BR_MAG_OFFSET 1 -// TODO(angiebird): optimize this function by using a table to map from -// count/mag to ctx - -static INLINE int get_br_count_mag(int *mag, const tran_low_t *tcoeffs, int bwl, - int height, int row, int col, int level) { - mag[0] = 0; - mag[1] = 0; - int count = 0; - for (int idx = 0; idx < BR_CONTEXT_POSITION_NUM; ++idx) { - const int ref_row = row + br_ref_offset[idx][0]; - const int ref_col = col + br_ref_offset[idx][1]; - if (ref_row < 0 || ref_col < 0 || ref_row >= height || - ref_col >= (1 << bwl)) - continue; - const int pos = (ref_row << bwl) + ref_col; - tran_low_t abs_coeff = abs(tcoeffs[pos]); - count += abs_coeff > level; - if (br_ref_offset[idx][0] >= 0 && br_ref_offset[idx][1] >= 0) { - if (abs_coeff > mag[0]) { - mag[0] = abs_coeff; - mag[1] = 1; - } else if (abs_coeff == mag[0]) { - ++mag[1]; - } - } - } - return count; -} - -static INLINE int get_br_ctx_from_count_mag(const int row, const int col, - const int count, const int mag) { - // DC: 0 - 1 - // Top row: 2 - 4 - // Left column: 5 - 7 - // others: 8 - 11 - static const int offset_pos[2][2] = { { 8, 5 }, { 2, 0 } }; - const int mag_clamp = AOMMIN(mag, 6); - const int offset = mag_clamp >> 1; - const int ctx = - br_level_map[count] + offset * BR_TMP_OFFSET + offset_pos[!row][!col]; - return ctx; -} - static INLINE int get_br_ctx_2d(const uint8_t *const levels, const int c, // raster order const int bwl) { @@ -396,38 +191,6 @@ static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels, return mag + 14; } -#define SIG_REF_OFFSET_NUM 5 - -// Note: TX_PAD_2D is dependent to these offset tables. -static const int sig_ref_offset[SIG_REF_OFFSET_NUM][2] = { - { 0, 1 }, { 1, 0 }, { 1, 1 }, { 0, 2 }, { 2, 0 } - // , { 1, 2 }, { 2, 1 }, -}; - -static const int sig_ref_offset_vert[SIG_REF_OFFSET_NUM][2] = { - { 1, 0 }, { 2, 0 }, { 0, 1 }, { 3, 0 }, { 4, 0 } - // , { 1, 1 }, { 2, 1 }, -}; - -static const int sig_ref_offset_horiz[SIG_REF_OFFSET_NUM][2] = { - { 0, 1 }, { 0, 2 }, { 1, 0 }, { 0, 3 }, { 0, 4 } - // , { 1, 1 }, { 1, 2 }, -}; - -#define SIG_REF_DIFF_OFFSET_NUM 3 - -static const int sig_ref_diff_offset[SIG_REF_DIFF_OFFSET_NUM][2] = { - { 1, 1 }, { 0, 2 }, { 2, 0 } -}; - -static const int sig_ref_diff_offset_vert[SIG_REF_DIFF_OFFSET_NUM][2] = { - { 2, 0 }, { 3, 0 }, { 4, 0 } -}; - -static const int sig_ref_diff_offset_horiz[SIG_REF_DIFF_OFFSET_NUM][2] = { - { 0, 2 }, { 0, 3 }, { 0, 4 } -}; - static const uint8_t clip_max3[256] = { 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, @@ -658,4 +421,4 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, void av1_init_lv_map(AV1_COMMON *cm); -#endif // AV1_COMMON_TXB_COMMON_H_ +#endif // AOM_AV1_COMMON_TXB_COMMON_H_ diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c index 412d83ed8..4144c4389 100644 --- a/third_party/aom/av1/common/warped_motion.c +++ b/third_party/aom/av1/common/warped_motion.c @@ -562,7 +562,7 @@ static int64_t highbd_warp_error( const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; - ConvolveParams conv_params = get_conv_params(0, 0, 0, bd); + ConvolveParams conv_params = get_conv_params(0, 0, bd); conv_params.use_jnt_comp_avg = 0; for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { @@ -845,7 +845,7 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref, int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; - ConvolveParams conv_params = get_conv_params(0, 0, 0, 8); + ConvolveParams conv_params = get_conv_params(0, 0, 8); conv_params.use_jnt_comp_avg = 0; for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h index ce4032ee5..a1a4f067d 100644 --- a/third_party/aom/av1/common/warped_motion.h +++ b/third_party/aom/av1/common/warped_motion.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_WARPED_MOTION_H_ -#define AV1_COMMON_WARPED_MOTION_H_ +#ifndef AOM_AV1_COMMON_WARPED_MOTION_H_ +#define AOM_AV1_COMMON_WARPED_MOTION_H_ #include #include @@ -92,4 +92,4 @@ int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy, int mi_col); int get_shear_params(WarpedMotionParams *wm); -#endif // AV1_COMMON_WARPED_MOTION_H_ +#endif // AOM_AV1_COMMON_WARPED_MOTION_H_ diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c index 0c5286f9d..d9fb53785 100644 --- a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c +++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c @@ -14,7 +14,6 @@ #include "config/aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c index ae331b40d..5db2ccf6c 100644 --- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c @@ -18,6 +18,12 @@ #include "av1/common/x86/av1_inv_txfm_avx2.h" #include "av1/common/x86/av1_inv_txfm_ssse3.h" +// TODO(venkatsanampudi@ittiam.com): move this to header file + +// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 +static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h index 7b5b29cf8..f74cbaeaa 100644 --- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ -#define AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ +#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ +#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ #include @@ -68,4 +68,4 @@ void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, } #endif -#endif // AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ +#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c index dd7cee24c..995bc3da4 100644 --- a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c @@ -16,6 +16,12 @@ #include "av1/common/x86/av1_inv_txfm_ssse3.h" #include "av1/common/x86/av1_txfm_sse2.h" +// TODO(venkatsanampudi@ittiam.com): move this to header file + +// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 +static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + // TODO(binpengsmail@gmail.com): replace some for loop with do {} while static void idct4_new_sse2(const __m128i *input, __m128i *output, diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h index dc9be25d2..66bd339d1 100644 --- a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ -#define AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ +#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ +#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ #include // SSE2 #include // SSSE3 @@ -94,10 +94,6 @@ static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, }; -// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 -static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, - 4 * 5793 }; - DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, }; @@ -233,4 +229,4 @@ void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, } // extern "C" #endif -#endif // AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ +#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h index 721cfe059..77aeb6eb1 100644 --- a/third_party/aom/av1/common/x86/av1_txfm_sse2.h +++ b/third_party/aom/av1/common/x86/av1_txfm_sse2.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_X86_AV1_TXFM_SSE2_H_ -#define AV1_COMMON_X86_AV1_TXFM_SSE2_H_ +#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ +#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ #include // SSE2 @@ -314,4 +314,4 @@ typedef struct { #ifdef __cplusplus } #endif // __cplusplus -#endif // AV1_COMMON_X86_AV1_TXFM_SSE2_H_ +#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h index 367e02096..6cad821b1 100644 --- a/third_party/aom/av1/common/x86/av1_txfm_sse4.h +++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_TXFM_SSE4_H_ -#define AV1_TXFM_SSE4_H_ +#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ +#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ #include @@ -45,8 +45,9 @@ static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input, static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input, __m128i *output, const int size, - const int bit) { - const __m128i sqrt2 = _mm_set1_epi32(NewSqrt2); + const int bit, + const int val) { + const __m128i sqrt2 = _mm_set1_epi32(val); if (bit > 0) { int i; for (i = 0; i < size; i++) { @@ -68,4 +69,4 @@ static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input, } #endif -#endif // AV1_TXFM_SSE4_H_ +#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h index 7479ac3e1..3b342cd4e 100644 --- a/third_party/aom/av1/common/x86/cfl_simd.h +++ b/third_party/aom/av1/common/x86/cfl_simd.h @@ -9,6 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_ +#define AOM_AV1_COMMON_X86_CFL_SIMD_H_ + #include "av1/common/blockd.h" // SSSE3 version is optimal for with == 4, we reuse them in AVX2 @@ -236,3 +239,5 @@ void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd); void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd); + +#endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_ diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c index 1099144fe..0acafd044 100644 --- a/third_party/aom/av1/common/x86/convolve_2d_avx2.c +++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c @@ -11,10 +11,8 @@ #include -#include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/convolve_common_intrin.h" #include "aom_dsp/aom_dsp_common.h" diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c index 637f83cf7..b1a62a4f6 100644 --- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c +++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c @@ -11,9 +11,8 @@ #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c index f66dee37d..5016642de 100644 --- a/third_party/aom/av1/common/x86/convolve_sse2.c +++ b/third_party/aom/av1/common/x86/convolve_sse2.c @@ -11,9 +11,8 @@ #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_common_intrin.h" @@ -76,8 +75,8 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s, return convolve(ss, coeffs); } -void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, - const uint8_t *dst, int dst_stride, int w, int h, +void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, @@ -237,8 +236,8 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, } } -void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, - const uint8_t *dst, int dst_stride, int w, int h, +void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c index 8444ffa93..ae68f0bbb 100644 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c @@ -14,7 +14,6 @@ #include "config/aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/aom_dsp_common.h" diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c index eb340523a..3f8dafb4b 100644 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c @@ -15,7 +15,6 @@ #include "config/aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c index 33183fdee..1d029db39 100644 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c @@ -14,7 +14,6 @@ #include "config/aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c index debb05a6d..ade2af03e 100644 --- a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c @@ -15,6 +15,9 @@ #include "config/av1_rtcd.h" #include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/idct.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" // Note: // Total 32x4 registers to represent 32x32 block coefficients. @@ -27,131 +30,125 @@ // ... ... // v124, v125, v126, v127 -static void transpose_32x32_8x8(const __m256i *in, __m256i *out) { +static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) { + const __m256i zero = _mm256_setzero_si256(); + const __m256i one = _mm256_set1_epi16(1); + const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one); + __m256i clamped, mask; + + mask = _mm256_cmpgt_epi16(u, max); + clamped = _mm256_andnot_si256(mask, u); + mask = _mm256_and_si256(mask, max); + clamped = _mm256_or_si256(mask, clamped); + mask = _mm256_cmpgt_epi16(clamped, zero); + clamped = _mm256_and_si256(clamped, mask); + + return clamped; +} + +static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred, + __m256i res0, __m256i res1, + const int bd) { + __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred)); + __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1)); + + x0 = _mm256_add_epi32(res0, x0); + x1 = _mm256_add_epi32(res1, x1); + x0 = _mm256_packus_epi32(x0, x1); + x0 = _mm256_permute4x64_epi64(x0, 0xd8); + x0 = highbd_clamp_epi16_avx2(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride)); + __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd); + + _mm256_storeu_si256((__m256i *)(output + i * stride), u); + } +} + +static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) { + __m256i tmp, round; + round = _mm256_set1_epi32(1 << (bit - 1)); + tmp = _mm256_add_epi32(vec, round); + return _mm256_srai_epi32(tmp, bit); +} + +static INLINE void av1_round_shift_array_32_avx2(__m256i *input, + __m256i *output, + const int size, + const int bit) { + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + output[i] = av1_round_shift_32_avx2(input[i], bit); + } + } else { + int i; + for (i = 0; i < size; i++) { + output[i] = _mm256_slli_epi32(input[i], -bit); + } + } +} + +static void transpose_8x8_avx2(const __m256i *in, __m256i *out) { __m256i u0, u1, u2, u3, u4, u5, u6, u7; __m256i x0, x1; - u0 = _mm256_unpacklo_epi32(in[0], in[4]); - u1 = _mm256_unpackhi_epi32(in[0], in[4]); + u0 = _mm256_unpacklo_epi32(in[0], in[1]); + u1 = _mm256_unpackhi_epi32(in[0], in[1]); - u2 = _mm256_unpacklo_epi32(in[8], in[12]); - u3 = _mm256_unpackhi_epi32(in[8], in[12]); + u2 = _mm256_unpacklo_epi32(in[2], in[3]); + u3 = _mm256_unpackhi_epi32(in[2], in[3]); - u4 = _mm256_unpacklo_epi32(in[16], in[20]); - u5 = _mm256_unpackhi_epi32(in[16], in[20]); + u4 = _mm256_unpacklo_epi32(in[4], in[5]); + u5 = _mm256_unpackhi_epi32(in[4], in[5]); - u6 = _mm256_unpacklo_epi32(in[24], in[28]); - u7 = _mm256_unpackhi_epi32(in[24], in[28]); + u6 = _mm256_unpacklo_epi32(in[6], in[7]); + u7 = _mm256_unpackhi_epi32(in[6], in[7]); x0 = _mm256_unpacklo_epi64(u0, u2); x1 = _mm256_unpacklo_epi64(u4, u6); out[0] = _mm256_permute2f128_si256(x0, x1, 0x20); - out[16] = _mm256_permute2f128_si256(x0, x1, 0x31); + out[4] = _mm256_permute2f128_si256(x0, x1, 0x31); x0 = _mm256_unpackhi_epi64(u0, u2); x1 = _mm256_unpackhi_epi64(u4, u6); - out[4] = _mm256_permute2f128_si256(x0, x1, 0x20); - out[20] = _mm256_permute2f128_si256(x0, x1, 0x31); + out[1] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5] = _mm256_permute2f128_si256(x0, x1, 0x31); x0 = _mm256_unpacklo_epi64(u1, u3); x1 = _mm256_unpacklo_epi64(u5, u7); - out[8] = _mm256_permute2f128_si256(x0, x1, 0x20); - out[24] = _mm256_permute2f128_si256(x0, x1, 0x31); + out[2] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6] = _mm256_permute2f128_si256(x0, x1, 0x31); x0 = _mm256_unpackhi_epi64(u1, u3); x1 = _mm256_unpackhi_epi64(u5, u7); - out[12] = _mm256_permute2f128_si256(x0, x1, 0x20); - out[28] = _mm256_permute2f128_si256(x0, x1, 0x31); -} - -static void transpose_32x32_16x16(const __m256i *in, __m256i *out) { - transpose_32x32_8x8(&in[0], &out[0]); - transpose_32x32_8x8(&in[1], &out[32]); - transpose_32x32_8x8(&in[32], &out[1]); - transpose_32x32_8x8(&in[33], &out[33]); -} - -static void transpose_32x32(const __m256i *in, __m256i *out) { - transpose_32x32_16x16(&in[0], &out[0]); - transpose_32x32_16x16(&in[2], &out[64]); - transpose_32x32_16x16(&in[64], &out[2]); - transpose_32x32_16x16(&in[66], &out[66]); + out[3] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); } -static void load_buffer_32x32(const int32_t *coeff, __m256i *in) { +static void load_buffer_32x32(const int32_t *coeff, __m256i *in, + int input_stiride, int size) { int i; - for (i = 0; i < 128; ++i) { - in[i] = _mm256_loadu_si256((const __m256i *)coeff); - coeff += 8; + for (i = 0; i < size; ++i) { + in[i] = _mm256_loadu_si256((const __m256i *)(coeff + i * input_stiride)); } } -static __m256i highbd_clamp_epi32(__m256i x, int bd) { - const __m256i zero = _mm256_setzero_si256(); - const __m256i one = _mm256_set1_epi16(1); - const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one); - __m256i clamped, mask; - - mask = _mm256_cmpgt_epi16(x, max); - clamped = _mm256_andnot_si256(mask, x); - mask = _mm256_and_si256(mask, max); - clamped = _mm256_or_si256(mask, clamped); - mask = _mm256_cmpgt_epi16(clamped, zero); - clamped = _mm256_and_si256(clamped, mask); - - return clamped; -} - -static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride, - int fliplr, int flipud, int shift, int bd) { - __m256i u0, u1, x0, x1, x2, x3, v0, v1, v2, v3; - const __m256i zero = _mm256_setzero_si256(); - int i = 0; - (void)fliplr; - (void)flipud; - - __m256i round = _mm256_set1_epi32((1 << shift) >> 1); - - while (i < 128) { - u0 = _mm256_loadu_si256((const __m256i *)output); - u1 = _mm256_loadu_si256((const __m256i *)(output + 16)); - - x0 = _mm256_unpacklo_epi16(u0, zero); - x1 = _mm256_unpackhi_epi16(u0, zero); - x2 = _mm256_unpacklo_epi16(u1, zero); - x3 = _mm256_unpackhi_epi16(u1, zero); - - v0 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x20); - v1 = _mm256_permute2f128_si256(in[i], in[i + 1], 0x31); - v2 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x20); - v3 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x31); - - v0 = _mm256_add_epi32(v0, round); - v1 = _mm256_add_epi32(v1, round); - v2 = _mm256_add_epi32(v2, round); - v3 = _mm256_add_epi32(v3, round); - - v0 = _mm256_sra_epi32(v0, _mm_cvtsi32_si128(shift)); - v1 = _mm256_sra_epi32(v1, _mm_cvtsi32_si128(shift)); - v2 = _mm256_sra_epi32(v2, _mm_cvtsi32_si128(shift)); - v3 = _mm256_sra_epi32(v3, _mm_cvtsi32_si128(shift)); - - v0 = _mm256_add_epi32(v0, x0); - v1 = _mm256_add_epi32(v1, x1); - v2 = _mm256_add_epi32(v2, x2); - v3 = _mm256_add_epi32(v3, x3); - - v0 = _mm256_packus_epi32(v0, v1); - v2 = _mm256_packus_epi32(v2, v3); - - v0 = highbd_clamp_epi32(v0, bd); - v2 = highbd_clamp_epi32(v2, bd); - - _mm256_storeu_si256((__m256i *)output, v0); - _mm256_storeu_si256((__m256i *)(output + 16), v2); - output += stride; - i += 4; - } +static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *rounding, int bit) { + __m256i x; + x = _mm256_mullo_epi32(*w0, *n0); + x = _mm256_add_epi32(x, *rounding); + x = _mm256_srai_epi32(x, bit); + return x; } static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0, @@ -200,18 +197,549 @@ static void addsub_shift_avx2(const __m256i in0, const __m256i in1, __m256i a0 = _mm256_add_epi32(in0_w_offset, in1); __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1); + a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + a0 = _mm256_max_epi32(a0, *clamp_lo); a0 = _mm256_min_epi32(a0, *clamp_hi); a1 = _mm256_max_epi32(a1, *clamp_lo); a1 = _mm256_min_epi32(a1, *clamp_hi); - a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift)); - a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift)); - *out0 = a0; *out1 = a1; } +static INLINE void idct32_stage4_avx2( + __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56, + const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40, + const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); + bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); + bf1[17] = temp1; + + temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit); + bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); + bf1[18] = temp2; + + temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); + bf1[21] = temp1; + + temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); + bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); + bf1[22] = temp2; +} + +static INLINE void idct32_stage5_avx2( + __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48, + const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo, + const __m256i *clamp_hi, const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); + bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); + bf1[9] = temp1; + + temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit); + bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); + bf1[10] = temp2; + + addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); + addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); + addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); + addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); + addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage6_avx2( + __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32, + const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, + const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[5] = temp1; + + addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); + addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); + addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); + addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); + + temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); + bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); + bf1[18] = temp1; + temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); + bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); + bf1[19] = temp2; + temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); + bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); + bf1[21] = temp2; +} + +static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); + + temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[10] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[11] = temp2; + + addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); + addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); + addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); + addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); + addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); + addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); + addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); + addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); + addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); + + temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[21] = temp2; + temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[22] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[23] = temp2; +} + +static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out, + const int do_cols, const int bd, + const int out_shift, + const int log_range) { + if (do_cols) { + addsub_no_clamp_avx2(bf1[0], bf1[31], out + 0, out + 31); + addsub_no_clamp_avx2(bf1[1], bf1[30], out + 1, out + 30); + addsub_no_clamp_avx2(bf1[2], bf1[29], out + 2, out + 29); + addsub_no_clamp_avx2(bf1[3], bf1[28], out + 3, out + 28); + addsub_no_clamp_avx2(bf1[4], bf1[27], out + 4, out + 27); + addsub_no_clamp_avx2(bf1[5], bf1[26], out + 5, out + 26); + addsub_no_clamp_avx2(bf1[6], bf1[25], out + 6, out + 25); + addsub_no_clamp_avx2(bf1[7], bf1[24], out + 7, out + 24); + addsub_no_clamp_avx2(bf1[8], bf1[23], out + 8, out + 23); + addsub_no_clamp_avx2(bf1[9], bf1[22], out + 9, out + 22); + addsub_no_clamp_avx2(bf1[10], bf1[21], out + 10, out + 21); + addsub_no_clamp_avx2(bf1[11], bf1[20], out + 11, out + 20); + addsub_no_clamp_avx2(bf1[12], bf1[19], out + 12, out + 19); + addsub_no_clamp_avx2(bf1[13], bf1[18], out + 13, out + 18); + addsub_no_clamp_avx2(bf1[14], bf1[17], out + 14, out + 17); + addsub_no_clamp_avx2(bf1[15], bf1[16], out + 15, out + 16); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + addsub_shift_avx2(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} + +static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i x; + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + x = _mm256_mullo_epi32(in[0], cospi32); + x = _mm256_add_epi32(x, rounding); + x = _mm256_srai_epi32(x, bit); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + if (do_cols) { + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + x = _mm256_add_epi32(offset, x); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + x = _mm256_max_epi32(x, clamp_lo_out); + x = _mm256_min_epi32(x, clamp_hi_out); + } + + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; + out[8] = x; + out[9] = x; + out[10] = x; + out[11] = x; + out[12] = x; + out[13] = x; + out[14] = x; + out[15] = x; + out[16] = x; + out[17] = x; + out[18] = x; + out[19] = x; + out[20] = x; + out[21] = x; + out[22] = x; + out[23] = x; + out[24] = x; + out[25] = x; + out[26] = x; + out[27] = x; + out[28] = x; + out[29] = x; + out[30] = x; + out[31] = x; +} + +static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32]; + + { + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[4] = in[4]; + bf1[8] = in[2]; + bf1[12] = in[6]; + bf1[16] = in[1]; + bf1[20] = in[5]; + bf1[24] = in[3]; + bf1[28] = in[7]; + + // stage 2 + bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); + bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit); + bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); + + bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); + bf1[17] = bf1[16]; + bf1[18] = bf1[19]; + bf1[21] = bf1[20]; + bf1[22] = bf1[23]; + bf1[25] = bf1[24]; + bf1[26] = bf1[27]; + bf1[29] = bf1[28]; + bf1[30] = bf1[31]; + + // stage 4 + bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); + + bf1[9] = bf1[8]; + bf1[10] = bf1[11]; + bf1[13] = bf1[12]; + bf1[14] = bf1[15]; + + idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[5] = bf1[4]; + bf1[6] = bf1[7]; + + idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + bf1[3] = bf1[0]; + bf1[2] = bf1[1]; + + idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range); + } +} + +static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32]; + + { + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[2] = in[8]; + bf1[4] = in[4]; + bf1[6] = in[12]; + bf1[8] = in[2]; + bf1[10] = in[10]; + bf1[12] = in[6]; + bf1[14] = in[14]; + bf1[16] = in[1]; + bf1[18] = in[9]; + bf1[20] = in[5]; + bf1[22] = in[13]; + bf1[24] = in[3]; + bf1[26] = in[11]; + bf1[28] = in[7]; + bf1[30] = in[15]; + + // stage 2 + bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); + bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit); + bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit); + bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit); + bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit); + bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit); + bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit); + bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit); + bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit); + bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit); + bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); + bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit); + bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit); + bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit); + bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit); + bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); + + addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); + bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit); + bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit); + + addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); + + idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit); + bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit); + + addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + + idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); + + idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range); + } +} + static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); @@ -270,43 +798,42 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); __m256i bf1[32], bf0[32]; - int col; - for (col = 0; col < 4; ++col) { + { // stage 0 // stage 1 - bf1[0] = in[0 * 4 + col]; - bf1[1] = in[16 * 4 + col]; - bf1[2] = in[8 * 4 + col]; - bf1[3] = in[24 * 4 + col]; - bf1[4] = in[4 * 4 + col]; - bf1[5] = in[20 * 4 + col]; - bf1[6] = in[12 * 4 + col]; - bf1[7] = in[28 * 4 + col]; - bf1[8] = in[2 * 4 + col]; - bf1[9] = in[18 * 4 + col]; - bf1[10] = in[10 * 4 + col]; - bf1[11] = in[26 * 4 + col]; - bf1[12] = in[6 * 4 + col]; - bf1[13] = in[22 * 4 + col]; - bf1[14] = in[14 * 4 + col]; - bf1[15] = in[30 * 4 + col]; - bf1[16] = in[1 * 4 + col]; - bf1[17] = in[17 * 4 + col]; - bf1[18] = in[9 * 4 + col]; - bf1[19] = in[25 * 4 + col]; - bf1[20] = in[5 * 4 + col]; - bf1[21] = in[21 * 4 + col]; - bf1[22] = in[13 * 4 + col]; - bf1[23] = in[29 * 4 + col]; - bf1[24] = in[3 * 4 + col]; - bf1[25] = in[19 * 4 + col]; - bf1[26] = in[11 * 4 + col]; - bf1[27] = in[27 * 4 + col]; - bf1[28] = in[7 * 4 + col]; - bf1[29] = in[23 * 4 + col]; - bf1[30] = in[15 * 4 + col]; - bf1[31] = in[31 * 4 + col]; + bf1[0] = in[0]; + bf1[1] = in[16]; + bf1[2] = in[8]; + bf1[3] = in[24]; + bf1[4] = in[4]; + bf1[5] = in[20]; + bf1[6] = in[12]; + bf1[7] = in[28]; + bf1[8] = in[2]; + bf1[9] = in[18]; + bf1[10] = in[10]; + bf1[11] = in[26]; + bf1[12] = in[6]; + bf1[13] = in[22]; + bf1[14] = in[14]; + bf1[15] = in[30]; + bf1[16] = in[1]; + bf1[17] = in[17]; + bf1[18] = in[9]; + bf1[19] = in[25]; + bf1[20] = in[5]; + bf1[21] = in[21]; + bf1[22] = in[13]; + bf1[23] = in[29]; + bf1[24] = in[3]; + bf1[25] = in[19]; + bf1[26] = in[11]; + bf1[27] = in[27]; + bf1[28] = in[7]; + bf1[29] = in[23]; + bf1[30] = in[15]; + bf1[31] = in[31]; // stage 2 bf0[0] = bf1[0]; @@ -568,91 +1095,255 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, // stage 9 if (do_cols) { - addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0 * 4 + col, - out + 31 * 4 + col); - addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1 * 4 + col, - out + 30 * 4 + col); - addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2 * 4 + col, - out + 29 * 4 + col); - addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3 * 4 + col, - out + 28 * 4 + col); - addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4 * 4 + col, - out + 27 * 4 + col); - addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5 * 4 + col, - out + 26 * 4 + col); - addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6 * 4 + col, - out + 25 * 4 + col); - addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7 * 4 + col, - out + 24 * 4 + col); - addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8 * 4 + col, - out + 23 * 4 + col); - addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9 * 4 + col, - out + 22 * 4 + col); - addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10 * 4 + col, - out + 21 * 4 + col); - addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11 * 4 + col, - out + 20 * 4 + col); - addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12 * 4 + col, - out + 19 * 4 + col); - addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13 * 4 + col, - out + 18 * 4 + col); - addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14 * 4 + col, - out + 17 * 4 + col); - addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15 * 4 + col, - out + 16 * 4 + col); + addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31); + addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30); + addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29); + addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28); + addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27); + addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26); + addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25); + addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24); + addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23); + addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22); + addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21); + addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20); + addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19); + addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18); + addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17); + addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16); } else { - addsub_shift_avx2(bf0[0], bf0[31], out + 0 * 4 + col, out + 31 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[1], bf0[30], out + 1 * 4 + col, out + 30 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[2], bf0[29], out + 2 * 4 + col, out + 29 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[3], bf0[28], out + 3 * 4 + col, out + 28 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[4], bf0[27], out + 4 * 4 + col, out + 27 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[5], bf0[26], out + 5 * 4 + col, out + 26 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[6], bf0[25], out + 6 * 4 + col, out + 25 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[7], bf0[24], out + 7 * 4 + col, out + 24 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[8], bf0[23], out + 8 * 4 + col, out + 23 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[9], bf0[22], out + 9 * 4 + col, out + 22 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[10], bf0[21], out + 10 * 4 + col, - out + 21 * 4 + col, &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[11], bf0[20], out + 11 * 4 + col, - out + 20 * 4 + col, &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[12], bf0[19], out + 12 * 4 + col, - out + 19 * 4 + col, &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[13], bf0[18], out + 13 * 4 + col, - out + 18 * 4 + col, &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[14], bf0[17], out + 14 * 4 + col, - out + 17 * 4 + col, &clamp_lo, &clamp_hi, out_shift); - addsub_shift_avx2(bf0[15], bf0[16], out + 15 * 4 + col, - out + 16 * 4 + col, &clamp_lo, &clamp_hi, out_shift); + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out, + &clamp_hi_out, out_shift); } } } -void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output, - int stride, TX_TYPE tx_type, int bd) { - __m256i in[128], out[128]; - const int8_t *shift = inv_txfm_shift_ls[TX_32X32]; - const int txw_idx = get_txw_idx(TX_32X32); - const int txh_idx = get_txh_idx(TX_32X32); +typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit, + int do_cols, int bd, int out_shift); + +static const transform_1d_avx2 + highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + + { { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m256i buf1[64 * 2]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_avx2 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // 1st stage: column transform + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + __m256i buf0[32]; + const int32_t *input_row = input + i * input_stride * 8; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + __m256i *buf0_cur = buf0 + j * 8; + load_buffer_32x32(input_row + j * 8, buf0_cur, input_stride, 8); + + transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]); + } + + row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m256i *_buf1 = buf1 + i * 8; + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]); + } + } + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2, + output + 16 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} + +void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { switch (tx_type) { case DCT_DCT: - load_buffer_32x32(coeff, in); - transpose_32x32(in, out); - idct32_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); - transpose_32x32(in, out); - idct32_avx2(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_32x32(in, output, stride, 0, 0, -shift[1], bd); + highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, eob, bd); break; + default: assert(0); break; + } +} + +void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + const int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + case DCT_DCT: + av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type, + txfm_param->tx_size, + txfm_param->eob, bd); + break; + // Assembly version doesn't support IDTX, so use C version for it. + case IDTX: + av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + default: assert(0); } } + +void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_32X32: + av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param); + break; + case TX_16X16: + av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param); + break; + case TX_8X8: + av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X8: + av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); + break; + case TX_8X16: + av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X8: + av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X32: + av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); + break; + case TX_32X16: + av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); + break; + case TX_32X64: + av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); + break; + case TX_64X32: + av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); + break; + case TX_4X4: + av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + break; + case TX_4X16: + av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); + break; + case TX_8X32: + av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); + break; + case TX_32X8: + av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); + break; + case TX_64X64: + case TX_16X64: + case TX_64X16: + av1_highbd_inv_txfm2d_add_universe_sse4_1( + input, dest, stride, txfm_param->tx_type, txfm_param->tx_size, + txfm_param->eob, txfm_param->bd); + break; + default: assert(0 && "Invalid transform size"); break; + } +} diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c index 801a4133b..e29e0baf5 100644 --- a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c @@ -15,8 +15,60 @@ #include "config/av1_rtcd.h" #include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/idct.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/av1_txfm_sse4.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" +static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i clamped, mask; + + mask = _mm_cmpgt_epi16(u, max); + clamped = _mm_andnot_si128(mask, u); + mask = _mm_and_si128(mask, max); + clamped = _mm_or_si128(mask, clamped); + mask = _mm_cmpgt_epi16(clamped, zero); + clamped = _mm_and_si128(clamped, mask); + + return clamped; +} + +static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred, + __m128i res0, __m128i res1, + const int bd) { + __m128i x0 = _mm_cvtepi16_epi32(pred); + __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8)); + + x0 = _mm_add_epi32(res0, x0); + x1 = _mm_add_epi32(res1, x1); + x0 = _mm_packus_epi32(x0, x1); + x0 = highbd_clamp_epi16(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd); + + _mm_storeu_si128((__m128i *)(output + i * stride), u); + } +} + +static INLINE void load_buffer_32bit_input(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride)); + } +} + static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { in[0] = _mm_load_si128((const __m128i *)(coeff + 0)); in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); @@ -57,18 +109,231 @@ static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1, __m128i a0 = _mm_add_epi32(in0_w_offset, in1); __m128i a1 = _mm_sub_epi32(in0_w_offset, in1); + a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + a0 = _mm_max_epi32(a0, *clamp_lo); a0 = _mm_min_epi32(a0, *clamp_hi); a1 = _mm_max_epi32(a1, *clamp_lo); a1 = _mm_min_epi32(a1, *clamp_hi); - a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); - a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); - *out0 = a0; *out1 = a1; } +static INLINE void idct32_stage4_sse4_1( + __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56, + const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40, + const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); + bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); + bf1[17] = temp1; + + temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit); + bf1[29] = + half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); + bf1[18] = temp2; + + temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); + bf1[21] = temp1; + + temp2 = + half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); + bf1[25] = + half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); + bf1[22] = temp2; +} + +static INLINE void idct32_stage5_sse4_1( + __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48, + const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo, + const __m128i *clamp_hi, const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); + bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); + bf1[9] = temp1; + + temp2 = + half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit); + bf1[13] = + half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); + bf1[10] = temp2; + + addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage6_sse4_1( + __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32, + const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, + const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[5] = temp1; + + addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); + bf1[29] = + half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); + bf1[18] = temp1; + temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); + bf1[28] = + half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); + bf1[19] = temp2; + temp1 = + half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); + bf1[27] = + half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = + half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); + bf1[21] = temp2; +} + +static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[13] = + half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[10] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[12] = + half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[11] = temp2; + + addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[27] = + half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[21] = temp2; + temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[25] = + half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[22] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[24] = + half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[23] = temp2; +} + +static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out, + const int do_cols, const int bd, + const int out_shift, + const int log_range) { + if (do_cols) { + addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31); + addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30); + addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29); + addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28); + addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27); + addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26); + addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25); + addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24); + addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23); + addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22); + addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21); + addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20); + addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19); + addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18); + addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17); + addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} + static void neg_shift_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0, __m128i *out1, const __m128i *clamp_lo, const __m128i *clamp_hi, @@ -77,14 +342,14 @@ static void neg_shift_sse4_1(const __m128i in0, const __m128i in1, __m128i a0 = _mm_add_epi32(offset, in0); __m128i a1 = _mm_sub_epi32(offset, in1); + a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + a0 = _mm_max_epi32(a0, *clamp_lo); a0 = _mm_min_epi32(a0, *clamp_hi); a1 = _mm_max_epi32(a1, *clamp_lo); a1 = _mm_min_epi32(a1, *clamp_hi); - a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); - a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); - *out0 = a0; *out1 = a1; } @@ -96,9 +361,6 @@ static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3, x, y; @@ -135,11 +397,19 @@ static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { v3 = _mm_add_epi32(v3, rnding); v3 = _mm_srai_epi32(v3, bit); - addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi); - addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi); + if (do_cols) { + addsub_no_clamp_sse4_1(v0, v3, in + 0, in + 3); + addsub_no_clamp_sse4_1(v1, v2, in + 1, in + 2); + } else { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi); + } } -static void iadst4x4_sse4_1(__m128i *in, int bit) { +static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { const int32_t *sinpi = sinpi_arr(bit); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); @@ -197,6 +467,21 @@ static void iadst4x4_sse4_1(__m128i *in, int bit) { u3 = _mm_add_epi32(u3, rnding); u3 = _mm_srai_epi32(u3, bit); + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + u0 = _mm_max_epi32(u0, clamp_lo); + u0 = _mm_min_epi32(u0, clamp_hi); + u1 = _mm_max_epi32(u1, clamp_lo); + u1 = _mm_min_epi32(u1, clamp_hi); + u2 = _mm_max_epi32(u2, clamp_lo); + u2 = _mm_min_epi32(u2, clamp_hi); + u3 = _mm_max_epi32(u3, clamp_lo); + u3 = _mm_min_epi32(u3, clamp_hi); + } + in[0] = u0; in[1] = u1; in[2] = u2; @@ -217,22 +502,6 @@ static INLINE void round_shift_4x4(__m128i *in, int shift) { in[3] = _mm_srai_epi32(in[3], shift); } -static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) { - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - __m128i clamped, mask; - - mask = _mm_cmpgt_epi16(u, max); - clamped = _mm_andnot_si128(mask, u); - mask = _mm_and_si128(mask, max); - clamped = _mm_or_si128(mask, clamped); - mask = _mm_cmpgt_epi16(clamped, zero); - clamped = _mm_and_si128(clamped, mask); - - return clamped; -} - static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, int fliplr, int flipud, int shift, int bd) { const __m128i zero = _mm_setzero_si128(); @@ -304,49 +573,49 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output, case ADST_DCT: load_buffer_4x4(coeff, in); idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case DCT_ADST: load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_ADST: load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case FLIPADST_DCT: load_buffer_4x4(coeff, in); idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; case DCT_FLIPADST: load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_FLIPADST: load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd); break; case ADST_FLIPADST: load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_ADST: load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]); + iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); + iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; default: assert(0); @@ -482,14 +751,19 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col); addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col); } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, - &clamp_lo, &clamp_hi, out_shift); + &clamp_lo_out, &clamp_hi_out, out_shift); addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, - &clamp_lo, &clamp_hi, out_shift); + &clamp_lo_out, &clamp_hi_out, out_shift); addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, - &clamp_lo, &clamp_hi, out_shift); + &clamp_lo_out, &clamp_hi_out, out_shift); addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, - &clamp_lo, &clamp_hi, out_shift); + &clamp_lo_out, &clamp_hi_out, out_shift); } } } @@ -651,14 +925,18 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, out[12] = u[5]; out[14] = _mm_sub_epi32(kZero, u[1]); } else { - neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo, &clamp_hi, + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo, &clamp_hi, + neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out, + &clamp_hi_out, out_shift); } // Odd 8 points: 1, 3, ..., 15 @@ -796,14 +1074,18 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, out[13] = u[5]; out[15] = _mm_sub_epi32(kZero, u[1]); } else { - neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo, &clamp_hi, + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo, &clamp_hi, + neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); } } @@ -976,81 +1258,51 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output, } } -// 16x16 -static void load_buffer_16x16(const int32_t *coeff, __m128i *in) { - int i; - for (i = 0; i < 64; ++i) { - in[i] = _mm_load_si128((const __m128i *)(coeff + (i << 2))); - } -} - -static void assign_8x8_input_from_16x16(const __m128i *in, __m128i *in8x8, - int col) { - int i; - for (i = 0; i < 16; i += 2) { - in8x8[i] = in[col]; - in8x8[i + 1] = in[col + 1]; - col += 4; - } -} - -static void swap_addr(uint16_t **output1, uint16_t **output2) { - uint16_t *tmp; - tmp = *output1; - *output1 = *output2; - *output2 = tmp; -} - -static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride, - int fliplr, int flipud, int shift, int bd) { - __m128i in8x8[16]; - uint16_t *leftUp = &output[0]; - uint16_t *rightUp = &output[8]; - uint16_t *leftDown = &output[8 * stride]; - uint16_t *rightDown = &output[8 * stride + 8]; +static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + __m128i x; - if (fliplr) { - swap_addr(&leftUp, &rightUp); - swap_addr(&leftDown, &rightDown); - } + // stage 0 + // stage 1 + // stage 2 + // stage 3 + x = _mm_mullo_epi32(in[0], cospi32); + x = _mm_add_epi32(x, rnding); + x = _mm_srai_epi32(x, bit); - if (flipud) { - swap_addr(&leftUp, &leftDown); - swap_addr(&rightUp, &rightDown); + // stage 4 + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + x = _mm_add_epi32(x, offset); + x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + x = _mm_max_epi32(x, clamp_lo_out); + x = _mm_min_epi32(x, clamp_hi_out); } - // Left-up quarter - assign_8x8_input_from_16x16(in, in8x8, 0); - write_buffer_8x8(in8x8, leftUp, stride, fliplr, flipud, shift, bd); - - // Right-up quarter - assign_8x8_input_from_16x16(in, in8x8, 2); - write_buffer_8x8(in8x8, rightUp, stride, fliplr, flipud, shift, bd); - - // Left-down quarter - assign_8x8_input_from_16x16(in, in8x8, 32); - write_buffer_8x8(in8x8, leftDown, stride, fliplr, flipud, shift, bd); - - // Right-down quarter - assign_8x8_input_from_16x16(in, in8x8, 34); - write_buffer_8x8(in8x8, rightDown, stride, fliplr, flipud, shift, bd); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; } -static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, - int bd, int out_shift) { +static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); - const __m128i cospi60 = _mm_set1_epi32(cospi[60]); - const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); - const __m128i cospi28 = _mm_set1_epi32(cospi[28]); - const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); - const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi20 = _mm_set1_epi32(cospi[20]); - const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); - const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); - const __m128i cospi52 = _mm_set1_epi32(cospi[52]); - const __m128i cospi36 = _mm_set1_epi32(cospi[36]); - const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); @@ -1059,473 +1311,687 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i u[16], v[16], x, y; - int col; - - for (col = 0; col < 4; ++col) { - // stage 0 - // stage 1 - u[0] = in[0 * 4 + col]; - u[1] = in[8 * 4 + col]; - u[2] = in[4 * 4 + col]; - u[3] = in[12 * 4 + col]; - u[4] = in[2 * 4 + col]; - u[5] = in[10 * 4 + col]; - u[6] = in[6 * 4 + col]; - u[7] = in[14 * 4 + col]; - u[8] = in[1 * 4 + col]; - u[9] = in[9 * 4 + col]; - u[10] = in[5 * 4 + col]; - u[11] = in[13 * 4 + col]; - u[12] = in[3 * 4 + col]; - u[13] = in[11 * 4 + col]; - u[14] = in[7 * 4 + col]; - u[15] = in[15 * 4 + col]; + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; - // stage 2 - v[0] = u[0]; - v[1] = u[1]; - v[2] = u[2]; - v[3] = u[3]; - v[4] = u[4]; - v[5] = u[5]; - v[6] = u[6]; - v[7] = u[7]; + // stage 0 + // stage 1 + // stage 2 + u0 = in[0]; + u1 = in[4]; + u2 = in[2]; + u3 = in[6]; + + x = _mm_mullo_epi32(in[1], cospi56); + y = _mm_mullo_epi32(in[7], cospim8); + u4 = _mm_add_epi32(x, y); + u4 = _mm_add_epi32(u4, rnding); + u4 = _mm_srai_epi32(u4, bit); + + x = _mm_mullo_epi32(in[1], cospi8); + y = _mm_mullo_epi32(in[7], cospi56); + u7 = _mm_add_epi32(x, y); + u7 = _mm_add_epi32(u7, rnding); + u7 = _mm_srai_epi32(u7, bit); + + x = _mm_mullo_epi32(in[5], cospi24); + y = _mm_mullo_epi32(in[3], cospim40); + u5 = _mm_add_epi32(x, y); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + x = _mm_mullo_epi32(in[5], cospi40); + y = _mm_mullo_epi32(in[3], cospi24); + u6 = _mm_add_epi32(x, y); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); - v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); - v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); - v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); - v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); - v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); - v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); - v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); - v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); + // stage 3 + x = _mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u1, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); - // stage 3 - u[0] = v[0]; - u[1] = v[1]; - u[2] = v[2]; - u[3] = v[3]; - u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); - u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); - u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); - u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); - addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); - // stage 4 - x = _mm_mullo_epi32(u[0], cospi32); - y = _mm_mullo_epi32(u[1], cospi32); - v[0] = _mm_add_epi32(x, y); - v[0] = _mm_add_epi32(v[0], rnding); - v[0] = _mm_srai_epi32(v[0], bit); + x = _mm_mullo_epi32(u2, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); - v[1] = _mm_sub_epi32(x, y); - v[1] = _mm_add_epi32(v[1], rnding); - v[1] = _mm_srai_epi32(v[1], bit); + x = _mm_mullo_epi32(u2, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); - v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); - v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); - addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); - v[8] = u[8]; - v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); - v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); - v[11] = u[11]; - v[12] = u[12]; - v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); - v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); - v[15] = u[15]; + addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); - // stage 5 - addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); - u[4] = v[4]; + // stage 4 + addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; + u7 = v7; - x = _mm_mullo_epi32(v[5], cospi32); - y = _mm_mullo_epi32(v[6], cospi32); - u[5] = _mm_sub_epi32(y, x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); + x = _mm_mullo_epi32(v5, cospi32); + y = _mm_mullo_epi32(v6, cospi32); + u6 = _mm_add_epi32(y, x); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); - u[6] = _mm_add_epi32(y, x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); + u5 = _mm_sub_epi32(y, x); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); - u[7] = v[7]; - addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + // stage 5 + if (do_cols) { + addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7); + addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6); + addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5); + addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out, + out_shift); + addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} - // stage 6 - addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); - v[8] = u[8]; - v[9] = u[9]; +static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + __m128i u[8], x; - x = _mm_mullo_epi32(u[10], cospi32); - y = _mm_mullo_epi32(u[13], cospi32); - v[10] = _mm_sub_epi32(y, x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); + // stage 0 + // stage 1 + // stage 2 - v[13] = _mm_add_epi32(x, y); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); - x = _mm_mullo_epi32(u[11], cospi32); - y = _mm_mullo_epi32(u[12], cospi32); - v[11] = _mm_sub_epi32(y, x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(kZero, x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); - v[12] = _mm_add_epi32(x, y); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); + // stage 3 + // stage 4 + __m128i temp1, temp2; + temp1 = _mm_mullo_epi32(u[0], cospi16); + x = _mm_mullo_epi32(u[1], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + u[4] = temp1; + + temp2 = _mm_mullo_epi32(u[0], cospi48); + x = _mm_mullo_epi32(u[1], cospi16); + u[5] = _mm_sub_epi32(temp2, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); - v[14] = u[14]; - v[15] = u[15]; + // stage 5 + // stage 6 + temp1 = _mm_mullo_epi32(u[0], cospi32); + x = _mm_mullo_epi32(u[1], cospi32); + u[2] = _mm_add_epi32(temp1, x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); - // stage 7 - if (do_cols) { - addsub_no_clamp_sse4_1(v[0], v[15], out + 0 * 4 + col, - out + 15 * 4 + col); - addsub_no_clamp_sse4_1(v[1], v[14], out + 1 * 4 + col, - out + 14 * 4 + col); - addsub_no_clamp_sse4_1(v[2], v[13], out + 2 * 4 + col, - out + 13 * 4 + col); - addsub_no_clamp_sse4_1(v[3], v[12], out + 3 * 4 + col, - out + 12 * 4 + col); - addsub_no_clamp_sse4_1(v[4], v[11], out + 4 * 4 + col, - out + 11 * 4 + col); - addsub_no_clamp_sse4_1(v[5], v[10], out + 5 * 4 + col, - out + 10 * 4 + col); - addsub_no_clamp_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col); - addsub_no_clamp_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col); - } else { - addsub_shift_sse4_1(v[0], v[15], out + 0 * 4 + col, out + 15 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_sse4_1(v[1], v[14], out + 1 * 4 + col, out + 14 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_sse4_1(v[2], v[13], out + 2 * 4 + col, out + 13 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_sse4_1(v[3], v[12], out + 3 * 4 + col, out + 12 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_sse4_1(v[4], v[11], out + 4 * 4 + col, out + 11 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_sse4_1(v[5], v[10], out + 5 * 4 + col, out + 10 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - addsub_shift_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col, - &clamp_lo, &clamp_hi, out_shift); - } + u[3] = _mm_sub_epi32(temp1, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + temp1 = _mm_mullo_epi32(u[4], cospi32); + x = _mm_mullo_epi32(u[5], cospi32); + u[6] = _mm_add_epi32(temp1, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(temp1, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); } } -static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, - int bd, int out_shift) { +static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); - const __m128i cospi2 = _mm_set1_epi32(cospi[2]); - const __m128i cospi62 = _mm_set1_epi32(cospi[62]); - const __m128i cospi10 = _mm_set1_epi32(cospi[10]); - const __m128i cospi54 = _mm_set1_epi32(cospi[54]); - const __m128i cospi18 = _mm_set1_epi32(cospi[18]); - const __m128i cospi46 = _mm_set1_epi32(cospi[46]); - const __m128i cospi26 = _mm_set1_epi32(cospi[26]); - const __m128i cospi38 = _mm_set1_epi32(cospi[38]); - const __m128i cospi34 = _mm_set1_epi32(cospi[34]); - const __m128i cospi30 = _mm_set1_epi32(cospi[30]); - const __m128i cospi42 = _mm_set1_epi32(cospi[42]); - const __m128i cospi22 = _mm_set1_epi32(cospi[22]); - const __m128i cospi50 = _mm_set1_epi32(cospi[50]); - const __m128i cospi14 = _mm_set1_epi32(cospi[14]); - const __m128i cospi58 = _mm_set1_epi32(cospi[58]); - const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i u[16], v[16], x, y; - const int col_num = 4; - int col; + __m128i u[8], v[8], x; - // Calculate the column 0, 1, 2, 3 - for (col = 0; col < col_num; ++col) { - // stage 0 - // stage 1 - // stage 2 - v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2); - x = _mm_mullo_epi32(in[0 * col_num + col], cospi62); - v[0] = _mm_add_epi32(v[0], x); - v[0] = _mm_add_epi32(v[0], rnding); - v[0] = _mm_srai_epi32(v[0], bit); + // stage 0 + // stage 1 + // stage 2 - v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62); - x = _mm_mullo_epi32(in[0 * col_num + col], cospi2); - v[1] = _mm_sub_epi32(v[1], x); - v[1] = _mm_add_epi32(v[1], rnding); - v[1] = _mm_srai_epi32(v[1], bit); + u[0] = _mm_mullo_epi32(in[7], cospi4); + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); - v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10); - x = _mm_mullo_epi32(in[2 * col_num + col], cospi54); - v[2] = _mm_add_epi32(v[2], x); - v[2] = _mm_add_epi32(v[2], rnding); - v[2] = _mm_srai_epi32(v[2], bit); + u[1] = _mm_mullo_epi32(in[7], cospi60); + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); - v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54); - x = _mm_mullo_epi32(in[2 * col_num + col], cospi10); - v[3] = _mm_sub_epi32(v[3], x); - v[3] = _mm_add_epi32(v[3], rnding); - v[3] = _mm_srai_epi32(v[3], bit); + // (2) + u[2] = _mm_mullo_epi32(in[5], cospi20); + x = _mm_mullo_epi32(in[2], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); - v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18); - x = _mm_mullo_epi32(in[4 * col_num + col], cospi46); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); + u[3] = _mm_mullo_epi32(in[5], cospi44); + x = _mm_mullo_epi32(in[2], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); - v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46); - x = _mm_mullo_epi32(in[4 * col_num + col], cospi18); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); + // (3) + u[4] = _mm_mullo_epi32(in[3], cospi36); + x = _mm_mullo_epi32(in[4], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); - v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26); - x = _mm_mullo_epi32(in[6 * col_num + col], cospi38); - v[6] = _mm_add_epi32(v[6], x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); + u[5] = _mm_mullo_epi32(in[3], cospi28); + x = _mm_mullo_epi32(in[4], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); - v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38); - x = _mm_mullo_epi32(in[6 * col_num + col], cospi26); - v[7] = _mm_sub_epi32(v[7], x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); + // (4) + u[6] = _mm_mullo_epi32(in[1], cospi52); + x = _mm_mullo_epi32(in[6], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); - v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34); - x = _mm_mullo_epi32(in[8 * col_num + col], cospi30); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = _mm_srai_epi32(v[8], bit); + u[7] = _mm_mullo_epi32(in[1], cospi12); + x = _mm_mullo_epi32(in[6], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30); - x = _mm_mullo_epi32(in[8 * col_num + col], cospi34); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = _mm_srai_epi32(v[9], bit); + // stage 3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); - v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42); - x = _mm_mullo_epi32(in[10 * col_num + col], cospi22); - v[10] = _mm_add_epi32(v[10], x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; - v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22); - x = _mm_mullo_epi32(in[10 * col_num + col], cospi42); - v[11] = _mm_sub_epi32(v[11], x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); - v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50); - x = _mm_mullo_epi32(in[12 * col_num + col], cospi14); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); - v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14); - x = _mm_mullo_epi32(in[12 * col_num + col], cospi50); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); - v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58); - x = _mm_mullo_epi32(in[14 * col_num + col], cospi6); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6); - x = _mm_mullo_epi32(in[14 * col_num + col], cospi58); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + // stage 5 + addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); - // stage 3 - addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; - // stage 4 - v[0] = u[0]; - v[1] = u[1]; - v[2] = u[2]; - v[3] = u[3]; - v[4] = u[4]; - v[5] = u[5]; - v[6] = u[6]; - v[7] = u[7]; + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); - v[8] = _mm_mullo_epi32(u[8], cospi8); - x = _mm_mullo_epi32(u[9], cospi56); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = _mm_srai_epi32(v[8], bit); + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); - v[9] = _mm_mullo_epi32(u[8], cospi56); - x = _mm_mullo_epi32(u[9], cospi8); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = _mm_srai_epi32(v[9], bit); + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); - v[10] = _mm_mullo_epi32(u[10], cospi40); - x = _mm_mullo_epi32(u[11], cospi24); - v[10] = _mm_add_epi32(v[10], x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - v[11] = _mm_mullo_epi32(u[10], cospi24); - x = _mm_mullo_epi32(u[11], cospi40); - v[11] = _mm_sub_epi32(v[11], x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); - v[12] = _mm_mullo_epi32(u[12], cospim56); - x = _mm_mullo_epi32(u[13], cospi8); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); + neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} - v[13] = _mm_mullo_epi32(u[12], cospi8); - x = _mm_mullo_epi32(u[13], cospim56); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); +static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - v[14] = _mm_mullo_epi32(u[14], cospim24); - x = _mm_mullo_epi32(u[15], cospi40); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); + { + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + in[0] = _mm_mullo_epi32(in[0], cospi32); + in[0] = _mm_add_epi32(in[0], rnding); + in[0] = _mm_srai_epi32(in[0], bit); - v[15] = _mm_mullo_epi32(u[14], cospi40); - x = _mm_mullo_epi32(u[15], cospim24); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + // stage 5 + // stage 6 + // stage 7 + if (do_cols) { + in[0] = _mm_max_epi32(in[0], clamp_lo); + in[0] = _mm_min_epi32(in[0], clamp_hi); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + in[0] = _mm_add_epi32(in[0], offset); + in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift)); + in[0] = _mm_max_epi32(in[0], clamp_lo_out); + in[0] = _mm_min_epi32(in[0], clamp_hi_out); + } + + out[0] = in[0]; + out[1] = in[0]; + out[2] = in[0]; + out[3] = in[0]; + out[4] = in[0]; + out[5] = in[0]; + out[6] = in[0]; + out[7] = in[0]; + out[8] = in[0]; + out[9] = in[0]; + out[10] = in[0]; + out[11] = in[0]; + out[12] = in[0]; + out[13] = in[0]; + out[14] = in[0]; + out[15] = in[0]; + } +} + +static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], x, y; + + { + // stage 0 + // stage 1 + u[0] = in[0]; + u[2] = in[4]; + u[4] = in[2]; + u[6] = in[6]; + u[8] = in[1]; + u[10] = in[5]; + u[12] = in[3]; + u[14] = in[7]; + + // stage 2 + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + + u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); + u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); + + u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); + u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); + + u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + + // stage 3 + u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); + u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit); + u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit); + + addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm_mullo_epi32(u[0], cospi32); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + u[1] = u[0]; + + u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); + + addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); + + x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = x; + y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = y; // stage 5 - addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + x = _mm_mullo_epi32(u[5], cospi32); + y = _mm_mullo_epi32(u[6], cospi32); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); // stage 6 - v[0] = u[0]; - v[1] = u[1]; - v[2] = u[2]; - v[3] = u[3]; + addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); - v[4] = _mm_mullo_epi32(u[4], cospi16); - x = _mm_mullo_epi32(u[5], cospi48); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[13], cospi32); + u[10] = _mm_sub_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); - v[5] = _mm_mullo_epi32(u[4], cospi48); - x = _mm_mullo_epi32(u[5], cospi16); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); + u[13] = _mm_add_epi32(x, y); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); - v[6] = _mm_mullo_epi32(u[6], cospim48); - x = _mm_mullo_epi32(u[7], cospi16); - v[6] = _mm_add_epi32(v[6], x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); + x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(u[12], cospi32); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); - v[7] = _mm_mullo_epi32(u[6], cospi16); - x = _mm_mullo_epi32(u[7], cospim48); - v[7] = _mm_sub_epi32(v[7], x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); + u[12] = _mm_add_epi32(x, y); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + // stage 7 + if (do_cols) { + addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15); + addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14); + addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13); + addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12); + addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11); + addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10); + addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9); + addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} - v[8] = u[8]; - v[9] = u[9]; - v[10] = u[10]; - v[11] = u[11]; +static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i v[16], x, y, temp1, temp2; - v[12] = _mm_mullo_epi32(u[12], cospi16); - x = _mm_mullo_epi32(u[13], cospi48); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); + // Calculate the column 0, 1, 2, 3 + { + // stage 0 + // stage 1 + // stage 2 + x = _mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(x, rnding); + v[0] = _mm_srai_epi32(v[0], bit); - v[13] = _mm_mullo_epi32(u[12], cospi48); - x = _mm_mullo_epi32(u[13], cospi16); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(zero, x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); - v[14] = _mm_mullo_epi32(u[14], cospim48); - x = _mm_mullo_epi32(u[15], cospi16); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); + // stage 3 + v[8] = v[0]; + v[9] = v[1]; - v[15] = _mm_mullo_epi32(u[14], cospi16); - x = _mm_mullo_epi32(u[15], cospim48); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); + // stage 4 + temp1 = _mm_mullo_epi32(v[8], cospi8); + x = _mm_mullo_epi32(v[9], cospi56); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[8], cospi56); + x = _mm_mullo_epi32(v[9], cospi8); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[8] = temp1; + v[9] = temp2; + + // stage 5 + v[4] = v[0]; + v[5] = v[1]; + v[12] = v[8]; + v[13] = v[9]; + + // stage 6 + temp1 = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[4] = temp1; + v[5] = temp2; + + temp1 = _mm_mullo_epi32(v[12], cospi16); + x = _mm_mullo_epi32(v[13], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[12], cospi48); + x = _mm_mullo_epi32(v[13], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[12] = temp1; + v[13] = temp2; // stage 7 - addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + v[2] = v[0]; + v[3] = v[1]; + v[6] = v[4]; + v[7] = v[5]; + v[10] = v[8]; + v[11] = v[9]; + v[14] = v[12]; + v[15] = v[13]; // stage 8 - v[0] = u[0]; - v[1] = u[1]; - - y = _mm_mullo_epi32(u[2], cospi32); - x = _mm_mullo_epi32(u[3], cospi32); + y = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); v[2] = _mm_add_epi32(y, x); v[2] = _mm_add_epi32(v[2], rnding); v[2] = _mm_srai_epi32(v[2], bit); @@ -1534,11 +2000,8 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, v[3] = _mm_add_epi32(v[3], rnding); v[3] = _mm_srai_epi32(v[3], bit); - v[4] = u[4]; - v[5] = u[5]; - - y = _mm_mullo_epi32(u[6], cospi32); - x = _mm_mullo_epi32(u[7], cospi32); + y = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); v[6] = _mm_add_epi32(y, x); v[6] = _mm_add_epi32(v[6], rnding); v[6] = _mm_srai_epi32(v[6], bit); @@ -1547,11 +2010,8 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, v[7] = _mm_add_epi32(v[7], rnding); v[7] = _mm_srai_epi32(v[7], bit); - v[8] = u[8]; - v[9] = u[9]; - - y = _mm_mullo_epi32(u[10], cospi32); - x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(v[10], cospi32); + x = _mm_mullo_epi32(v[11], cospi32); v[10] = _mm_add_epi32(y, x); v[10] = _mm_add_epi32(v[10], rnding); v[10] = _mm_srai_epi32(v[10], bit); @@ -1560,11 +2020,8 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, v[11] = _mm_add_epi32(v[11], rnding); v[11] = _mm_srai_epi32(v[11], bit); - v[12] = u[12]; - v[13] = u[13]; - - y = _mm_mullo_epi32(u[14], cospi32); - x = _mm_mullo_epi32(u[15], cospi32); + y = _mm_mullo_epi32(v[14], cospi32); + x = _mm_mullo_epi32(v[15], cospi32); v[14] = _mm_add_epi32(y, x); v[14] = _mm_add_epi32(v[14], rnding); v[14] = _mm_srai_epi32(v[14], bit); @@ -1575,439 +2032,1904 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, // stage 9 if (do_cols) { - out[0 * col_num + col] = v[0]; - out[1 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[8]); - out[2 * col_num + col] = v[12]; - out[3 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[4]); - out[4 * col_num + col] = v[6]; - out[5 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[14]); - out[6 * col_num + col] = v[10]; - out[7 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[2]); - out[8 * col_num + col] = v[3]; - out[9 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[11]); - out[10 * col_num + col] = v[15]; - out[11 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[7]); - out[12 * col_num + col] = v[5]; - out[13 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[13]); - out[14 * col_num + col] = v[9]; - out[15 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[1]); + out[0] = v[0]; + out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]); + out[6] = v[10]; + out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]); } else { - neg_shift_sse4_1(v[0], v[8], out + 0 * col_num + col, - out + 1 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(v[12], v[4], out + 2 * col_num + col, - out + 3 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(v[6], v[14], out + 4 * col_num + col, - out + 5 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(v[10], v[2], out + 6 * col_num + col, - out + 7 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(v[3], v[11], out + 8 * col_num + col, - out + 9 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(v[15], v[7], out + 10 * col_num + col, - out + 11 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(v[5], v[13], out + 12 * col_num + col, - out + 13 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); - neg_shift_sse4_1(v[9], v[1], out + 14 * col_num + col, - out + 15 * col_num + col, &clamp_lo, &clamp_hi, - out_shift); + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); } } } -void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output, - int stride, TX_TYPE tx_type, int bd) { - __m128i in[64], out[64]; - const int8_t *shift = inv_txfm_shift_ls[TX_16X16]; - const int txw_idx = get_txw_idx(TX_16X16); - const int txh_idx = get_txh_idx(TX_16X16); +static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], x, y; - switch (tx_type) { - case DCT_DCT: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd); - break; - case DCT_ADST: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd); - break; - case ADST_DCT: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd); - break; - case ADST_ADST: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd); - break; - case FLIPADST_DCT: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd); - break; - case DCT_FLIPADST: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd); - break; - case ADST_FLIPADST: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd); - break; - case FLIPADST_FLIPADST: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 1, 1, -shift[1], bd); - break; - case FLIPADST_ADST: - load_buffer_16x16(coeff, in); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd); - break; - default: assert(0); - } -} + // Calculate the column 0, 1, 2, 3 + { + // stage 0 + // stage 1 + // stage 2 + __m128i zero = _mm_setzero_si128(); + x = _mm_mullo_epi32(in[0], cospi62); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + x = _mm_mullo_epi32(in[0], cospi2); + u[1] = _mm_sub_epi32(zero, x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + x = _mm_mullo_epi32(in[2], cospi54); + u[2] = _mm_add_epi32(x, rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + x = _mm_mullo_epi32(in[2], cospi10); + u[3] = _mm_sub_epi32(zero, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + x = _mm_mullo_epi32(in[4], cospi46); + u[4] = _mm_add_epi32(x, rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + x = _mm_mullo_epi32(in[4], cospi18); + u[5] = _mm_sub_epi32(zero, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); -static void load_buffer_64x64_lower_32x32(const int32_t *coeff, __m128i *in) { - int i, j; + x = _mm_mullo_epi32(in[6], cospi38); + u[6] = _mm_add_epi32(x, rnding); + u[6] = _mm_srai_epi32(u[6], bit); - __m128i zero = _mm_setzero_si128(); + x = _mm_mullo_epi32(in[6], cospi26); + u[7] = _mm_sub_epi32(zero, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - for (i = 0; i < 32; ++i) { - for (j = 0; j < 8; ++j) { - in[16 * i + j] = - _mm_loadu_si128((const __m128i *)(coeff + 32 * i + 4 * j)); - in[16 * i + j + 8] = zero; - } - } + u[8] = _mm_mullo_epi32(in[7], cospi34); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); - for (i = 0; i < 512; ++i) in[512 + i] = zero; -} + u[9] = _mm_mullo_epi32(in[7], cospi30); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); -static void transpose_64x64(__m128i *in, __m128i *out, int do_cols) { - int i, j; - for (i = 0; i < (do_cols ? 16 : 8); ++i) { - for (j = 0; j < 8; ++j) { - TRANSPOSE_4X4(in[(4 * i + 0) * 16 + j], in[(4 * i + 1) * 16 + j], - in[(4 * i + 2) * 16 + j], in[(4 * i + 3) * 16 + j], - out[(4 * j + 0) * 16 + i], out[(4 * j + 1) * 16 + i], - out[(4 * j + 2) * 16 + i], out[(4 * j + 3) * 16 + i]); - } - } -} + u[10] = _mm_mullo_epi32(in[5], cospi42); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); -static void assign_16x16_input_from_32x32(const __m128i *in, __m128i *in16x16, - int col) { - int i; - for (i = 0; i < 16 * 16 / 4; i += 4) { - in16x16[i] = in[col]; - in16x16[i + 1] = in[col + 1]; - in16x16[i + 2] = in[col + 2]; - in16x16[i + 3] = in[col + 3]; - col += 8; - } -} + u[11] = _mm_mullo_epi32(in[5], cospi22); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); -static void write_buffer_32x32(__m128i *in, uint16_t *output, int stride, - int fliplr, int flipud, int shift, int bd) { - __m128i in16x16[16 * 16 / 4]; - uint16_t *leftUp = &output[0]; - uint16_t *rightUp = &output[16]; - uint16_t *leftDown = &output[16 * stride]; - uint16_t *rightDown = &output[16 * stride + 16]; + u[12] = _mm_mullo_epi32(in[3], cospi50); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); - if (fliplr) { - swap_addr(&leftUp, &rightUp); - swap_addr(&leftDown, &rightDown); - } + u[13] = _mm_mullo_epi32(in[3], cospi14); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); - if (flipud) { - swap_addr(&leftUp, &leftDown); - swap_addr(&rightUp, &rightDown); - } + u[14] = _mm_mullo_epi32(in[1], cospi58); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); - // Left-up quarter - assign_16x16_input_from_32x32(in, in16x16, 0); - write_buffer_16x16(in16x16, leftUp, stride, fliplr, flipud, shift, bd); + u[15] = _mm_mullo_epi32(in[1], cospi6); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); - // Right-up quarter - assign_16x16_input_from_32x32(in, in16x16, 32 / 2 / 4); - write_buffer_16x16(in16x16, rightUp, stride, fliplr, flipud, shift, bd); + // stage 3 + addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); - // Left-down quarter - assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4); - write_buffer_16x16(in16x16, leftDown, stride, fliplr, flipud, shift, bd); + // stage 4 + y = _mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi56); + u[8] = _mm_mullo_epi32(u[8], cospi8); + u[8] = _mm_add_epi32(u[8], x); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); - // Right-down quarter - assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4 + 32 / 2 / 4); - write_buffer_16x16(in16x16, rightDown, stride, fliplr, flipud, shift, bd); -} + x = _mm_mullo_epi32(u[9], cospi8); + u[9] = _mm_sub_epi32(y, x); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); -static void assign_32x32_input_from_64x64(const __m128i *in, __m128i *in32x32, - int col) { - int i; - for (i = 0; i < 32 * 32 / 4; i += 8) { - in32x32[i] = in[col]; - in32x32[i + 1] = in[col + 1]; - in32x32[i + 2] = in[col + 2]; - in32x32[i + 3] = in[col + 3]; - in32x32[i + 4] = in[col + 4]; - in32x32[i + 5] = in[col + 5]; - in32x32[i + 6] = in[col + 6]; - in32x32[i + 7] = in[col + 7]; - col += 16; - } -} + x = _mm_mullo_epi32(u[11], cospi24); + y = _mm_mullo_epi32(u[10], cospi24); + u[10] = _mm_mullo_epi32(u[10], cospi40); + u[10] = _mm_add_epi32(u[10], x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); -static void write_buffer_64x64(__m128i *in, uint16_t *output, int stride, - int fliplr, int flipud, int shift, int bd) { - __m128i in32x32[32 * 32 / 4]; - uint16_t *leftUp = &output[0]; - uint16_t *rightUp = &output[32]; - uint16_t *leftDown = &output[32 * stride]; - uint16_t *rightDown = &output[32 * stride + 32]; + x = _mm_mullo_epi32(u[11], cospi40); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); - if (fliplr) { - swap_addr(&leftUp, &rightUp); - swap_addr(&leftDown, &rightDown); - } + x = _mm_mullo_epi32(u[13], cospi8); + y = _mm_mullo_epi32(u[12], cospi8); + u[12] = _mm_mullo_epi32(u[12], cospim56); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); - if (flipud) { - swap_addr(&leftUp, &leftDown); - swap_addr(&rightUp, &rightDown); - } + x = _mm_mullo_epi32(u[13], cospim56); + u[13] = _mm_sub_epi32(y, x); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[15], cospi40); + y = _mm_mullo_epi32(u[14], cospi40); + u[14] = _mm_mullo_epi32(u[14], cospim24); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim24); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 5 + addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + x = _mm_mullo_epi32(u[5], cospi48); + y = _mm_mullo_epi32(u[4], cospi48); + u[4] = _mm_mullo_epi32(u[4], cospi16); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + x = _mm_mullo_epi32(u[5], cospi16); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + x = _mm_mullo_epi32(u[7], cospi16); + y = _mm_mullo_epi32(u[6], cospi16); + u[6] = _mm_mullo_epi32(u[6], cospim48); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + x = _mm_mullo_epi32(u[7], cospim48); + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + x = _mm_mullo_epi32(u[13], cospi48); + y = _mm_mullo_epi32(u[12], cospi48); + u[12] = _mm_mullo_epi32(u[12], cospi16); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + x = _mm_mullo_epi32(u[13], cospi16); + u[13] = _mm_sub_epi32(y, x); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[15], cospi16); + y = _mm_mullo_epi32(u[14], cospi16); + u[14] = _mm_mullo_epi32(u[14], cospim48); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim48); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 7 + addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + u[2] = _mm_add_epi32(y, x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(y, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); - // Left-up quarter - assign_32x32_input_from_64x64(in, in32x32, 0); - write_buffer_32x32(in32x32, leftUp, stride, fliplr, flipud, shift, bd); + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); - // Right-up quarter - assign_32x32_input_from_64x64(in, in32x32, 64 / 2 / 4); - write_buffer_32x32(in32x32, rightUp, stride, fliplr, flipud, shift, bd); + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + u[10] = _mm_add_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + u[14] = _mm_add_epi32(y, x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); - // Left-down quarter - assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4); - write_buffer_32x32(in32x32, leftDown, stride, fliplr, flipud, shift, bd); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); - // Right-down quarter - assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4 + 64 / 2 / 4); - write_buffer_32x32(in32x32, rightDown, stride, fliplr, flipud, shift, bd); + // stage 9 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]); + out[2] = u[12]; + out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]); + out[4] = u[6]; + out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]); + out[6] = u[10]; + out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]); + out[8] = u[3]; + out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]); + out[10] = u[15]; + out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]); + out[12] = u[5]; + out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]); + out[14] = u[9]; + out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } } -static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, +static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { - int i, j; const int32_t *cospi = cospi_arr(bit); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - int col; - - const __m128i cospi1 = _mm_set1_epi32(cospi[1]); - const __m128i cospi2 = _mm_set1_epi32(cospi[2]); - const __m128i cospi3 = _mm_set1_epi32(cospi[3]); - const __m128i cospi4 = _mm_set1_epi32(cospi[4]); - const __m128i cospi5 = _mm_set1_epi32(cospi[5]); - const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospi7 = _mm_set1_epi32(cospi[7]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi9 = _mm_set1_epi32(cospi[9]); - const __m128i cospi10 = _mm_set1_epi32(cospi[10]); - const __m128i cospi11 = _mm_set1_epi32(cospi[11]); - const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospi13 = _mm_set1_epi32(cospi[13]); - const __m128i cospi14 = _mm_set1_epi32(cospi[14]); - const __m128i cospi15 = _mm_set1_epi32(cospi[15]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospi17 = _mm_set1_epi32(cospi[17]); - const __m128i cospi18 = _mm_set1_epi32(cospi[18]); - const __m128i cospi19 = _mm_set1_epi32(cospi[19]); - const __m128i cospi20 = _mm_set1_epi32(cospi[20]); - const __m128i cospi21 = _mm_set1_epi32(cospi[21]); - const __m128i cospi22 = _mm_set1_epi32(cospi[22]); - const __m128i cospi23 = _mm_set1_epi32(cospi[23]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospi25 = _mm_set1_epi32(cospi[25]); - const __m128i cospi26 = _mm_set1_epi32(cospi[26]); - const __m128i cospi27 = _mm_set1_epi32(cospi[27]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); - const __m128i cospi29 = _mm_set1_epi32(cospi[29]); - const __m128i cospi30 = _mm_set1_epi32(cospi[30]); - const __m128i cospi31 = _mm_set1_epi32(cospi[31]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospi35 = _mm_set1_epi32(cospi[35]); - const __m128i cospi36 = _mm_set1_epi32(cospi[36]); - const __m128i cospi38 = _mm_set1_epi32(cospi[38]); - const __m128i cospi39 = _mm_set1_epi32(cospi[39]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi43 = _mm_set1_epi32(cospi[43]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi46 = _mm_set1_epi32(cospi[46]); - const __m128i cospi47 = _mm_set1_epi32(cospi[47]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi51 = _mm_set1_epi32(cospi[51]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); const __m128i cospi52 = _mm_set1_epi32(cospi[52]); - const __m128i cospi54 = _mm_set1_epi32(cospi[54]); - const __m128i cospi55 = _mm_set1_epi32(cospi[55]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi59 = _mm_set1_epi32(cospi[59]); - const __m128i cospi60 = _mm_set1_epi32(cospi[60]); - const __m128i cospi62 = _mm_set1_epi32(cospi[62]); - const __m128i cospi63 = _mm_set1_epi32(cospi[63]); - - const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); - const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); - const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); - const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); - const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); - const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); - const __m128i cospim33 = _mm_set1_epi32(-cospi[33]); - const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); - const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); - const __m128i cospim37 = _mm_set1_epi32(-cospi[37]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); - const __m128i cospim41 = _mm_set1_epi32(-cospi[41]); - const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); - const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); - const __m128i cospim45 = _mm_set1_epi32(-cospi[45]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); - const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); - const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); - const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); - const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); - const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); - const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); - - for (col = 0; col < (do_cols ? 64 / 4 : 32 / 4); ++col) { - __m128i u[64], v[64]; + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], v[16], x, y; + { + // stage 0 // stage 1 - u[32] = in[1 * 16 + col]; - u[34] = in[17 * 16 + col]; - u[36] = in[9 * 16 + col]; - u[38] = in[25 * 16 + col]; - u[40] = in[5 * 16 + col]; - u[42] = in[21 * 16 + col]; - u[44] = in[13 * 16 + col]; - u[46] = in[29 * 16 + col]; - u[48] = in[3 * 16 + col]; - u[50] = in[19 * 16 + col]; - u[52] = in[11 * 16 + col]; - u[54] = in[27 * 16 + col]; - u[56] = in[7 * 16 + col]; - u[58] = in[23 * 16 + col]; - u[60] = in[15 * 16 + col]; - u[62] = in[31 * 16 + col]; - - v[16] = in[2 * 16 + col]; - v[18] = in[18 * 16 + col]; - v[20] = in[10 * 16 + col]; - v[22] = in[26 * 16 + col]; - v[24] = in[6 * 16 + col]; - v[26] = in[22 * 16 + col]; - v[28] = in[14 * 16 + col]; - v[30] = in[30 * 16 + col]; - - u[8] = in[4 * 16 + col]; - u[10] = in[20 * 16 + col]; - u[12] = in[12 * 16 + col]; - u[14] = in[28 * 16 + col]; - - v[4] = in[8 * 16 + col]; - v[6] = in[24 * 16 + col]; - - u[0] = in[0 * 16 + col]; - u[2] = in[16 * 16 + col]; + u[0] = in[0]; + u[1] = in[8]; + u[2] = in[4]; + u[3] = in[12]; + u[4] = in[2]; + u[5] = in[10]; + u[6] = in[6]; + u[7] = in[14]; + u[8] = in[1]; + u[9] = in[9]; + u[10] = in[5]; + u[11] = in[13]; + u[12] = in[3]; + u[13] = in[11]; + u[14] = in[7]; + u[15] = in[15]; // stage 2 - v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); - v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit); - v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit); - v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); - v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); - v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit); - v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit); - v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); - v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); - v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit); - v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit); - v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); - v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); - v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit); - v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit); - v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); - v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); - v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit); - v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit); - v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); - v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); - v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit); - v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit); - v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); - v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); - v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit); - v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit); - v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); - v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); - v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit); - v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit); - v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; - // stage 3 - u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit); - u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit); - u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit); + v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); + + // stage 3 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); + u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); + u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); + addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm_mullo_epi32(u[0], cospi32); + y = _mm_mullo_epi32(u[1], cospi32); + v[0] = _mm_add_epi32(x, y); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_sub_epi32(x, y); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); + addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[11] = u[11]; + v[12] = u[12]; + v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + v[15] = u[15]; + + // stage 5 + addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + u[4] = v[4]; + + x = _mm_mullo_epi32(v[5], cospi32); + y = _mm_mullo_epi32(v[6], cospi32); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = v[7]; + addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = u[9]; + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[13], cospi32); + v[10] = _mm_sub_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_add_epi32(x, y); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(u[12], cospi32); + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_add_epi32(x, y); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[14] = u[14]; + v[15] = u[15]; + + // stage 7 + if (do_cols) { + addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15); + addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14); + addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13); + addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12); + addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11); + addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10); + addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9); + addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} + +static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], v[16], x, y; + + // Calculate the column 0, 1, 2, 3 + { + // stage 0 + // stage 1 + // stage 2 + v[0] = _mm_mullo_epi32(in[15], cospi2); + x = _mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(v[0], x); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_mullo_epi32(in[15], cospi62); + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(v[1], x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = _mm_mullo_epi32(in[13], cospi10); + x = _mm_mullo_epi32(in[2], cospi54); + v[2] = _mm_add_epi32(v[2], x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_mullo_epi32(in[13], cospi54); + x = _mm_mullo_epi32(in[2], cospi10); + v[3] = _mm_sub_epi32(v[3], x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = _mm_mullo_epi32(in[11], cospi18); + x = _mm_mullo_epi32(in[4], cospi46); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(in[11], cospi46); + x = _mm_mullo_epi32(in[4], cospi18); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(in[9], cospi26); + x = _mm_mullo_epi32(in[6], cospi38); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(in[9], cospi38); + x = _mm_mullo_epi32(in[6], cospi26); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = _mm_mullo_epi32(in[7], cospi34); + x = _mm_mullo_epi32(in[8], cospi30); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(in[7], cospi30); + x = _mm_mullo_epi32(in[8], cospi34); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(in[5], cospi42); + x = _mm_mullo_epi32(in[10], cospi22); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(in[5], cospi22); + x = _mm_mullo_epi32(in[10], cospi42); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(in[3], cospi50); + x = _mm_mullo_epi32(in[12], cospi14); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(in[3], cospi14); + x = _mm_mullo_epi32(in[12], cospi50); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(in[1], cospi58); + x = _mm_mullo_epi32(in[14], cospi6); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(in[1], cospi6); + x = _mm_mullo_epi32(in[14], cospi58); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 3 + addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm_mullo_epi32(u[8], cospi8); + x = _mm_mullo_epi32(u[9], cospi56); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi8); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(u[10], cospi40); + x = _mm_mullo_epi32(u[11], cospi24); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(u[10], cospi24); + x = _mm_mullo_epi32(u[11], cospi40); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[12], cospim56); + x = _mm_mullo_epi32(u[13], cospi8); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi8); + x = _mm_mullo_epi32(u[13], cospim56); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim24); + x = _mm_mullo_epi32(u[15], cospi40); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi40); + x = _mm_mullo_epi32(u[15], cospim24); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 5 + addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = _mm_mullo_epi32(u[4], cospi16); + x = _mm_mullo_epi32(u[5], cospi48); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(u[4], cospi48); + x = _mm_mullo_epi32(u[5], cospi16); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(u[6], cospim48); + x = _mm_mullo_epi32(u[7], cospi16); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(u[6], cospi16); + x = _mm_mullo_epi32(u[7], cospim48); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm_mullo_epi32(u[12], cospi16); + x = _mm_mullo_epi32(u[13], cospi48); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi48); + x = _mm_mullo_epi32(u[13], cospi16); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim48); + x = _mm_mullo_epi32(u[15], cospi16); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi16); + x = _mm_mullo_epi32(u[15], cospim48); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 7 + addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]); + out[6] = v[10]; + out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} + +static INLINE void idct64_stage8_sse4_1( + __m128i *u, const __m128i *cospim32, const __m128i *cospi32, + const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, + const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + int i; + __m128i temp1, temp2, temp3, temp4; + temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit); + u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit); + u[10] = temp1; + temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit); + u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit); + u[11] = temp2; + + for (i = 16; i < 20; ++i) { + addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); + addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, + clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit); + temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit); + temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit); + temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit); + u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit); + u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit); + u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit); + u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit); + u[36] = temp1; + u[37] = temp2; + u[38] = temp3; + u[39] = temp4; + + temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit); + temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit); + temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit); + temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit); + u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit); + u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit); + u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit); + u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; +} + +static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + int i; + __m128i temp1, temp2, temp3, temp4; + for (i = 0; i < 8; ++i) { + addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit); + u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit); + u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit); + u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit); + u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit); + u[20] = temp1; + u[21] = temp2; + u[22] = temp3; + u[23] = temp4; + for (i = 32; i < 40; i++) { + addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); + } +} + +static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + __m128i temp1, temp2, temp3, temp4; + for (int i = 0; i < 16; i++) { + addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit); + u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit); + u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit); + u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit); + u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; + + temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit); + u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit); + u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit); + u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit); + u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit); + u[44] = temp1; + u[45] = temp2; + u[46] = temp3; + u[47] = temp4; +} + +static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols, + int bd, int out_shift, + const int log_range) { + if (do_cols) { + for (int i = 0; i < 32; i++) { + addsub_no_clamp_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)]); + } + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + for (int i = 0; i < 32; i++) { + addsub_shift_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)], + &clamp_lo_out, &clamp_hi_out, out_shift); + } + } +} + +static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + + { + __m128i x; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit); + + // stage 8 + // stage 9 + // stage 10 + // stage 11 + if (do_cols) { + x = _mm_max_epi32(x, clamp_lo); + x = _mm_min_epi32(x, clamp_hi); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + x = _mm_add_epi32(x, offset); + x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + + x = _mm_max_epi32(x, clamp_lo_out); + x = _mm_min_epi32(x, clamp_hi_out); + } + + out[0] = x; + out[63] = x; + out[1] = x; + out[62] = x; + out[2] = x; + out[61] = x; + out[3] = x; + out[60] = x; + out[4] = x; + out[59] = x; + out[5] = x; + out[58] = x; + out[6] = x; + out[57] = x; + out[7] = x; + out[56] = x; + out[8] = x; + out[55] = x; + out[9] = x; + out[54] = x; + out[10] = x; + out[53] = x; + out[11] = x; + out[52] = x; + out[12] = x; + out[51] = x; + out[13] = x; + out[50] = x; + out[14] = x; + out[49] = x; + out[15] = x; + out[48] = x; + out[16] = x; + out[47] = x; + out[17] = x; + out[46] = x; + out[18] = x; + out[45] = x; + out[19] = x; + out[44] = x; + out[20] = x; + out[43] = x; + out[21] = x; + out[42] = x; + out[22] = x; + out[41] = x; + out[23] = x; + out[40] = x; + out[24] = x; + out[39] = x; + out[25] = x; + out[38] = x; + out[26] = x; + out[37] = x; + out[27] = x; + out[36] = x; + out[28] = x; + out[35] = x; + out[29] = x; + out[34] = x; + out[30] = x; + out[33] = x; + out[31] = x; + out[32] = x; + } +} + +static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + + { + __m128i u[64]; + + // stage 1 + u[0] = in[0]; + u[8] = in[4]; + u[16] = in[2]; + u[24] = in[6]; + u[32] = in[1]; + u[40] = in[5]; + u[48] = in[3]; + u[56] = in[7]; + + // stage 2 + u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[38] = u[39]; + u[41] = u[40]; + u[46] = u[47]; + u[49] = u[48]; + u[54] = u[55]; + u[57] = u[56]; + u[62] = u[63]; + + // stage 4 + __m128i temp1, temp2; + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + u[17] = u[16]; + u[22] = u[23]; + u[25] = u[24]; + u[30] = u[31]; + + temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = temp1; + + temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = temp2; + + temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = temp1; + + temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[46] = temp2; + + // stage 5 + u[9] = u[8]; + u[14] = u[15]; + + temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = temp1; + + temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[22] = temp2; + + u[35] = u[32]; + u[34] = u[33]; + u[36] = u[39]; + u[37] = u[38]; + u[43] = u[40]; + u[42] = u[41]; + u[44] = u[47]; + u[45] = u[46]; + u[51] = u[48]; + u[50] = u[49]; + u[52] = u[55]; + u[53] = u[54]; + u[59] = u[56]; + u[58] = u[57]; + u[60] = u[63]; + u[61] = u[62]; + + // stage 6 + temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[0] = temp1; + + temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = temp2; + u[19] = u[16]; + u[18] = u[17]; + u[20] = u[23]; + u[21] = u[22]; + u[27] = u[24]; + u[26] = u[25]; + u[28] = u[31]; + u[29] = u[30]; + + temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = temp1; + temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[35] = temp2; + temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[36] = temp1; + temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[37] = temp2; + temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = temp1; + temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[43] = temp2; + temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[44] = temp1; + temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[45] = temp2; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + u[11] = u[8]; + u[10] = u[9]; + u[12] = u[15]; + u[13] = u[14]; + + temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = temp1; + temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[19] = temp2; + temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[20] = temp1; + temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[21] = temp2; + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + u[7] = u[0]; + u[6] = u[1]; + u[5] = u[2]; + u[4] = u[3]; + u[9] = u[9]; + + idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range); + } +} + +static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi9 = _mm_set1_epi32(cospi[9]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi11 = _mm_set1_epi32(cospi[11]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi13 = _mm_set1_epi32(cospi[13]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi15 = _mm_set1_epi32(cospi[15]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi51 = _mm_set1_epi32(cospi[51]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi55 = _mm_set1_epi32(cospi[55]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + + { + __m128i u[64]; + __m128i tmp1, tmp2, tmp3, tmp4; + // stage 1 + u[0] = in[0]; + u[32] = in[1]; + u[36] = in[9]; + u[40] = in[5]; + u[44] = in[13]; + u[48] = in[3]; + u[52] = in[11]; + u[56] = in[7]; + u[60] = in[15]; + u[16] = in[2]; + u[20] = in[10]; + u[24] = in[6]; + u[28] = in[14]; + u[4] = in[8]; + u[8] = in[4]; + u[12] = in[12]; + + // stage 2 + u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); + u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); + u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); + u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); + u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); + u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); + u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); + u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit); + u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit); + u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit); + u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit); + u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[34] = u[35]; + u[37] = u[36]; + u[38] = u[39]; + u[41] = u[40]; + u[42] = u[43]; + u[45] = u[44]; + u[46] = u[47]; + u[49] = u[48]; + u[50] = u[51]; + u[53] = u[52]; + u[54] = u[55]; + u[57] = u[56]; + u[58] = u[59]; + u[61] = u[60]; + u[62] = u[63]; + + // stage 4 + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + + u[17] = u[16]; + u[18] = u[19]; + u[21] = u[20]; + u[22] = u[23]; + u[25] = u[24]; + u[26] = u[27]; + u[29] = u[28]; + u[30] = u[31]; + + tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = tmp1; + u[34] = tmp2; + u[37] = tmp3; + u[38] = tmp4; + + tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = tmp1; + u[42] = tmp2; + u[45] = tmp3; + u[46] = tmp4; + + // stage 5 + u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); + + u[9] = u[8]; + u[10] = u[11]; + u[13] = u[12]; + u[14] = u[15]; + + tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit); + u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = tmp1; + u[18] = tmp2; + u[21] = tmp3; + u[22] = tmp4; + + for (i = 32; i < 64; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[0] = tmp1; + u[5] = u[4]; + u[6] = u[7]; + + tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = tmp1; + tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = tmp2; + + for (i = 16; i < 32; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = tmp1; + u[35] = tmp2; + u[36] = tmp3; + u[37] = tmp4; + + tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = tmp1; + u[43] = tmp2; + u[44] = tmp3; + u[45] = tmp4; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit); + u[5] = tmp1; + addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = tmp1; + u[19] = tmp2; + u[20] = tmp3; + u[21] = tmp4; + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); + } + + idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range); + } +} + +static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi9 = _mm_set1_epi32(cospi[9]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi11 = _mm_set1_epi32(cospi[11]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi13 = _mm_set1_epi32(cospi[13]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi15 = _mm_set1_epi32(cospi[15]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi17 = _mm_set1_epi32(cospi[17]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi19 = _mm_set1_epi32(cospi[19]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi21 = _mm_set1_epi32(cospi[21]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi23 = _mm_set1_epi32(cospi[23]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi25 = _mm_set1_epi32(cospi[25]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi27 = _mm_set1_epi32(cospi[27]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi29 = _mm_set1_epi32(cospi[29]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi31 = _mm_set1_epi32(cospi[31]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi35 = _mm_set1_epi32(cospi[35]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi39 = _mm_set1_epi32(cospi[39]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi43 = _mm_set1_epi32(cospi[43]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi47 = _mm_set1_epi32(cospi[47]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi51 = _mm_set1_epi32(cospi[51]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi55 = _mm_set1_epi32(cospi[55]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim33 = _mm_set1_epi32(-cospi[33]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim37 = _mm_set1_epi32(-cospi[37]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim41 = _mm_set1_epi32(-cospi[41]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); + const __m128i cospim45 = _mm_set1_epi32(-cospi[45]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + + { + __m128i u[64], v[64]; + + // stage 1 + u[32] = in[1]; + u[34] = in[17]; + u[36] = in[9]; + u[38] = in[25]; + u[40] = in[5]; + u[42] = in[21]; + u[44] = in[13]; + u[46] = in[29]; + u[48] = in[3]; + u[50] = in[19]; + u[52] = in[11]; + u[54] = in[27]; + u[56] = in[7]; + u[58] = in[23]; + u[60] = in[15]; + u[62] = in[31]; + + v[16] = in[2]; + v[18] = in[18]; + v[20] = in[10]; + v[22] = in[26]; + v[24] = in[6]; + v[26] = in[22]; + v[28] = in[14]; + v[30] = in[30]; + + u[8] = in[4]; + u[10] = in[20]; + u[12] = in[12]; + u[14] = in[28]; + + v[4] = in[8]; + v[6] = in[24]; + + u[0] = in[0]; + u[2] = in[16]; + + // stage 2 + v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit); + v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit); + v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); + v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); + v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit); + v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit); + v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit); + v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit); + v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); + v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); + v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit); + v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit); + v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit); + v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit); + v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); + v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); + v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit); + v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit); + v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit); + v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit); + v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); + v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); + v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit); + v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit); + v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + + // stage 3 + u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit); + u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit); + u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit); u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit); u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit); u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit); @@ -2039,301 +3961,1388 @@ static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); - for (i = 16; i < 32; i += 4) { - addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, - &clamp_hi); - addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, - &clamp_hi); + for (i = 16; i < 32; i += 4) { + addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + + // stage 5 + u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit); + u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit); + u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit); + u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit); + + for (i = 8; i < 16; i += 4) { + addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 16; i < 32; i += 4) { + u[i + 0] = v[i + 0]; + u[i + 3] = v[i + 3]; + } + + u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); + u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); + u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); + u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); + + for (i = 32; i < 64; i += 8) { + addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); + v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + + addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + + for (i = 8; i < 16; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + + for (i = 16; i < 32; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 8) { + v[i + 0] = u[i + 0]; + v[i + 1] = u[i + 1]; + v[i + 6] = u[i + 6]; + v[i + 7] = u[i + 7]; + } + + v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + + // stage 7 + addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + u[4] = v[4]; + u[7] = v[7]; + u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); + + addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + for (i = 16; i < 32; i += 8) { + u[i + 0] = v[i + 0]; + u[i + 1] = v[i + 1]; + u[i + 6] = v[i + 6]; + u[i + 7] = v[i + 7]; + } + + u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); + u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); + u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); + } + + v[8] = u[8]; + v[9] = u[9]; + v[14] = u[14]; + v[15] = u[15]; + + v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); + + for (i = 16; i < 20; ++i) { + addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 36; ++i) { + v[i] = u[i]; + v[i + 12] = u[i + 12]; + v[i + 16] = u[i + 16]; + v[i + 28] = u[i + 28]; + } + + v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); + v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); + v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); + v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); + v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); + v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); + v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); + v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); + v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit); + v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit); + + // stage 9 + for (i = 0; i < 8; ++i) { + addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); + } + + for (i = 16; i < 20; ++i) { + u[i] = v[i]; + u[i + 12] = v[i + 12]; + } + + u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); + u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); + u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); + u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); + u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); + + for (i = 32; i < 40; i++) { + addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); } - for (i = 32; i < 64; i += 4) { - v[i + 0] = u[i + 0]; - v[i + 3] = u[i + 3]; + for (i = 48; i < 56; i++) { + addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); } - v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); - v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); - v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); - v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); - v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); - v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); - v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); - v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); - v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); - v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); - v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); - v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); - v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); - v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); - v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); - v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + // stage 10 + for (i = 0; i < 16; i++) { + addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); + } - // stage 5 - u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit); - u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit); - u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit); - u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit); + for (i = 32; i < 40; i++) v[i] = u[i]; - for (i = 8; i < 16; i += 4) { - addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, - &clamp_hi); - addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, - &clamp_hi); - } + v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); + v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); + v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); + v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); + v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); + v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); + v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); + v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); + v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); + v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); - for (i = 16; i < 32; i += 4) { - u[i + 0] = v[i + 0]; - u[i + 3] = v[i + 3]; + for (i = 56; i < 64; i++) v[i] = u[i]; + + // stage 11 + if (do_cols) { + for (i = 0; i < 32; i++) { + addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)]); + } + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + for (i = 0; i < 32; i++) { + addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], + &clamp_lo_out, &clamp_hi_out, out_shift); + } } + } +} - u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); - u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); - u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); - u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); - u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); - u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); - u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); - u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); +static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1; - for (i = 32; i < 64; i += 8) { - addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, - &clamp_hi); - addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, - &clamp_hi); + // stage 0 + // stage 1 + bf1 = in[0]; - addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, - &clamp_hi); - addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, - &clamp_hi); - } + // stage 2 + // stage 3 + // stage 4 + // stage 5 + bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit); - // stage 6 - v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); - v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); - v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); - v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + // stage 6 + // stage 7 + // stage 8 + // stage 9 + if (do_cols) { + bf1 = _mm_max_epi32(bf1, clamp_lo); + bf1 = _mm_min_epi32(bf1, clamp_hi); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + bf1 = _mm_add_epi32(bf1, offset); + bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift)); + bf1 = _mm_max_epi32(bf1, clamp_lo_out); + bf1 = _mm_min_epi32(bf1, clamp_hi_out); + } + out[0] = bf1; + out[1] = bf1; + out[2] = bf1; + out[3] = bf1; + out[4] = bf1; + out[5] = bf1; + out[6] = bf1; + out[7] = bf1; + out[8] = bf1; + out[9] = bf1; + out[10] = bf1; + out[11] = bf1; + out[12] = bf1; + out[13] = bf1; + out[14] = bf1; + out[15] = bf1; + out[16] = bf1; + out[17] = bf1; + out[18] = bf1; + out[19] = bf1; + out[20] = bf1; + out[21] = bf1; + out[22] = bf1; + out[23] = bf1; + out[24] = bf1; + out[25] = bf1; + out[26] = bf1; + out[27] = bf1; + out[28] = bf1; + out[29] = bf1; + out[30] = bf1; + out[31] = bf1; +} - addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); +static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32]; - for (i = 8; i < 16; i += 4) { - v[i + 0] = u[i + 0]; - v[i + 3] = u[i + 3]; - } + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[4] = in[4]; + bf1[8] = in[2]; + bf1[12] = in[6]; + bf1[16] = in[1]; + bf1[20] = in[5]; + bf1[24] = in[3]; + bf1[28] = in[7]; - v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); - v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); - v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); - v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + // stage 2 + bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); + bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); + bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); - for (i = 16; i < 32; i += 8) { - addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, - &clamp_hi); - addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, - &clamp_hi); + // stage 3 + bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); + + bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); + bf1[17] = bf1[16]; + bf1[18] = bf1[19]; + bf1[21] = bf1[20]; + bf1[22] = bf1[23]; + bf1[25] = bf1[24]; + bf1[26] = bf1[27]; + bf1[29] = bf1[28]; + bf1[30] = bf1[31]; + + // stage 4 : + bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); + + bf1[9] = bf1[8]; + bf1[10] = bf1[11]; + bf1[13] = bf1[12]; + bf1[14] = bf1[15]; + + idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); - addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, - &clamp_hi); - addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, - &clamp_hi); - } + // stage 5 + bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[5] = bf1[4]; + bf1[6] = bf1[7]; - for (i = 32; i < 64; i += 8) { - v[i + 0] = u[i + 0]; - v[i + 1] = u[i + 1]; - v[i + 6] = u[i + 6]; - v[i + 7] = u[i + 7]; - } + idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); - v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); - v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); - v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); - v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); - v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); - v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); - v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); - v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); - v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); - v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); - v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); - v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); - v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); - v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); - v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); - v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + // stage 6 + bf1[3] = bf1[0]; + bf1[2] = bf1[1]; - // stage 7 - addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range); +} + +static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32]; + + // stage 0 + // stage 1 + + bf1[0] = in[0]; + bf1[2] = in[8]; + bf1[4] = in[4]; + bf1[6] = in[12]; + bf1[8] = in[2]; + bf1[10] = in[10]; + bf1[12] = in[6]; + bf1[14] = in[14]; + bf1[16] = in[1]; + bf1[18] = in[9]; + bf1[20] = in[5]; + bf1[22] = in[13]; + bf1[24] = in[3]; + bf1[26] = in[11]; + bf1[28] = in[7]; + bf1[30] = in[15]; + + // stage 2 + bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); + bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit); + bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit); + bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit); + bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit); + bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); + bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit); + bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit); + bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit); + bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit); + bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); + bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit); + bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit); + bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit); + bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit); + bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); + + addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + // stage 4 + bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); + bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit); + bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit); + + addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); + + idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit); + bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit); - u[4] = v[4]; - u[7] = v[7]; - u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); - u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); + addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); - addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); - for (i = 16; i < 32; i += 8) { - u[i + 0] = v[i + 0]; - u[i + 1] = v[i + 1]; - u[i + 6] = v[i + 6]; - u[i + 7] = v[i + 7]; - } + // stage 6 + addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); - u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); - u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); - u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); - u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); - u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); - u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); - u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); - u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); + idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); - for (i = 32; i < 64; i += 16) { - for (j = i; j < i + 4; j++) { - addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, - &clamp_hi); - } - } + // stage 7 + idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); - // stage 8 - for (i = 0; i < 4; ++i) { - addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); - } + // stage 8 + idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); - v[8] = u[8]; - v[9] = u[9]; - v[14] = u[14]; - v[15] = u[15]; + // stage 9 + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range); +} - v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); - v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); - v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); - v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); +static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32], bf0[32]; - for (i = 16; i < 20; ++i) { - addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, - &clamp_hi); - } + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[1] = in[16]; + bf1[2] = in[8]; + bf1[3] = in[24]; + bf1[4] = in[4]; + bf1[5] = in[20]; + bf1[6] = in[12]; + bf1[7] = in[28]; + bf1[8] = in[2]; + bf1[9] = in[18]; + bf1[10] = in[10]; + bf1[11] = in[26]; + bf1[12] = in[6]; + bf1[13] = in[22]; + bf1[14] = in[14]; + bf1[15] = in[30]; + bf1[16] = in[1]; + bf1[17] = in[17]; + bf1[18] = in[9]; + bf1[19] = in[25]; + bf1[20] = in[5]; + bf1[21] = in[21]; + bf1[22] = in[13]; + bf1[23] = in[29]; + bf1[24] = in[3]; + bf1[25] = in[19]; + bf1[26] = in[11]; + bf1[27] = in[27]; + bf1[28] = in[7]; + bf1[29] = in[23]; + bf1[30] = in[15]; + bf1[31] = in[31]; - for (i = 32; i < 36; ++i) { - v[i] = u[i]; - v[i + 12] = u[i + 12]; - v[i + 16] = u[i + 16]; - v[i + 28] = u[i + 28]; - } + // stage 2 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = bf1[4]; + bf0[5] = bf1[5]; + bf0[6] = bf1[6]; + bf0[7] = bf1[7]; + bf0[8] = bf1[8]; + bf0[9] = bf1[9]; + bf0[10] = bf1[10]; + bf0[11] = bf1[11]; + bf0[12] = bf1[12]; + bf0[13] = bf1[13]; + bf0[14] = bf1[14]; + bf0[15] = bf1[15]; + bf0[16] = + half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit); + bf0[17] = + half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); + bf0[31] = + half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); - v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); - v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); - v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); - v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); - v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); - v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); - v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); - v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); - v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); - v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); - v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); - v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); - v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); - v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); - v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit); - v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit); + // stage 3 + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = + half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); + bf1[9] = + half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); + bf1[15] = + half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); + + addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); - // stage 9 - for (i = 0; i < 8; ++i) { - addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); - } + // stage 4 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = + half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); + bf0[5] = + half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit); + bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); + + addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); + + bf0[16] = bf1[16]; + bf0[17] = + half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); + bf0[19] = bf1[19]; + bf0[20] = bf1[20]; + bf0[21] = + half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = + half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); + bf0[27] = bf1[27]; + bf0[28] = bf1[28]; + bf0[29] = + half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit); + bf0[31] = bf1[31]; - for (i = 16; i < 20; ++i) { - u[i] = v[i]; - u[i + 12] = v[i + 12]; - } + // stage 5 + bf1[0] = + half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit); + bf1[1] = + half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit); + bf1[2] = + half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit); + bf1[3] = + half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit); + addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = + half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = + half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit); + bf1[15] = bf0[15]; + addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); - u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); - u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); - u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); - u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); - u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); - u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); - u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); - u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); + // stage 6 + addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); + bf0[4] = bf1[4]; + bf0[5] = + half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[7] = bf1[7]; + addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = + half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit); + bf0[22] = bf1[22]; + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = bf1[25]; + bf0[26] = + half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit); + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; - for (i = 32; i < 40; i++) { - addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); - } + // stage 7 + addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = + half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi); + + // stage 8 + addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = bf1[18]; + bf0[19] = bf1[19]; + bf0[20] = + half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[28] = bf1[28]; + bf0[29] = bf1[29]; + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 9 + if (do_cols) { + addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31); + addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30); + addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29); + addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28); + addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27); + addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26); + addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25); + addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24); + addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23); + addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22); + addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21); + addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20); + addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19); + addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18); + addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17); + addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out, + &clamp_hi_out, out_shift); + addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} - for (i = 48; i < 56; i++) { - addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); - } +void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + // Assembly version doesn't support some transform types, so use C version + // for those. + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); + break; + default: + av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + } +} - // stage 10 - for (i = 0; i < 16; i++) { - addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); - } +void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + // Assembly version doesn't support some transform types, so use C version + // for those. + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); + break; + default: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, + txfm_param->tx_size, + txfm_param->eob, bd); + break; + } +} - for (i = 32; i < 40; i++) v[i] = u[i]; +void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + // Assembly version doesn't support some transform types, so use C version + // for those. + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); + break; + default: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, + txfm_param->tx_size, + txfm_param->eob, bd); + break; + } +} - v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); - v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); - v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); - v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); - v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); - v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); - v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); - v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); - v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); - v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); - v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); - v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); - v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); - v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); - v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); - v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); +void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input, + uint8_t *dest, int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + // Assembly version doesn't support some transform types, so use C version + // for those. + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + default: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, + txfm_param->tx_size, + txfm_param->eob, bd); + break; + } +} - for (i = 56; i < 64; i++) v[i] = u[i]; +void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input, + uint8_t *dest, int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + case DCT_DCT: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, + txfm_param->tx_size, + txfm_param->eob, bd); + break; + // Assembly version doesn't support IDTX, so use C version for it. + case IDTX: + av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + default: assert(0); + } +} - // stage 11 - if (do_cols) { - for (i = 0; i < 32; i++) { - addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[16 * (i) + col], - &out[16 * (63 - i) + col]); +void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + int eob = txfm_param->eob; + int bd = txfm_param->bd; + int lossless = txfm_param->lossless; + const int32_t *src = cast_to_int32(input); + const TX_TYPE tx_type = txfm_param->tx_type; + if (lossless) { + assert(tx_type == DCT_DCT); + av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); + return; + } + switch (tx_type) { + // Assembly version doesn't support some transform types, so use C version + // for those. + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + case V_FLIPADST: + case H_FLIPADST: + case IDTX: + av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); + break; + default: + av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + } +} + +static const transform_1d_sse4_1 + highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL }, + { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { + { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1, + NULL }, + { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1, + NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1, + idct32x32_sse4_1 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1, + idct64x64_sse4_1 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64 * 16]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) { + __m128i buf0[64]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1( + buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2); + } + row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]); } } else { - for (i = 0; i < 32; i++) { - addsub_shift_sse4_1(v[i], v[63 - i], &out[16 * (i) + col], - &out[16 * (63 - i) + col], &clamp_lo, &clamp_hi, - out_shift); + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); } } } -} + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } -void av1_inv_txfm2d_add_64x64_sse4_1(const int32_t *coeff, uint16_t *output, - int stride, TX_TYPE tx_type, int bd) { - __m128i in[64 * 64 / 4], out[64 * 64 / 4]; - const int8_t *shift = inv_txfm_shift_ls[TX_64X64]; - const int txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0]; - const int txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0]; + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} +void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { switch (tx_type) { case DCT_DCT: - load_buffer_64x64_lower_32x32(coeff, in); - transpose_64x64(in, out, 0); - idct64x64_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_64x64(in, out, 1); - idct64x64_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_64x64(in, output, stride, 0, 0, -shift[1], bd); + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + highbd_inv_txfm2d_add_no_identity_sse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); break; + default: assert(0); break; + } +} - default: - av1_inv_txfm2d_add_64x64_c(coeff, output, stride, tx_type, bd); +void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_32X32: + av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X16: + av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param); + break; + case TX_8X8: + av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X8: + av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); + break; + case TX_8X16: + av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X8: + av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X32: + av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); + break; + case TX_32X16: + av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); + break; + case TX_32X64: + av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); + break; + case TX_64X32: + av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); + break; + case TX_4X4: + av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); + break; + case TX_4X16: + av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); + break; + case TX_8X32: + av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); + break; + case TX_32X8: + av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); + break; + case TX_64X64: + case TX_16X64: + case TX_64X16: + av1_highbd_inv_txfm2d_add_universe_sse4_1( + input, dest, stride, txfm_param->tx_type, txfm_param->tx_size, + txfm_param->eob, txfm_param->bd); break; + default: assert(0 && "Invalid transform size"); break; } } diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c index 608bd88a4..e298cf653 100644 --- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c +++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c @@ -14,7 +14,6 @@ #include "config/aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/convolve_common_intrin.h" #include "aom_dsp/x86/convolve_sse4_1.h" diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h index b29bd1d79..6f24e5948 100644 --- a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h +++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _HIGHBD_TXFM_UTILITY_SSE4_H -#define _HIGHBD_TXFM_UTILITY_SSE4_H +#ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ +#define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ #include /* SSE4.1 */ @@ -75,6 +75,17 @@ static INLINE void transpose_16x16(const __m128i *in, __m128i *out) { out[63]); } +static INLINE void transpose_32x32(const __m128i *input, __m128i *output) { + for (int j = 0; j < 8; j++) { + for (int i = 0; i < 8; i++) { + TRANSPOSE_4X4(input[i * 32 + j + 0], input[i * 32 + j + 8], + input[i * 32 + j + 16], input[i * 32 + j + 24], + output[j * 32 + i + 0], output[j * 32 + i + 8], + output[j * 32 + i + 16], output[j * 32 + i + 24]); + } + } +} + // Note: // rounding = 1 << (bit - 1) static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0, @@ -100,4 +111,15 @@ static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0, return x; } -#endif // _HIGHBD_TXFM_UTILITY_SSE4_H +typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift); + +typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit, + const int num_cols); + +void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd); + +#endif // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c index a08beaafd..4bcab0564 100644 --- a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c +++ b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c @@ -19,10 +19,21 @@ static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; -static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp, - int sx, int alpha, int k, - const int offset_bits_horiz, - const int reduce_bits_horiz) { +static const uint8_t highbd_shuffle_alpha0_mask0[16] = { + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +}; +static const uint8_t highbd_shuffle_alpha0_mask1[16] = { + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 +}; +static const uint8_t highbd_shuffle_alpha0_mask2[16] = { + 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 +}; +static const uint8_t highbd_shuffle_alpha0_mask3[16] = { + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 +}; + +static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx, + __m128i *coeff) { // Filter even-index pixels const __m128i tmp_0 = _mm_loadu_si128( (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); @@ -43,27 +54,13 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp, const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6 - const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10); // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6 - const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10); // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6 - const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14); // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 - const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - - const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) + - ((1 << reduce_bits_horiz) >> 1)); - - // Calculate filtered results - const __m128i res_0 = _mm_madd_epi16(src, coeff_0); - const __m128i res_2 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2); - const __m128i res_4 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4); - const __m128i res_6 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6); - - __m128i res_even = - _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); - res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), - _mm_cvtsi32_si128(reduce_bits_horiz)); + coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14); // Filter odd-index pixels const __m128i tmp_1 = _mm_loadu_si128( @@ -80,15 +77,63 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp, const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); - const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11); + coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11); + coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15); + coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); +} + +static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0( + int sx, __m128i *coeff) { + // Filter coeff + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + coeff[0] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0)); + coeff[2] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1)); + coeff[4] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2)); + coeff[6] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3)); + + coeff[1] = coeff[0]; + coeff[3] = coeff[2]; + coeff[5] = coeff[4]; + coeff[7] = coeff[6]; +} + +static INLINE void highbd_filter_src_pixels( + const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff, + const int offset_bits_horiz, const int reduce_bits_horiz, int k) { + const __m128i src_1 = *src; + const __m128i src2_1 = *src2; - const __m128i res_1 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1); - const __m128i res_3 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3); - const __m128i res_5 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5); - const __m128i res_7 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7); + const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) + + ((1 << reduce_bits_horiz) >> 1)); + + const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]); + + __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); + + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); @@ -101,6 +146,145 @@ static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp, tmp[k + 7] = _mm_packs_epi32(res_even, res_odd); } +static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2, + __m128i *tmp, int sx, int alpha, int k, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff); + highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); +} + +static INLINE void highbd_warp_horizontal_filter_alpha0_beta0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + (void)alpha; + int k; + + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter_alpha0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)alpha; + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter_beta0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + int k; + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } +} + +static INLINE void highbd_prepare_warp_horizontal_filter( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + if (alpha == 0 && beta == 0) + highbd_warp_horizontal_filter_alpha0_beta0( + ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + + else if (alpha == 0 && beta != 0) + highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + + else if (alpha != 0 && beta == 0) + highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + else + highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); +} + void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, @@ -247,27 +431,13 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi); const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi); - horizontal_filter(src_padded, src2_padded, tmp, sx, alpha, k, - offset_bits_horiz, reduce_bits_horiz); + highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k, + offset_bits_horiz, reduce_bits_horiz); } } else { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - int sx = sx4 + beta * (k + 4); - - // Load source pixels - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - const __m128i src2 = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); - - horizontal_filter(src, src2, tmp, sx, alpha, k, offset_bits_horiz, - reduce_bits_horiz); - } + highbd_prepare_warp_horizontal_filter( + ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); } // Vertical filter diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c index d1ea26290..9f2e2b457 100644 --- a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c +++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c @@ -13,7 +13,6 @@ #include "config/aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/convolve_common_intrin.h" #include "aom_dsp/x86/convolve_sse4_1.h" @@ -21,6 +20,21 @@ #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" +static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) { + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi16(w0); + const __m256i wt1 = _mm256_set1_epi16(w1); + const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + return wt; +} + +static INLINE __m256i load_line2_avx2(const void *a, const void *b) { + return _mm256_permute2x128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)), + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); +} + void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, @@ -34,11 +48,7 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_horiz; const int bits = FILTER_BITS - conv_params->round_1; - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m256i wt0 = _mm256_set1_epi16(w0); - const __m256i wt1 = _mm256_set1_epi16(w1); - const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; const int offset_0 = @@ -68,13 +78,11 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, (void)subpel_y_q4; for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; for (j = 0; j < w; j += 8) { - const __m256i data = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&src_ptr[i * src_stride + j + src_stride]))), - 0x20); + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); __m256i res = convolve_lowbd_x(data, coeffs, filt); @@ -86,13 +94,8 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, // Accumulate values into the destination buffer if (do_average) { - const __m256i data_ref_0 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), - 0x20); - + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); @@ -141,11 +144,7 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m256i round_const = _mm256_set1_epi32((1 << conv_params->round_1) >> 1); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m256i wt0 = _mm256_set1_epi16(w0); - const __m256i wt1 = _mm256_set1_epi16(w1); - const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; const int offset_0 = @@ -172,72 +171,35 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, for (j = 0; j < w; j += 16) { const uint8_t *data = &src_ptr[j]; __m256i src6; - // Load lines a and b. Line a to lower 128, line b to upper 128 - const __m256i src_01a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - 0x20); - - const __m256i src_12a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - 0x20); - - const __m256i src_23a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - 0x20); - - const __m256i src_34a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - 0x20); - - const __m256i src_45a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - 0x20); - - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); - const __m256i src_56a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - src6, 0x20); - - s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); - s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); - s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); - - s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); - s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); - s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); + { + __m256i src_ab[7]; + __m256i src_a[7]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 6; ++kk) { + data += src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + } + src6 = src_a[6]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); + s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); + s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); + } for (i = 0; i < h; i += 2) { - data = &src_ptr[i * src_stride + j]; - const __m256i src_67a = _mm256_permute2x128_si256( - src6, - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), - 0x20); + data = &src_ptr[(i + 7) * src_stride + j]; + const __m256i src7 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); - const __m256i src_78a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), - src6, 0x20); + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); @@ -266,13 +228,8 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, if (w - j < 16) { if (do_average) { - const __m256i data_ref_0 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), - 0x20); - + const __m256i data_ref_0 = load_line2_avx2( + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg); @@ -325,19 +282,12 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, _mm256_add_epi16(res_hi_round, offset_const_2); if (do_average) { - const __m256i data_ref_0_lo = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), - 0x20); - - const __m256i data_ref_0_hi = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j + 8]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]))), - 0x20); + const __m256i data_ref_0_lo = load_line2_avx2( + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); + + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); const __m256i comp_avg_res_lo = comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg); @@ -404,11 +354,7 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m256i wt0 = _mm256_set1_epi16(w0); - const __m256i wt1 = _mm256_set1_epi16(w1); - const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; const int offset_0 = @@ -442,15 +388,14 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, for (j = 0; j < w; j += 8) { /* Horizontal filter */ { + const uint8_t *src_h = src_ptr + j; for (i = 0; i < im_h; i += 2) { - __m256i data = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + __m256i data = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); if (i + 1 < im_h) data = _mm256_inserti128_si256( - data, - _mm_loadu_si128( - (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), - 1); + data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); + src_h += (src_stride << 1); __m256i res = convolve_lowbd_x(data, coeffs_x, filt); res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), @@ -500,13 +445,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); if (do_average) { - const __m256i data_ref_0 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), - 0x20); - + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); @@ -534,12 +475,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); if (do_average) { - const __m256i data_ref_0 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), - 0x20); + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); @@ -598,11 +536,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, const __m128i left_shift = _mm_cvtsi32_si128(bits); const int do_average = conv_params->do_average; const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m256i wt0 = _mm256_set1_epi16(w0); - const __m256i wt1 = _mm256_set1_epi16(w1); - const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + const __m256i wt = unpack_weights_avx2(conv_params); const __m256i zero = _mm256_setzero_si256(); const int offset_0 = @@ -663,13 +597,8 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, // Accumulate values into the destination buffer if (do_average) { - const __m256i data_ref_0 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))), - 0x20); - + const __m256i data_ref_0 = load_line2_avx2( + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c index ffbb31849..f645e0454 100644 --- a/third_party/aom/av1/common/x86/reconinter_avx2.c +++ b/third_party/aom/av1/common/x86/reconinter_avx2.c @@ -16,8 +16,504 @@ #include "aom/aom_integer.h" #include "aom_dsp/blend.h" #include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" #include "av1/common/blockd.h" +static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0, + const __m256i s1) { + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)); + return _mm256_abs_epi16( + _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4))); + // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54) +} +void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int stride0, + const uint8_t *src1, int stride1, + int h, int w) { + const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0; + const __m256i y_mask_base = _mm256_set1_epi16(38 - mb); + int i = 0; + if (4 == w) { + do { + const __m128i s0A = xx_loadl_32(src0); + const __m128i s0B = xx_loadl_32(src0 + stride0); + const __m128i s0C = xx_loadl_32(src0 + stride0 * 2); + const __m128i s0D = xx_loadl_32(src0 + stride0 * 3); + const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); + const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D); + const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD); + const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD); + + const __m128i s1A = xx_loadl_32(src1); + const __m128i s1B = xx_loadl_32(src1 + stride1); + const __m128i s1C = xx_loadl_32(src1 + stride1 * 2); + const __m128i s1D = xx_loadl_32(src1 + stride1 * 3); + const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); + const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D); + const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD); + const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD); + const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + const __m128i x_m8 = + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)); + xx_storeu_128(mask, x_m8); + src0 += (stride0 << 2); + src1 += (stride1 << 2); + mask += 16; + i += 4; + } while (i < h); + } else if (8 == w) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + stride0); + const __m128i s0C = xx_loadl_64(src0 + stride0 * 2); + const __m128i s0D = xx_loadl_64(src0 + stride0 * 3); + const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C)); + const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D)); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + stride1); + const __m128i s1C = xx_loadl_64(src1 + stride1 * 2); + const __m128i s1D = xx_loadl_64(src1 + stride1 * 3); + const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C)); + const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D)); + const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w); + const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w); + const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD); + yy_storeu_256(mask, m8); + src0 += stride0 << 2; + src1 += stride1 << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (16 == w) { + do { + const __m128i s0A = xx_load_128(src0); + const __m128i s0B = xx_load_128(src0 + stride0); + const __m128i s1A = xx_load_128(src1); + const __m128i s1B = xx_load_128(src1 + stride1); + const __m256i s0AL = _mm256_cvtepu8_epi16(s0A); + const __m256i s0BL = _mm256_cvtepu8_epi16(s0B); + const __m256i s1AL = _mm256_cvtepu8_epi16(s1A); + const __m256i s1BL = _mm256_cvtepu8_epi16(s1B); + + const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL); + const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL); + + const __m256i m8 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8); + yy_storeu_256(mask, m8); + src0 += stride0 << 1; + src1 += stride1 << 1; + mask += 32; + i += 2; + } while (i < h); + } else { + do { + int j = 0; + do { + const __m256i s0 = yy_loadu_256(src0 + j); + const __m256i s1 = yy_loadu_256(src1 + j); + const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0)); + const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); + const __m256i s0H = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1)); + const __m256i s1H = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); + const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L); + const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H); + const __m256i m8 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8); + yy_storeu_256(mask + j, m8); + j += 32; + } while (j < w); + src0 += stride0; + src1 += stride1; + mask += w; + i += 1; + } while (i < h); + } +} + +static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0, + const __m256i *data_src1, + const __m256i *round_const, + const __m256i *mask_base_16, + const __m256i *clip_diff, int round) { + const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); + const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); + const __m256i diff = _mm256_max_epu16(diffa, diffb); + const __m256i diff_round = + _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); + const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); + const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); + return diff_clamp; +} + +static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0, + const __m256i *data_src1, + const __m256i *round_const, + const __m256i *mask_base_16, + const __m256i *clip_diff, + int round) { + const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); + const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); + const __m256i diff = _mm256_max_epu16(diffa, diffb); + const __m256i diff_round = + _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); + const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); + const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); + const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp); + return diff_const_16; +} + +static INLINE void build_compound_diffwtd_mask_d16_avx2( + uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { + const int mask_base = 38; + const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); + const __m256i y38 = _mm256_set1_epi16(mask_base); + const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + int i = 0; + if (w == 4) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), + _mm_unpacklo_epi64(s0A, s0B)); + const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), + _mm_unpacklo_epi64(s1A, s1B)); + const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + xx_storeu_128(mask, + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 16; + i += 4; + } while (i < h); + } else if (w == 8) { + do { + const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); + const __m256i s0CD = + yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); + const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); + const __m256i s1CD = + yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); + const __m256i m16AB = + calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); + const __m256i m16CD = + calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (w == 16) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + src0_stride); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + src1_stride); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 1; + src1 += src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else if (w == 32) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 32; + i += 1; + } while (i < h); + } else if (w == 64) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 64; + i += 1; + } while (i < h); + } else { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s0E = yy_loadu_256(src0 + 64); + const __m256i s0F = yy_loadu_256(src0 + 80); + const __m256i s0G = yy_loadu_256(src0 + 96); + const __m256i s0H = yy_loadu_256(src0 + 112); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i s1E = yy_loadu_256(src1 + 64); + const __m256i s1F = yy_loadu_256(src1 + 80); + const __m256i s1G = yy_loadu_256(src1 + 96); + const __m256i s1H = yy_loadu_256(src1 + 112); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m16E = + calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); + const __m256i m16F = + calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); + const __m256i m16G = + calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); + const __m256i m16H = + calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); + const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); + yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 128; + i += 1; + } while (i < h); + } +} + +static INLINE void build_compound_diffwtd_mask_d16_inv_avx2( + uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { + const int mask_base = 38; + const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); + const __m256i y38 = _mm256_set1_epi16(mask_base); + const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + int i = 0; + if (w == 4) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), + _mm_unpacklo_epi64(s0A, s0B)); + const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), + _mm_unpacklo_epi64(s1A, s1B)); + const __m256i m16 = + calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + xx_storeu_128(mask, + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 16; + i += 4; + } while (i < h); + } else if (w == 8) { + do { + const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); + const __m256i s0CD = + yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); + const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); + const __m256i s1CD = + yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); + const __m256i m16AB = + calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); + const __m256i m16CD = + calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (w == 16) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + src0_stride); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + src1_stride); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 1; + src1 += src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else if (w == 32) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 32; + i += 1; + } while (i < h); + } else if (w == 64) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 64; + i += 1; + } while (i < h); + } else { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s0E = yy_loadu_256(src0 + 64); + const __m256i s0F = yy_loadu_256(src0 + 80); + const __m256i s0G = yy_loadu_256(src0 + 96); + const __m256i s0H = yy_loadu_256(src0 + 112); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i s1E = yy_loadu_256(src1 + 64); + const __m256i s1F = yy_loadu_256(src1 + 80); + const __m256i s1G = yy_loadu_256(src1 + 96); + const __m256i s1H = yy_loadu_256(src1 + 112); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m16E = + calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); + const __m256i m16F = + calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); + const __m256i m16G = + calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); + const __m256i m16H = + calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); + const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); + yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 128; + i += 1; + } while (i < h); + } +} + +void av1_build_compound_diffwtd_mask_d16_avx2( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + const int shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + // When rounding constant is added, there is a possibility of overflow. + // However that much precision is not required. Code should very well work for + // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But + // there is a possibility of corner case bugs. + assert(DIFF_FACTOR_LOG2 == 4); + assert(AOM_BLEND_A64_MAX_ALPHA == 64); + + if (mask_type == DIFFWTD_38) { + build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1, + src1_stride, h, w, shift); + } else { + build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1, + src1_stride, h, w, shift); + } +} + void av1_build_compound_diffwtd_mask_highbd_avx2( uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c index 375def62e..0aaf1f454 100644 --- a/third_party/aom/av1/common/x86/selfguided_avx2.c +++ b/third_party/aom/av1/common/x86/selfguided_avx2.c @@ -546,17 +546,18 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, } } -void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height, - int dgd_stride, int32_t *flt0, - int32_t *flt1, int flt_stride, - int sgr_params_idx, int bit_depth, - int highbd) { +int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height, + int dgd_stride, int32_t *flt0, + int32_t *flt1, int flt_stride, + int sgr_params_idx, int bit_depth, + int highbd) { // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl, // Ctl and Dtl is 32-byte aligned. const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3); - DECLARE_ALIGNED(32, int32_t, - buf[4 * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)]); + int32_t *buf = aom_memalign( + 32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)); + if (!buf) return -1; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; @@ -625,6 +626,8 @@ void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height, final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, height, highbd); } + aom_free(buf); + return 0; } void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, @@ -635,8 +638,10 @@ void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); - av1_selfguided_restoration_avx2(dat8, width, height, stride, flt0, flt1, - width, eps, bit_depth, highbd); + const int ret = av1_selfguided_restoration_avx2( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + (void)ret; + assert(!ret); const sgr_params_type *const params = &sgr_params[eps]; int xq[2]; decode_xq(xqd, xq, params); diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c index c64150b9d..ea3f6d942 100644 --- a/third_party/aom/av1/common/x86/selfguided_sse4.c +++ b/third_party/aom/av1/common/x86/selfguided_sse4.c @@ -499,13 +499,15 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, } } -void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, - int height, int dgd_stride, - int32_t *flt0, int32_t *flt1, - int flt_stride, int sgr_params_idx, - int bit_depth, int highbd) { - DECLARE_ALIGNED(16, int32_t, buf[4 * RESTORATION_PROC_UNIT_PELS]); - memset(buf, 0, sizeof(buf)); +int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, + int height, int dgd_stride, int32_t *flt0, + int32_t *flt1, int flt_stride, + int sgr_params_idx, int bit_depth, + int highbd) { + int32_t *buf = (int32_t *)aom_memalign( + 16, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS); + if (!buf) return -1; + memset(buf, 0, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS); const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; @@ -574,6 +576,8 @@ void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, height, highbd); } + aom_free(buf); + return 0; } void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, @@ -584,8 +588,10 @@ void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); - av1_selfguided_restoration_sse4_1(dat8, width, height, stride, flt0, flt1, - width, eps, bit_depth, highbd); + const int ret = av1_selfguided_restoration_sse4_1( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + (void)ret; + assert(!ret); const sgr_params_type *const params = &sgr_params[eps]; int xq[2]; decode_xq(xqd, xq, params); diff --git a/third_party/aom/av1/common/x86/warp_plane_sse4.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c index efc542cbf..b810cea2e 100644 --- a/third_party/aom/av1/common/x86/warp_plane_sse4.c +++ b/third_party/aom/av1/common/x86/warp_plane_sse4.c @@ -203,15 +203,72 @@ static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8, static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, 0 }; -static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, - int alpha, int k, +static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1 }; + +static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3 }; + +static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5, + 4, 5, 4, 5, 4, 5, 4, 5 }; + +static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7, + 6, 7, 6, 7, 6, 7, 6, 7 }; + +static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3 }; +static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7, + 4, 5, 6, 7, 4, 5, 6, 7 }; +static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11, 8, 9, 10, 11 }; +static const uint8_t shuffle_gamma0_mask3[16] = { + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 +}; + +static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff, const int offset_bits_horiz, - const int reduce_bits_horiz) { + const int reduce_bits_horiz, int k) { const __m128i src_even = _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask)); const __m128i src_odd = _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask)); + // The pixel order we need for 'src' is: + // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9 + const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd); + const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]); + // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13 + const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4), + _mm_srli_si128(src_odd, 4)); + const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]); + // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10 + const __m128i src_13 = + _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2)); + const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]); + // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14 + const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4), + _mm_srli_si128(src_even, 6)); + const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]); + + const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) + + ((1 << reduce_bits_horiz) >> 1)); + // Note: The values res_02 + res_46 and res_13 + res_57 both + // fit into int16s at this point, but their sum may be too wide to fit + // into an int16. However, once we also add round_const, the sum of + // all of these fits into a uint16. + // + // The wrapping behaviour of _mm_add_* is used here to make sure we + // get the correct result despite converting between different + // (implicit) types. + const __m128i res_even = _mm_add_epi16(res_02, res_46); + const __m128i res_odd = _mm_add_epi16(res_13, res_57); + const __m128i res = + _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const); + tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz)); +} + +static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, + __m128i *coeff) { // Filter even-index pixels const __m128i tmp_0 = _mm_loadl_epi64( (__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); @@ -249,47 +306,504 @@ static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11); // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 - const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14); + coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14); // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 - const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14); + coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14); // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 - const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15); + coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15); // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 - const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15); + coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15); +} - // The pixel order we need for 'src' is: - // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9 - const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd); - const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02); - // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13 - const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4), - _mm_srli_si128(src_odd, 4)); - const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46); - // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10 - const __m128i src_13 = - _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2)); - const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13); - // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14 - const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4), - _mm_srli_si128(src_even, 6)); - const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57); +static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx, + __m128i *coeff) { + // Filter even-index pixels + const __m128i tmp_0 = + _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); - const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) + - ((1 << reduce_bits_horiz) >> 1)); + // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 + coeff[0] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01)); + // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 + coeff[1] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask23)); + // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 + coeff[2] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45)); + // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 + coeff[3] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67)); +} - // Note: The values res_02 + res_46 and res_13 + res_57 both - // fit into int16s at this point, but their sum may be too wide to fit - // into an int16. However, once we also add round_const, the sum of - // all of these fits into a uint16. - // - // The wrapping behaviour of _mm_add_* is used here to make sure we - // get the correct result despite converting between different - // (implicit) types. - const __m128i res_even = _mm_add_epi16(res_02, res_46); - const __m128i res_odd = _mm_add_epi16(res_13, res_57); - const __m128i res = - _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const); - tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz)); +static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, + int alpha, int k, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + __m128i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx, coeff); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); +} + +static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp, + int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, + int p_height, int height, int i, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } +} + +static INLINE void warp_horizontal_filter_alpha0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)alpha; + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + + __m128i coeff[4]; + prepare_horizontal_filter_coeff_alpha0(sx, coeff); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void warp_horizontal_filter_beta0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + int k; + __m128i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void warp_horizontal_filter_alpha0_beta0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + (void)alpha; + int k; + + __m128i coeff[4]; + prepare_horizontal_filter_coeff_alpha0(sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void unpack_weights_and_set_round_const( + ConvolveParams *conv_params, const int round_bits, const int offset_bits, + __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) { + *res_sub_const = + _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + *wt = _mm_unpacklo_epi16(wt0, wt1); +} + +static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy, + __m128i *coeffs) { + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + // even coeffs + coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10); + coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10); + coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14); + coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + // odd coeffs + coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11); + coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11); + coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15); + coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); +} + +static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy, + __m128i *coeffs) { + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + // even coeffs + coeffs[0] = + _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0)); + coeffs[1] = + _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1)); + coeffs[2] = + _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2)); + coeffs[3] = + _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3)); + + // odd coeffs + coeffs[4] = coeffs[0]; + coeffs[5] = coeffs[1]; + coeffs[6] = coeffs[2]; + coeffs[7] = coeffs[3]; +} + +static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs, + __m128i *res_lo, __m128i *res_hi, + int k) { + // Load from tmp and rearrange pairs of consecutive rows into the + // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]); + const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]); + const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]); + const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]); + + const __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]); + const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]); + const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]); + const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]); + + const __m128i res_odd = + _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + *res_lo = _mm_unpacklo_epi32(res_even, res_odd); + *res_hi = _mm_unpackhi_epi32(res_even, res_odd); +} + +static INLINE void store_vertical_filter_output( + __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const, + const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const, + uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k, + const int reduce_bits_vert, int p_stride, int p_width, + const int round_bits) { + __m128i res_lo_1 = *res_lo; + __m128i res_hi_1 = *res_hi; + + if (conv_params->is_compound) { + __m128i *const p = + (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j]; + res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const), + reduce_bits_vert); + const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1); + __m128i res_lo_16; + if (conv_params->do_average) { + __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + const __m128i p_16 = _mm_loadl_epi64(p); + + if (conv_params->use_jnt_comp_avg) { + const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16); + const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1); + } + + res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const); + + res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const), + round_bits); + __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16); + *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo); + } else { + _mm_storel_epi64(p, temp_lo_16); + } + if (p_width > 4) { + __m128i *const p4 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const), + reduce_bits_vert); + const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1); + __m128i res_hi_16; + + if (conv_params->do_average) { + __m128i *const dst8_4 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + const __m128i p4_16 = _mm_loadl_epi64(p4); + + if (conv_params->use_jnt_comp_avg) { + const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16); + const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1); + } + res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const); + + res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const), + round_bits); + __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16); + *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi); + + } else { + _mm_storel_epi64(p4, temp_hi_16); + } + } + } else { + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); + + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. + if (p_width == 4) { + *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); + } else { + _mm_storel_epi64(p, res_8bit); + } + } +} + +static INLINE void warp_vertical_filter( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs(gamma, sy, coeffs); + + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_gamma0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + int k; + (void)gamma; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0(sy, coeffs); + + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_delta0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + (void)delta; + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs(gamma, sy4, coeffs); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_gamma0_delta0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + (void)delta; + (void)gamma; + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0(sy4, coeffs); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void prepare_warp_vertical_filter( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + if (gamma == 0 && delta == 0) + warp_vertical_filter_gamma0_delta0( + pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, + sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits); + else if (gamma == 0 && delta != 0) + warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); + else if (gamma != 0 && delta == 0) + warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); + else + warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); +} + +static INLINE void prepare_warp_horizontal_filter( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + if (alpha == 0 && beta == 0) + warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + else if (alpha == 0 && beta != 0) + warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); + else if (alpha != 0 && beta == 0) + warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); + else + warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); } void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, @@ -309,24 +823,12 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; - const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert); const __m128i reduce_bits_vert_const = _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const __m128i res_sub_const = - _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - - (1 << (offset_bits - conv_params->round_1 - 1))); - __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); - __m128i round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1)); - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi16(w0); - const __m128i wt1 = _mm_set1_epi16(w1); - const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); /* Note: For this code to work, the left/right frame borders need to be @@ -340,6 +842,13 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); } }*/ + __m128i res_add_const_1; + if (conv_params->is_compound == 1) { + res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const); + } else { + res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + } for (i = 0; i < p_height; i += 8) { for (j = 0; j < p_width; j += 8) { @@ -419,203 +928,15 @@ void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, reduce_bits_horiz); } } else { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - int sx = sx4 + beta * (k + 4); - - // Load source pixels - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, - reduce_bits_horiz); - } + prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); } // Vertical filter - for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { - int sy = sy4 + delta * (k + 4); - - // Load from tmp and rearrange pairs of consecutive rows into the - // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 - const __m128i *src = tmp + (k + 4); - const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); - const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); - const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); - const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); - - // Filter even-index pixels - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); - const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); - const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); - const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); - - const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); - const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); - const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); - const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); - const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); - const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); - const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); - - const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); - const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - if (conv_params->is_compound) { - __m128i *const p = - (__m128i *)&conv_params - ->dst[(i + k + 4) * conv_params->dst_stride + j]; - res_lo = _mm_add_epi32(res_lo, res_add_const); - res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const), - reduce_bits_vert_shift); - const __m128i temp_lo_16 = _mm_packus_epi32(res_lo, res_lo); - __m128i res_lo_16; - if (conv_params->do_average) { - __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; - const __m128i p_16 = _mm_loadl_epi64(p); - - if (conv_params->use_jnt_comp_avg) { - const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16); - const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt); - const __m128i shifted_32 = - _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); - res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32); - } else { - res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1); - } - - res_lo_16 = _mm_add_epi16(res_lo_16, res_sub_const); - - res_lo_16 = _mm_sra_epi16( - _mm_add_epi16(res_lo_16, round_bits_const), round_bits_shift); - __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16); - *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo); - } else { - _mm_storel_epi64(p, temp_lo_16); - } - if (p_width > 4) { - __m128i *const p4 = - (__m128i *)&conv_params - ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; - - res_hi = _mm_add_epi32(res_hi, res_add_const); - res_hi = - _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const), - reduce_bits_vert_shift); - const __m128i temp_hi_16 = _mm_packus_epi32(res_hi, res_hi); - __m128i res_hi_16; - - if (conv_params->do_average) { - __m128i *const dst8_4 = - (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; - const __m128i p4_16 = _mm_loadl_epi64(p4); - - if (conv_params->use_jnt_comp_avg) { - const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16); - const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, wt); - const __m128i shifted_32 = - _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); - res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32); - } else { - res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1); - } - res_hi_16 = _mm_add_epi16(res_hi_16, res_sub_const); - - res_hi_16 = _mm_sra_epi16( - _mm_add_epi16(res_hi_16, round_bits_const), round_bits_shift); - __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16); - *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi); - - } else { - _mm_storel_epi64(p4, temp_hi_16); - } - } - } else { - // Round and pack into 8 bits - const __m128i round_const = - _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + - ((1 << reduce_bits_vert) >> 1)); - - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), reduce_bits_vert); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), reduce_bits_vert); - - const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); - __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); - - // Store, blending with 'pred' if needed - __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; - - // Note: If we're outputting a 4x4 block, we need to be very careful - // to only output 4 pixels at this point, to avoid encode/decode - // mismatches when encoding with multiple threads. - if (p_width == 4) { - *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); - } else { - _mm_storel_epi64(p, res_8bit); - } - } - } + prepare_warp_vertical_filter( + pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, + j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits); } } } diff --git a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c index e1449fd21..87a6e1239 100644 --- a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c +++ b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c @@ -39,7 +39,8 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, DECLARE_ALIGNED(32, uint16_t, temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - int intermediate_height = h + SUBPEL_TAPS - 1; + int intermediate_height = h + SUBPEL_TAPS - 2; + memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); const int center_tap = ((SUBPEL_TAPS - 1) / 2); const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; diff --git a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c index 3083d224b..f9d00b733 100644 --- a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c +++ b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c @@ -32,7 +32,8 @@ void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, DECLARE_ALIGNED(16, uint16_t, temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - int intermediate_height = h + SUBPEL_TAPS - 1; + int intermediate_height = h + SUBPEL_TAPS - 2; + memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); int i, j; const int center_tap = ((SUBPEL_TAPS - 1) / 2); const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; -- cgit v1.2.3