author     trav90 <travawine@palemoon.org>    2018-10-17 05:59:08 -0500
committer  trav90 <travawine@palemoon.org>    2018-10-17 05:59:08 -0500
commit     df9477dfa60ebb5d31bc142e58ce46535c17abce
tree       c4fdd5d1b09d08c0514f208246260fc87372cb56 /third_party/aom/av1/common/x86
parent     0cc51bc106250988cc3b89cb5d743a5af52cd35a
Update aom to slightly newer commit ID
Diffstat (limited to 'third_party/aom/av1/common/x86')
-rw-r--r-- | third_party/aom/av1/common/x86/av1_convolve_ssse3.c       |  15
-rw-r--r-- | third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c      |  28
-rw-r--r-- | third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c      |  26
-rw-r--r-- | third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c |   8
-rw-r--r-- | third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c     | 222
-rw-r--r-- | third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c     | 384
-rw-r--r-- | third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h |  11
-rw-r--r-- | third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c  | 344
-rw-r--r-- | third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c     |  74
-rw-r--r-- | third_party/aom/av1/common/x86/idct_intrin_sse2.c         |   8
-rw-r--r-- | third_party/aom/av1/common/x86/warp_plane_sse2.c          | 369
-rw-r--r-- | third_party/aom/av1/common/x86/warp_plane_ssse3.c         | 508
12 files changed, 1319 insertions, 678 deletions
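Note: two mechanical changes recur throughout the hunks below. First, the per-bit cosine tables are now obtained through the `cospi_arr(bit)` accessor instead of by indexing `cospi_arr[bit - cos_bit_min]`. Second, the half-butterfly helpers (`half_btf_sse4_1`, `half_btf_avx2`) now take their vector operands by `const` pointer instead of by value. The sketch below illustrates only the second change; it is a simplified stand-in, not the actual aom code, and the `half_btf_by_value` / `half_btf_by_pointer` names are placeholders for the old and new signatures.

```c
/* Minimal sketch of the half-butterfly signature change seen in this patch.
 * Compile with SSE4.1 enabled (e.g. gcc -msse4.1). */
#include <smmintrin.h> /* SSE4.1: _mm_mullo_epi32 */
#include <stdio.h>

/* Old style: five __m128i arguments passed by value. */
static inline __m128i half_btf_by_value(__m128i w0, __m128i n0, __m128i w1,
                                        __m128i n1, __m128i rounding, int bit) {
  __m128i x = _mm_mullo_epi32(w0, n0);
  __m128i y = _mm_mullo_epi32(w1, n1);
  x = _mm_add_epi32(x, y);
  x = _mm_add_epi32(x, rounding);
  return _mm_srai_epi32(x, bit);
}

/* New style (as in the patched helpers): operands passed by const pointer. */
static inline __m128i half_btf_by_pointer(const __m128i *w0, const __m128i *n0,
                                          const __m128i *w1, const __m128i *n1,
                                          const __m128i *rounding, int bit) {
  __m128i x = _mm_mullo_epi32(*w0, *n0);
  __m128i y = _mm_mullo_epi32(*w1, *n1);
  x = _mm_add_epi32(x, y);
  x = _mm_add_epi32(x, *rounding);
  return _mm_srai_epi32(x, bit);
}

int main(void) {
  const __m128i w0 = _mm_set1_epi32(181);  /* arbitrary cospi-like weights */
  const __m128i w1 = _mm_set1_epi32(-181);
  const __m128i n0 = _mm_set_epi32(4, 3, 2, 1);
  const __m128i n1 = _mm_set_epi32(8, 7, 6, 5);
  const int bit = 12;
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));

  /* Both variants compute round_shift(w0*n0 + w1*n1, bit) per 32-bit lane. */
  __m128i a = half_btf_by_value(w0, n0, w1, n1, rounding, bit);
  __m128i b = half_btf_by_pointer(&w0, &n0, &w1, &n1, &rounding, bit);

  int ra[4], rb[4];
  _mm_storeu_si128((__m128i *)ra, a);
  _mm_storeu_si128((__m128i *)rb, b);
  for (int i = 0; i < 4; ++i)
    printf("lane %d: by-value %d, by-pointer %d\n", i, ra[i], rb[i]);
  return 0;
}
```

A likely motivation for the by-pointer form (an assumption, not stated in the commit) is that some 32-bit compilers, notably MSVC, refuse to pass more than a few 16-byte-aligned `__m128i`/`__m256i` arguments by value, so pointer parameters keep the helpers portable.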
diff --git a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c index 91102bbaf..5e627ebcf 100644 --- a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c +++ b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c @@ -19,13 +19,13 @@ #define WIDTH_BOUND (16) #define HEIGHT_BOUND (16) -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER DECLARE_ALIGNED(16, static int8_t, sub_pel_filters_12sharp_signal_dir[15][2][16]); DECLARE_ALIGNED(16, static int8_t, sub_pel_filters_12sharp_ver_signal_dir[15][6][16]); -#endif // CONFIG_DUAL_FILTER +#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER #if USE_TEMPORALFILTER_12TAP DECLARE_ALIGNED(16, static int8_t, @@ -39,7 +39,7 @@ typedef int8_t (*SubpelFilterCoeffs)[16]; static INLINE SubpelFilterCoeffs get_subpel_filter_signal_dir(const InterpFilterParams p, int index) { -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER if (p.interp_filter == MULTITAP_SHARP) { return &sub_pel_filters_12sharp_signal_dir[index][0]; } @@ -56,7 +56,7 @@ get_subpel_filter_signal_dir(const InterpFilterParams p, int index) { static INLINE SubpelFilterCoeffs get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) { -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER if (p.interp_filter == MULTITAP_SHARP) { return &sub_pel_filters_12sharp_ver_signal_dir[index][0]; } @@ -143,6 +143,7 @@ static void horiz_w4_ssse3(const uint8_t *src, const __m128i *f, int tapsNum, const __m128i k_256 = _mm_set1_epi16(1 << 8); const __m128i zero = _mm_setzero_si128(); + assert(tapsNum == 10 || tapsNum == 12); if (10 == tapsNum) { src -= 1; } @@ -470,6 +471,7 @@ static void filter_horiz_v8p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, __m128i min_x2x3, max_x2x3; __m128i temp; + assert(tapsNum == 10 || tapsNum == 12); if (tapsNum == 10) { src_ptr -= 1; } @@ -612,6 +614,7 @@ static void filter_horiz_v4p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, __m128i x0, x1, x2, x3, x4, x5; __m128i min_x2x3, max_x2x3, temp; + assert(tapsNum == 10 || tapsNum == 12); if (tapsNum == 10) { src_ptr -= 1; } @@ -982,7 +985,7 @@ typedef struct SimdFilter { int8_t (*simd_vert_filter)[6][16]; } SimdFilter; -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER #define MULTITAP_FILTER_NUM 1 SimdFilter simd_filters[MULTITAP_FILTER_NUM] = { { MULTITAP_SHARP, &sub_pel_filters_12sharp_signal_dir[0], @@ -1010,7 +1013,7 @@ void av1_lowbd_convolve_init_ssse3(void) { temporal_simd_filter.simd_vert_filter); } #endif -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER { int i; for (i = 0; i < MULTITAP_FILTER_NUM; ++i) { diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c index d04b667f1..97d2e74b1 100644 --- a/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c +++ b/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c @@ -57,7 +57,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, // stage 2 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = _mm_add_epi32(buf1[0], buf1[15]); buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]); buf0[1] = _mm_add_epi32(buf1[1], buf1[14]); @@ -94,7 +94,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, // stage 3 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf1[0] = _mm_add_epi32(buf0[0], buf0[7]); buf1[7] = 
_mm_sub_epi32(buf0[0], buf0[7]); buf1[1] = _mm_add_epi32(buf0[1], buf0[6]); @@ -131,7 +131,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, // stage 4 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = _mm_add_epi32(buf1[0], buf1[3]); buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]); buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); @@ -168,7 +168,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, // stage 5 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], bit); btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], @@ -205,7 +205,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, // stage 6 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; @@ -242,7 +242,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, // stage 7 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf1[0] = buf0[0]; buf1[1] = buf0[1]; buf1[2] = buf0[2]; @@ -279,7 +279,7 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output, // stage 8 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; @@ -383,7 +383,7 @@ void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, // stage 2 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1], bit); btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], @@ -399,7 +399,7 @@ void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, // stage 4 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], @@ -475,7 +475,7 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output, // stage 2 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1], bit); btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3], @@ -547,7 +547,7 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output, // stage 4 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; @@ -619,7 +619,7 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output, // stage 6 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; @@ -691,7 +691,7 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output, // stage 8 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; @@ -763,7 +763,7 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output, // stage 10 stage_idx++; bit = cos_bit[stage_idx]; - cospi = cospi_arr[bit - cos_bit_min]; + cospi = cospi_arr(bit); 
buf0[0] = buf1[0]; buf0[1] = buf1[1]; btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c index 78c261374..1d7c55349 100644 --- a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c +++ b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c @@ -37,16 +37,20 @@ static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { } static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, - const int stride, const TXFM_2D_CFG *cfg, + const int stride, + const TXFM_2D_FLIP_CFG *cfg, int32_t *txfm_buf) { - const int txfm_size = cfg->txfm_size; - const int8_t *shift = cfg->shift; - const int8_t *stage_range_col = cfg->stage_range_col; - const int8_t *stage_range_row = cfg->stage_range_row; - const int8_t *cos_bit_col = cfg->cos_bit_col; - const int8_t *cos_bit_row = cfg->cos_bit_row; - const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); - const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + // TODO(sarahparker) must correct for rectangular transforms in follow up + const int txfm_size = cfg->row_cfg->txfm_size; + const int8_t *shift = cfg->row_cfg->shift; + const int8_t *stage_range_col = cfg->col_cfg->stage_range; + const int8_t *stage_range_row = cfg->row_cfg->stage_range; + const int8_t *cos_bit_col = cfg->col_cfg->cos_bit; + const int8_t *cos_bit_row = cfg->row_cfg->cos_bit; + const TxfmFuncSSE2 txfm_func_col = + fwd_txfm_type_to_func(cfg->col_cfg->txfm_type); + const TxfmFuncSSE2 txfm_func_row = + fwd_txfm_type_to_func(cfg->row_cfg->txfm_type); __m128i *buf_128 = (__m128i *)txfm_buf; __m128i *out_128 = (__m128i *)output; @@ -69,7 +73,7 @@ void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32); (void)bd; - fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf); + fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); } void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, @@ -77,5 +81,5 @@ void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]); TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_64x64_cfg(tx_type); (void)bd; - fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf); + fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); } diff --git a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c index cf6249bdc..68461bc36 100644 --- a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c +++ b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c @@ -15,7 +15,7 @@ #include "./av1_rtcd.h" #include "av1/common/filter.h" -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER DECLARE_ALIGNED(16, static int16_t, subpel_filters_sharp[15][6][8]); #endif @@ -31,7 +31,7 @@ typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src, static INLINE HbdSubpelFilterCoeffs hbd_get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) { -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER if (p.interp_filter == MULTITAP_SHARP) { return &subpel_filters_sharp[index][0]; } @@ -76,7 +76,7 @@ void av1_highbd_convolve_init_sse4_1(void) { init_simd_filter(filter_ptr, taps, subpel_temporalfilter); } #endif -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER && 
USE_EXTRA_FILTER { InterpFilterParams filter_params = av1_get_interp_filter_params(MULTITAP_SHARP); @@ -246,6 +246,7 @@ static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f, int tapsNum, uint32_t *buf) { __m128i u[8], v[6]; + assert(tapsNum == 10 || tapsNum == 12); if (tapsNum == 10) { src -= 1; } @@ -412,6 +413,7 @@ static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride, int r = 0; // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case + assert(taps == 10 || taps == 12); if (10 == taps) { i += 1; s[0] = zero; diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c index d10f1ccc2..dd2a681bc 100644 --- a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c @@ -13,7 +13,7 @@ #include "./av1_rtcd.h" #include "./aom_config.h" -#include "av1/common/av1_inv_txfm2d_cfg.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" // Note: // Total 32x4 registers to represent 32x32 block coefficients. @@ -154,20 +154,21 @@ static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride, } } -static INLINE __m256i half_btf_avx2(__m256i w0, __m256i n0, __m256i w1, - __m256i n1, __m256i rounding, int bit) { +static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *w1, const __m256i *n1, + const __m256i *rounding, int bit) { __m256i x, y; - x = _mm256_mullo_epi32(w0, n0); - y = _mm256_mullo_epi32(w1, n1); + x = _mm256_mullo_epi32(*w0, *n0); + y = _mm256_mullo_epi32(*w1, *n1); x = _mm256_add_epi32(x, y); - x = _mm256_add_epi32(x, rounding); + x = _mm256_add_epi32(x, *rounding); x = _mm256_srai_epi32(x, bit); return x; } static void idct32_avx2(__m256i *in, __m256i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); @@ -275,22 +276,38 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf0[13] = bf1[13]; bf0[14] = bf1[14]; bf0[15] = bf1[15]; - bf0[16] = half_btf_avx2(cospi62, bf1[16], cospim2, bf1[31], rounding, bit); - bf0[17] = half_btf_avx2(cospi30, bf1[17], cospim34, bf1[30], rounding, bit); - bf0[18] = half_btf_avx2(cospi46, bf1[18], cospim18, bf1[29], rounding, bit); - bf0[19] = half_btf_avx2(cospi14, bf1[19], cospim50, bf1[28], rounding, bit); - bf0[20] = half_btf_avx2(cospi54, bf1[20], cospim10, bf1[27], rounding, bit); - bf0[21] = half_btf_avx2(cospi22, bf1[21], cospim42, bf1[26], rounding, bit); - bf0[22] = half_btf_avx2(cospi38, bf1[22], cospim26, bf1[25], rounding, bit); - bf0[23] = half_btf_avx2(cospi6, bf1[23], cospim58, bf1[24], rounding, bit); - bf0[24] = half_btf_avx2(cospi58, bf1[23], cospi6, bf1[24], rounding, bit); - bf0[25] = half_btf_avx2(cospi26, bf1[22], cospi38, bf1[25], rounding, bit); - bf0[26] = half_btf_avx2(cospi42, bf1[21], cospi22, bf1[26], rounding, bit); - bf0[27] = half_btf_avx2(cospi10, bf1[20], cospi54, bf1[27], rounding, bit); - bf0[28] = half_btf_avx2(cospi50, bf1[19], cospi14, bf1[28], rounding, bit); - bf0[29] = half_btf_avx2(cospi18, bf1[18], cospi46, bf1[29], rounding, bit); - bf0[30] = half_btf_avx2(cospi34, bf1[17], cospi30, bf1[30], rounding, bit); - bf0[31] = half_btf_avx2(cospi2, bf1[16], cospi62, bf1[31], rounding, bit); + bf0[16] = + half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit); + 
bf0[17] = + half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); + bf0[31] = + half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); // stage 3 bf1[0] = bf0[0]; @@ -301,14 +318,22 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; - bf1[8] = half_btf_avx2(cospi60, bf0[8], cospim4, bf0[15], rounding, bit); - bf1[9] = half_btf_avx2(cospi28, bf0[9], cospim36, bf0[14], rounding, bit); - bf1[10] = half_btf_avx2(cospi44, bf0[10], cospim20, bf0[13], rounding, bit); - bf1[11] = half_btf_avx2(cospi12, bf0[11], cospim52, bf0[12], rounding, bit); - bf1[12] = half_btf_avx2(cospi52, bf0[11], cospi12, bf0[12], rounding, bit); - bf1[13] = half_btf_avx2(cospi20, bf0[10], cospi44, bf0[13], rounding, bit); - bf1[14] = half_btf_avx2(cospi36, bf0[9], cospi28, bf0[14], rounding, bit); - bf1[15] = half_btf_avx2(cospi4, bf0[8], cospi60, bf0[15], rounding, bit); + bf1[8] = + half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); + bf1[9] = + half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); + bf1[15] = + half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); bf1[16] = _mm256_add_epi32(bf0[16], bf0[17]); bf1[17] = _mm256_sub_epi32(bf0[16], bf0[17]); bf1[18] = _mm256_sub_epi32(bf0[19], bf0[18]); @@ -331,10 +356,13 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf0[1] = bf1[1]; bf0[2] = bf1[2]; bf0[3] = bf1[3]; - bf0[4] = half_btf_avx2(cospi56, bf1[4], cospim8, bf1[7], rounding, bit); - bf0[5] = half_btf_avx2(cospi24, bf1[5], cospim40, bf1[6], rounding, bit); - bf0[6] = half_btf_avx2(cospi40, bf1[5], cospi24, bf1[6], rounding, bit); - bf0[7] = half_btf_avx2(cospi8, bf1[4], cospi56, bf1[7], rounding, bit); + bf0[4] = + half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); + bf0[5] = + half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, 
bit); + bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); bf0[8] = _mm256_add_epi32(bf1[8], bf1[9]); bf0[9] = _mm256_sub_epi32(bf1[8], bf1[9]); bf0[10] = _mm256_sub_epi32(bf1[11], bf1[10]); @@ -344,40 +372,54 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf0[14] = _mm256_sub_epi32(bf1[15], bf1[14]); bf0[15] = _mm256_add_epi32(bf1[14], bf1[15]); bf0[16] = bf1[16]; - bf0[17] = half_btf_avx2(cospim8, bf1[17], cospi56, bf1[30], rounding, bit); - bf0[18] = half_btf_avx2(cospim56, bf1[18], cospim8, bf1[29], rounding, bit); + bf0[17] = + half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); bf0[19] = bf1[19]; bf0[20] = bf1[20]; - bf0[21] = half_btf_avx2(cospim40, bf1[21], cospi24, bf1[26], rounding, bit); + bf0[21] = + half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); bf0[22] = - half_btf_avx2(cospim24, bf1[22], cospim40, bf1[25], rounding, bit); + half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); bf0[23] = bf1[23]; bf0[24] = bf1[24]; - bf0[25] = half_btf_avx2(cospim40, bf1[22], cospi24, bf1[25], rounding, bit); - bf0[26] = half_btf_avx2(cospi24, bf1[21], cospi40, bf1[26], rounding, bit); + bf0[25] = + half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); bf0[27] = bf1[27]; bf0[28] = bf1[28]; - bf0[29] = half_btf_avx2(cospim8, bf1[18], cospi56, bf1[29], rounding, bit); - bf0[30] = half_btf_avx2(cospi56, bf1[17], cospi8, bf1[30], rounding, bit); + bf0[29] = + half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit); bf0[31] = bf1[31]; // stage 5 - bf1[0] = half_btf_avx2(cospi32, bf0[0], cospi32, bf0[1], rounding, bit); - bf1[1] = half_btf_avx2(cospi32, bf0[0], cospim32, bf0[1], rounding, bit); - bf1[2] = half_btf_avx2(cospi48, bf0[2], cospim16, bf0[3], rounding, bit); - bf1[3] = half_btf_avx2(cospi16, bf0[2], cospi48, bf0[3], rounding, bit); + bf1[0] = + half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit); + bf1[1] = + half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit); + bf1[2] = + half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit); + bf1[3] = + half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit); bf1[4] = _mm256_add_epi32(bf0[4], bf0[5]); bf1[5] = _mm256_sub_epi32(bf0[4], bf0[5]); bf1[6] = _mm256_sub_epi32(bf0[7], bf0[6]); bf1[7] = _mm256_add_epi32(bf0[6], bf0[7]); bf1[8] = bf0[8]; - bf1[9] = half_btf_avx2(cospim16, bf0[9], cospi48, bf0[14], rounding, bit); + bf1[9] = + half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit); bf1[10] = - half_btf_avx2(cospim48, bf0[10], cospim16, bf0[13], rounding, bit); + half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit); bf1[11] = bf0[11]; bf1[12] = bf0[12]; - bf1[13] = half_btf_avx2(cospim16, bf0[10], cospi48, bf0[13], rounding, bit); - bf1[14] = half_btf_avx2(cospi48, bf0[9], cospi16, bf0[14], rounding, bit); + bf1[13] = + half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit); bf1[15] = bf0[15]; bf1[16] = _mm256_add_epi32(bf0[16], bf0[19]); bf1[17] = _mm256_add_epi32(bf0[17], bf0[18]); @@ -402,8 +444,10 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { 
bf0[2] = _mm256_sub_epi32(bf1[1], bf1[2]); bf0[3] = _mm256_sub_epi32(bf1[0], bf1[3]); bf0[4] = bf1[4]; - bf0[5] = half_btf_avx2(cospim32, bf1[5], cospi32, bf1[6], rounding, bit); - bf0[6] = half_btf_avx2(cospi32, bf1[5], cospi32, bf1[6], rounding, bit); + bf0[5] = + half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); bf0[7] = bf1[7]; bf0[8] = _mm256_add_epi32(bf1[8], bf1[11]); bf0[9] = _mm256_add_epi32(bf1[9], bf1[10]); @@ -415,20 +459,26 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf0[15] = _mm256_add_epi32(bf1[12], bf1[15]); bf0[16] = bf1[16]; bf0[17] = bf1[17]; - bf0[18] = half_btf_avx2(cospim16, bf1[18], cospi48, bf1[29], rounding, bit); - bf0[19] = half_btf_avx2(cospim16, bf1[19], cospi48, bf1[28], rounding, bit); + bf0[18] = + half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit); bf0[20] = - half_btf_avx2(cospim48, bf1[20], cospim16, bf1[27], rounding, bit); + half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit); bf0[21] = - half_btf_avx2(cospim48, bf1[21], cospim16, bf1[26], rounding, bit); + half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit); bf0[22] = bf1[22]; bf0[23] = bf1[23]; bf0[24] = bf1[24]; bf0[25] = bf1[25]; - bf0[26] = half_btf_avx2(cospim16, bf1[21], cospi48, bf1[26], rounding, bit); - bf0[27] = half_btf_avx2(cospim16, bf1[20], cospi48, bf1[27], rounding, bit); - bf0[28] = half_btf_avx2(cospi48, bf1[19], cospi16, bf1[28], rounding, bit); - bf0[29] = half_btf_avx2(cospi48, bf1[18], cospi16, bf1[29], rounding, bit); + bf0[26] = + half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit); bf0[30] = bf1[30]; bf0[31] = bf1[31]; @@ -443,10 +493,14 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf1[7] = _mm256_sub_epi32(bf0[0], bf0[7]); bf1[8] = bf0[8]; bf1[9] = bf0[9]; - bf1[10] = half_btf_avx2(cospim32, bf0[10], cospi32, bf0[13], rounding, bit); - bf1[11] = half_btf_avx2(cospim32, bf0[11], cospi32, bf0[12], rounding, bit); - bf1[12] = half_btf_avx2(cospi32, bf0[11], cospi32, bf0[12], rounding, bit); - bf1[13] = half_btf_avx2(cospi32, bf0[10], cospi32, bf0[13], rounding, bit); + bf1[10] = + half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; bf1[16] = _mm256_add_epi32(bf0[16], bf0[23]); @@ -487,14 +541,22 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { bf0[17] = bf1[17]; bf0[18] = bf1[18]; bf0[19] = bf1[19]; - bf0[20] = half_btf_avx2(cospim32, bf1[20], cospi32, bf1[27], rounding, bit); - bf0[21] = half_btf_avx2(cospim32, bf1[21], cospi32, bf1[26], rounding, bit); - bf0[22] = half_btf_avx2(cospim32, bf1[22], cospi32, bf1[25], rounding, bit); - bf0[23] = half_btf_avx2(cospim32, bf1[23], cospi32, bf1[24], rounding, bit); - bf0[24] = half_btf_avx2(cospi32, bf1[23], cospi32, bf1[24], rounding, bit); - bf0[25] = 
half_btf_avx2(cospi32, bf1[22], cospi32, bf1[25], rounding, bit); - bf0[26] = half_btf_avx2(cospi32, bf1[21], cospi32, bf1[26], rounding, bit); - bf0[27] = half_btf_avx2(cospi32, bf1[20], cospi32, bf1[27], rounding, bit); + bf0[20] = + half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); bf0[28] = bf1[28]; bf0[29] = bf1[29]; bf0[30] = bf1[30]; @@ -539,18 +601,20 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) { void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output, int stride, int tx_type, int bd) { __m256i in[128], out[128]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG *row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &inv_txfm_2d_cfg_dct_dct_32; + row_cfg = &inv_txfm_1d_row_cfg_dct_32; + col_cfg = &inv_txfm_1d_col_cfg_dct_32; load_buffer_32x32(coeff, in); transpose_32x32(in, out); - idct32_avx2(out, in, cfg->cos_bit_row[2]); - round_shift_32x32(in, -cfg->shift[0]); + idct32_avx2(out, in, row_cfg->cos_bit[2]); + round_shift_32x32(in, -row_cfg->shift[0]); transpose_32x32(in, out); - idct32_avx2(out, in, cfg->cos_bit_col[2]); - write_buffer_32x32(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct32_avx2(out, in, col_cfg->cos_bit[2]); + write_buffer_32x32(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; default: assert(0); } diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c index 24b2760b9..a93699f0b 100644 --- a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c @@ -13,7 +13,7 @@ #include "./av1_rtcd.h" #include "./aom_config.h" -#include "av1/common/av1_inv_txfm2d_cfg.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { @@ -24,7 +24,7 @@ static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { } static void idct4x4_sse4_1(__m128i *in, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); @@ -72,7 +72,7 @@ static void idct4x4_sse4_1(__m128i *in, int bit) { } static void iadst4x4_sse4_1(__m128i *in, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); @@ -232,72 +232,82 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output, int stride, int tx_type, int bd) { __m128i in[4]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG 
*row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &inv_txfm_2d_cfg_dct_dct_4; + row_cfg = &inv_txfm_1d_row_cfg_dct_4; + col_cfg = &inv_txfm_1d_col_cfg_dct_4; load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, cfg->cos_bit_row[2]); - idct4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct4x4_sse4_1(in, row_cfg->cos_bit[2]); + idct4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_4; + row_cfg = &inv_txfm_1d_row_cfg_dct_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case DCT_ADST: - cfg = &inv_txfm_2d_cfg_dct_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_dct_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - idct4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + idct4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; #if CONFIG_EXT_TX case FLIPADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_4; + row_cfg = &inv_txfm_1d_row_cfg_dct_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd); + idct4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; case DCT_FLIPADST: - cfg = &inv_txfm_2d_cfg_dct_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_dct_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - idct4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + idct4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case FLIPADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 1, 1, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 1, 1, -row_cfg->shift[1], bd); break; case ADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_4; + row_cfg = 
&inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case FLIPADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; #endif // CONFIG_EXT_TX default: assert(0); @@ -325,7 +335,7 @@ static void load_buffer_8x8(const int32_t *coeff, __m128i *in) { } static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); @@ -439,7 +449,7 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { } static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); @@ -698,90 +708,100 @@ static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride, void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output, int stride, int tx_type, int bd) { __m128i in[16], out[16]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG *row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &inv_txfm_2d_cfg_dct_dct_8; + row_cfg = &inv_txfm_1d_row_cfg_dct_8; + col_cfg = &inv_txfm_1d_col_cfg_dct_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case DCT_ADST: - cfg = &inv_txfm_2d_cfg_dct_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_dct_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_8; + row_cfg = &inv_txfm_1d_row_cfg_dct_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - 
write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; #if CONFIG_EXT_TX case FLIPADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_8; + row_cfg = &inv_txfm_1d_row_cfg_dct_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; case DCT_FLIPADST: - cfg = &inv_txfm_2d_cfg_dct_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_dct_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd); + idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case ADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case FLIPADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 1, 1, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 1, 1, -row_cfg->shift[1], bd); break; case FLIPADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; #endif // CONFIG_EXT_TX 
default: assert(0); @@ -849,7 +869,7 @@ static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride, } static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); @@ -907,24 +927,24 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[6] = u[6]; v[7] = u[7]; - v[8] = half_btf_sse4_1(cospi60, u[8], cospim4, u[15], rnding, bit); - v[9] = half_btf_sse4_1(cospi28, u[9], cospim36, u[14], rnding, bit); - v[10] = half_btf_sse4_1(cospi44, u[10], cospim20, u[13], rnding, bit); - v[11] = half_btf_sse4_1(cospi12, u[11], cospim52, u[12], rnding, bit); - v[12] = half_btf_sse4_1(cospi52, u[11], cospi12, u[12], rnding, bit); - v[13] = half_btf_sse4_1(cospi20, u[10], cospi44, u[13], rnding, bit); - v[14] = half_btf_sse4_1(cospi36, u[9], cospi28, u[14], rnding, bit); - v[15] = half_btf_sse4_1(cospi4, u[8], cospi60, u[15], rnding, bit); + v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); // stage 3 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; - u[4] = half_btf_sse4_1(cospi56, v[4], cospim8, v[7], rnding, bit); - u[5] = half_btf_sse4_1(cospi24, v[5], cospim40, v[6], rnding, bit); - u[6] = half_btf_sse4_1(cospi40, v[5], cospi24, v[6], rnding, bit); - u[7] = half_btf_sse4_1(cospi8, v[4], cospi56, v[7], rnding, bit); + u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); + u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); + u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); u[8] = _mm_add_epi32(v[8], v[9]); u[9] = _mm_sub_epi32(v[8], v[9]); u[10] = _mm_sub_epi32(v[11], v[10]); @@ -945,19 +965,19 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[1] = _mm_add_epi32(v[1], rnding); v[1] = _mm_srai_epi32(v[1], bit); - v[2] = half_btf_sse4_1(cospi48, u[2], cospim16, u[3], rnding, bit); - v[3] = half_btf_sse4_1(cospi16, u[2], cospi48, u[3], rnding, bit); + v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); v[4] = _mm_add_epi32(u[4], u[5]); v[5] = _mm_sub_epi32(u[4], u[5]); v[6] = _mm_sub_epi32(u[7], u[6]); v[7] = _mm_add_epi32(u[6], u[7]); v[8] = u[8]; - v[9] = half_btf_sse4_1(cospim16, u[9], cospi48, u[14], rnding, bit); - v[10] = half_btf_sse4_1(cospim48, u[10], cospim16, u[13], rnding, bit); + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); v[11] = u[11]; v[12] = u[12]; - v[13] = half_btf_sse4_1(cospim16, u[10], cospi48, u[13], rnding, bit); - v[14] = half_btf_sse4_1(cospi48, u[9], cospi16, u[14], rnding, bit); + v[13] = 
half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); v[15] = u[15]; // stage 5 @@ -1043,7 +1063,7 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { } static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); @@ -1183,18 +1203,18 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; - v[4] = half_btf_sse4_1(cospi16, u[4], cospi48, u[5], rnding, bit); - v[5] = half_btf_sse4_1(cospi48, u[4], cospim16, u[5], rnding, bit); - v[6] = half_btf_sse4_1(cospim48, u[6], cospi16, u[7], rnding, bit); - v[7] = half_btf_sse4_1(cospi16, u[6], cospi48, u[7], rnding, bit); + v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); v[8] = u[8]; v[9] = u[9]; v[10] = u[10]; v[11] = u[11]; - v[12] = half_btf_sse4_1(cospi16, u[12], cospi48, u[13], rnding, bit); - v[13] = half_btf_sse4_1(cospi48, u[12], cospim16, u[13], rnding, bit); - v[14] = half_btf_sse4_1(cospim48, u[14], cospi16, u[15], rnding, bit); - v[15] = half_btf_sse4_1(cospi16, u[14], cospi48, u[15], rnding, bit); + v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); // stage 5 u[0] = _mm_add_epi32(v[0], v[4]); @@ -1223,14 +1243,14 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[5] = u[5]; v[6] = u[6]; v[7] = u[7]; - v[8] = half_btf_sse4_1(cospi8, u[8], cospi56, u[9], rnding, bit); - v[9] = half_btf_sse4_1(cospi56, u[8], cospim8, u[9], rnding, bit); - v[10] = half_btf_sse4_1(cospi40, u[10], cospi24, u[11], rnding, bit); - v[11] = half_btf_sse4_1(cospi24, u[10], cospim40, u[11], rnding, bit); - v[12] = half_btf_sse4_1(cospim56, u[12], cospi8, u[13], rnding, bit); - v[13] = half_btf_sse4_1(cospi8, u[12], cospi56, u[13], rnding, bit); - v[14] = half_btf_sse4_1(cospim24, u[14], cospi40, u[15], rnding, bit); - v[15] = half_btf_sse4_1(cospi40, u[14], cospi24, u[15], rnding, bit); + v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); // stage 7 u[0] = _mm_add_epi32(v[0], v[8]); @@ -1251,22 +1271,22 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { u[15] = _mm_sub_epi32(v[7], v[15]); // stage 8 - v[0] = half_btf_sse4_1(cospi2, u[0], cospi62, u[1], rnding, bit); - 
v[1] = half_btf_sse4_1(cospi62, u[0], cospim2, u[1], rnding, bit); - v[2] = half_btf_sse4_1(cospi10, u[2], cospi54, u[3], rnding, bit); - v[3] = half_btf_sse4_1(cospi54, u[2], cospim10, u[3], rnding, bit); - v[4] = half_btf_sse4_1(cospi18, u[4], cospi46, u[5], rnding, bit); - v[5] = half_btf_sse4_1(cospi46, u[4], cospim18, u[5], rnding, bit); - v[6] = half_btf_sse4_1(cospi26, u[6], cospi38, u[7], rnding, bit); - v[7] = half_btf_sse4_1(cospi38, u[6], cospim26, u[7], rnding, bit); - v[8] = half_btf_sse4_1(cospi34, u[8], cospi30, u[9], rnding, bit); - v[9] = half_btf_sse4_1(cospi30, u[8], cospim34, u[9], rnding, bit); - v[10] = half_btf_sse4_1(cospi42, u[10], cospi22, u[11], rnding, bit); - v[11] = half_btf_sse4_1(cospi22, u[10], cospim42, u[11], rnding, bit); - v[12] = half_btf_sse4_1(cospi50, u[12], cospi14, u[13], rnding, bit); - v[13] = half_btf_sse4_1(cospi14, u[12], cospim50, u[13], rnding, bit); - v[14] = half_btf_sse4_1(cospi58, u[14], cospi6, u[15], rnding, bit); - v[15] = half_btf_sse4_1(cospi6, u[14], cospim58, u[15], rnding, bit); + v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); + v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); + v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); + v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); + v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); // stage 9 out[0 * 4 + col] = v[1]; @@ -1298,99 +1318,109 @@ static void round_shift_16x16(__m128i *in, int shift) { void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output, int stride, int tx_type, int bd) { __m128i in[64], out[64]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG *row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &inv_txfm_2d_cfg_dct_dct_16; + row_cfg = &inv_txfm_1d_row_cfg_dct_16; + col_cfg = &inv_txfm_1d_col_cfg_dct_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case DCT_ADST: - cfg = &inv_txfm_2d_cfg_dct_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_dct_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - 
round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_16; + row_cfg = &inv_txfm_1d_row_cfg_dct_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; #if CONFIG_EXT_TX case FLIPADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_16; + row_cfg = &inv_txfm_1d_row_cfg_dct_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; case DCT_FLIPADST: - cfg = &inv_txfm_2d_cfg_dct_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_dct_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd); + idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case ADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, 
stride, 1, 0, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case FLIPADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 1, 1, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 1, 1, -row_cfg->shift[1], bd); break; case FLIPADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; #endif default: assert(0); diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h index bc96defe3..fb246674a 100644 --- a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h +++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h @@ -77,14 +77,15 @@ static INLINE void transpose_16x16(const __m128i *in, __m128i *out) { // Note: // rounding = 1 << (bit - 1) -static INLINE __m128i half_btf_sse4_1(__m128i w0, __m128i n0, __m128i w1, - __m128i n1, __m128i rounding, int bit) { +static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0, + const __m128i *w1, const __m128i *n1, + const __m128i *rounding, int bit) { __m128i x, y; - x = _mm_mullo_epi32(w0, n0); - y = _mm_mullo_epi32(w1, n1); + x = _mm_mullo_epi32(*w0, *n0); + y = _mm_mullo_epi32(*w1, *n1); x = _mm_add_epi32(x, y); - x = _mm_add_epi32(x, rounding); + x = _mm_add_epi32(x, *rounding); x = _mm_srai_epi32(x, bit); return x; } diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c index c25db88b7..37e2f61e7 100644 --- a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c +++ b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c @@ -14,16 +14,13 @@ #include "./av1_rtcd.h" #include "av1/common/warped_motion.h" -static const __m128i *const filter = (const __m128i *const)warped_filter; - -/* SSE2 version of the rotzoom/affine warp filter */ -void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width, - int height, int stride, uint16_t *pred, - int p_col, int p_row, int p_width, - int p_height, int p_stride, int subsampling_x, - int subsampling_y, int bd, int ref_frm, - int16_t alpha, int16_t beta, int16_t gamma, - int16_t delta) { +void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int bd, + int 
comp_avg, int16_t alpha, int16_t beta, + int16_t gamma, int16_t delta) { #if HORSHEAR_REDUCE_PREC_BITS >= 5 __m128i tmp[15]; #else @@ -47,23 +44,21 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width, for (j = 0; j < p_width; j += 8) { // (x, y) coordinates of the center of this block in the destination // image - int32_t dst_x = p_col + j + 4; - int32_t dst_y = p_row + i + 4; + const int32_t dst_x = p_col + j + 4; + const int32_t dst_y = p_row + i + 4; int32_t x4, y4, ix4, sx4, iy4, sy4; if (subsampling_x) - x4 = ROUND_POWER_OF_TWO_SIGNED( - mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] + - (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2, - 1); + x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 + + (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; else x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0]; if (subsampling_y) - y4 = ROUND_POWER_OF_TWO_SIGNED( - mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] + - (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2, - 1); + y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 + + (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; else y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1]; @@ -72,71 +67,103 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width, iy4 = y4 >> WARPEDMODEL_PREC_BITS; sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + // Horizontal filter - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - - // If the block is aligned such that, after clamping, every sample - // would be taken from the leftmost/rightmost column, then we can - // skip the expensive horizontal filter. - if (ix4 <= -7) { + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
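For reference (not part of the patch): the hunk above folds every constant term of the filter offsets into sx4 and sy4 once per 8x8 block, so each row later needs only sx4 + beta * (k + 4). A minimal scalar sketch of that folding, reusing the macro names from av1/common/warped_motion.h that the surrounding file already includes; the helper name is illustrative only.

#include <stdint.h>
#include "av1/common/warped_motion.h"  // WARPEDDIFF_PREC_BITS etc., as in this file

// Fold the -4 tap-origin shift, the half-unit rounding, and the filter-table
// offset into the per-block fractional position, then truncate it to
// WARP_PARAM_REDUCE_BITS granularity, exactly as the hunk above does.
static int fold_warp_offsets(int sx4, int16_t alpha, int16_t beta) {
  sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
         (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
  sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
  return sx4;
}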
+ if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - + 1)) + ref[iy * stride] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); - } else if (ix4 >= width + 6) { + (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - + 1)) + ref[iy * stride + (width - 1)] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); - } else { - int sx = sx4 + alpha * (-4) + beta * k + - // Include rounding and offset here - (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + } + } else { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); // Load source pixels - __m128i src = + const __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - __m128i src2 = + const __m128i src2 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); // Filter even-index pixels - __m128i tmp_0 = filter[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_2 = filter[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_4 = filter[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_6 = filter[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]; + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2 - __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6 - __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2 - __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6 - __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6 - __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6 - __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6 - __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 - __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - __m128i 
round_const = - _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1); + const __m128i round_const = + _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) + + ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1)); // Calculate filtered results - __m128i res_0 = _mm_madd_epi16(src, coeff_0); - __m128i res_2 = + const __m128i res_0 = _mm_madd_epi16(src, coeff_0); + const __m128i res_2 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2); - __m128i res_4 = + const __m128i res_4 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4); - __m128i res_6 = + const __m128i res_6 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6); __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), @@ -145,28 +172,36 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width, HORSHEAR_REDUCE_PREC_BITS); // Filter odd-index pixels - __m128i tmp_1 = filter[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_3 = filter[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_5 = filter[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_7 = filter[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]; - - __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); - __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - __m128i res_1 = + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + const __m128i res_1 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1); - __m128i res_3 = + const __m128i res_3 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3); - __m128i res_5 = + const __m128i res_5 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5); - __m128i res_7 = + const __m128i res_7 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), @@ -183,101 +218,118 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width, // Vertical filter for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { - int sy = sy4 + gamma * (-4) + delta * k + - (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + int sy = sy4 + delta * (k + 4); // Load from tmp and rearrange pairs of consecutive rows into the // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 - __m128i *src = tmp + (k + 4); - __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); - __m128i src_2 = 
_mm_unpacklo_epi16(src[2], src[3]); - __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); - __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); // Filter even-index pixels - __m128i tmp_0 = filter[(sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_2 = filter[(sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_4 = filter[(sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_6 = filter[(sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS]; - - __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); - __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); - __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); - __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); - - __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); - __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); - __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); - __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - - __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); - __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); - __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); - __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); // Filter odd-index pixels - __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); - __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); - __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); - __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); - - __m128i tmp_1 = filter[(sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_3 = filter[(sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_5 = filter[(sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS]; - __m128i tmp_7 = filter[(sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS]; - - __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - __m128i coeff_3 
= _mm_unpackhi_epi64(tmp_9, tmp_11); - __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); - __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); - __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); - __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 7 - __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); // Round and pack into 8 bits - __m128i round_const = - _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1); + const __m128i round_const = + _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) + + ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1)); - __m128i res_lo_round = _mm_srai_epi32( + const __m128i res_lo_round = _mm_srai_epi32( _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); - __m128i res_hi_round = _mm_srai_epi32( + const __m128i res_hi_round = _mm_srai_epi32( _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); // Clamp res_16bit to the range [0, 2^bd - 1] - __m128i max_val = _mm_set1_epi16((1 << bd) - 1); - __m128i zero = _mm_setzero_si128(); + const __m128i max_val = _mm_set1_epi16((1 << bd) - 1); + const __m128i zero = _mm_setzero_si128(); res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero); // Store, blending with 'pred' if needed - __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; // Note: If we're outputting a 4x4 block, we need to be very careful // to only output 4 pixels at this point, to avoid encode/decode // mismatches when encoding with multiple threads. 
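For reference (not part of the patch): in the high-bitdepth path the careful 4-pixel output described above amounts to an 8-byte store (4 x uint16_t), optionally averaged with the existing prediction when comp_avg is set. A hedged sketch of just that store step, using the same intrinsics as the code below; the helper name is illustrative.

#include <emmintrin.h>
#include <stdint.h>

// Store exactly 4 high-bitdepth pixels (8 bytes), blending with the current
// prediction first when comp_avg is set; neighbouring columns stay untouched.
static void store4_highbd(uint16_t *p, __m128i res_16bit, int comp_avg) {
  if (comp_avg)
    res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64((const __m128i *)p));
  _mm_storel_epi64((__m128i *)p, res_16bit);
}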
if (p_width == 4) { - if (ref_frm) res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p)); + if (comp_avg) + res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p)); _mm_storel_epi64(p, res_16bit); } else { - if (ref_frm) res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p)); + if (comp_avg) + res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p)); _mm_storeu_si128(p, res_16bit); } } diff --git a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c index efc8d1e24..c69614e42 100644 --- a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c +++ b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c @@ -14,67 +14,9 @@ #include "./aom_config.h" #include "./av1_rtcd.h" -#include "aom_dsp/x86/txfm_common_avx2.h" - -static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) { -#if CONFIG_HIGHBITDEPTH - *in = _mm256_setr_epi16( - (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2], - (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5], - (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8], - (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11], - (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14], - (int16_t)coeff[15]); -#else - *in = _mm256_loadu_si256((const __m256i *)coeff); -#endif -} - -static void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) { - int i = 0; - while (i < 16) { - load_coeff(coeff + (i << 4), &in[i]); - i += 1; - } -} - -static void recon_and_store(const __m256i *res, uint8_t *output) { - const __m128i zero = _mm_setzero_si128(); - __m128i x = _mm_loadu_si128((__m128i const *)output); - __m128i p0 = _mm_unpacklo_epi8(x, zero); - __m128i p1 = _mm_unpackhi_epi8(x, zero); - - p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res)); - p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1)); - x = _mm_packus_epi16(p0, p1); - _mm_storeu_si128((__m128i *)output, x); -} - -#define IDCT_ROUNDING_POS (6) - -static void write_buffer_16x16(__m256i *in, const int stride, uint8_t *output) { - const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1)); - int i = 0; - - while (i < 16) { - in[i] = _mm256_add_epi16(in[i], rounding); - in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS); - recon_and_store(&in[i], output + i * stride); - i += 1; - } -} - -static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1, - const __m256i *c0, const __m256i *c1, - __m256i *b0, __m256i *b1) { - __m256i x0, x1; - x0 = _mm256_unpacklo_epi16(*a0, *a1); - x1 = _mm256_unpackhi_epi16(*a0, *a1); - *b0 = butter_fly(x0, x1, *c0); - *b1 = butter_fly(x0, x1, *c1); -} +#include "aom_dsp/x86/inv_txfm_common_avx2.h" -static void idct16_avx2(__m256i *in) { +void av1_idct16_avx2(__m256i *in) { const __m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64); const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64); const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64); @@ -216,8 +158,8 @@ static void idct16_avx2(__m256i *in) { } static void idct16(__m256i *in) { - mm256_transpose_16x16(in); - idct16_avx2(in); + mm256_transpose_16x16(in, in); + av1_idct16_avx2(in); } static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1, @@ -398,7 +340,7 @@ static void iadst16_avx2(__m256i *in) { } static void iadst16(__m256i *in) { - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); iadst16_avx2(in); } @@ -416,8 +358,8 @@ static void flip_col(uint8_t **dest, int *stride, int rows) { } static void iidtx16(__m256i *in) { - 
mm256_transpose_16x16(in); - txfm_scaling16_avx2(Sqrt2, in); + mm256_transpose_16x16(in, in); + txfm_scaling16_avx2((int16_t)Sqrt2, in); } #endif @@ -503,5 +445,5 @@ void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, #endif // CONFIG_EXT_TX default: assert(0); break; } - write_buffer_16x16(in, stride, dest); + store_buffer_16xN(in, stride, dest, 16); } diff --git a/third_party/aom/av1/common/x86/idct_intrin_sse2.c b/third_party/aom/av1/common/x86/idct_intrin_sse2.c index 522e8988c..d6a598746 100644 --- a/third_party/aom/av1/common/x86/idct_intrin_sse2.c +++ b/third_party/aom/av1/common/x86/idct_intrin_sse2.c @@ -17,14 +17,14 @@ #include "av1/common/enums.h" #if CONFIG_EXT_TX -static INLINE void fliplr_4x4(__m128i in[2]) { +static INLINE void fliplr_4x4(__m128i *in /*in[2]*/) { in[0] = _mm_shufflelo_epi16(in[0], 0x1b); in[0] = _mm_shufflehi_epi16(in[0], 0x1b); in[1] = _mm_shufflelo_epi16(in[1], 0x1b); in[1] = _mm_shufflehi_epi16(in[1], 0x1b); } -static INLINE void fliplr_8x8(__m128i in[8]) { +static INLINE void fliplr_8x8(__m128i *in /*in[8]*/) { in[0] = mm_reverse_epi16(in[0]); in[1] = mm_reverse_epi16(in[1]); in[2] = mm_reverse_epi16(in[2]); @@ -36,7 +36,7 @@ static INLINE void fliplr_8x8(__m128i in[8]) { in[7] = mm_reverse_epi16(in[7]); } -static INLINE void fliplr_16x8(__m128i in[16]) { +static INLINE void fliplr_16x8(__m128i *in /*in[16]*/) { fliplr_8x8(&in[0]); fliplr_8x8(&in[8]); } @@ -356,7 +356,7 @@ static void iidtx8_sse2(__m128i *in) { } static INLINE void iidtx4_sse2(__m128i *in) { - const __m128i v_scale_w = _mm_set1_epi16(Sqrt2); + const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2); const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w); const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w); diff --git a/third_party/aom/av1/common/x86/warp_plane_sse2.c b/third_party/aom/av1/common/x86/warp_plane_sse2.c index 925e4650d..cdc4e8d0f 100644 --- a/third_party/aom/av1/common/x86/warp_plane_sse2.c +++ b/third_party/aom/av1/common/x86/warp_plane_sse2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License @@ -14,17 +14,15 @@ #include "./av1_rtcd.h" #include "av1/common/warped_motion.h" -static const __m128i *const filter = (const __m128i *const)warped_filter; - -/* SSE2 version of the rotzoom/affine warp filter */ -void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height, - int stride, uint8_t *pred, int p_col, int p_row, - int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, int ref_frm, +void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int comp_avg, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { __m128i tmp[15]; int i, j, k; + const int bd = 8; /* Note: For this code to work, the left/right frame borders need to be extended by at least 13 pixels each. 
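For reference (not part of the patch): the border requirement stated above can be verified explicitly. This is the commented-out debug loop that appears later in this diff (in warp_plane_ssse3.c), lifted into a standalone helper with an illustrative name.

#include <assert.h>
#include <stdint.h>

// Check that the 13 pixels beyond each row's left/right edge replicate the
// corresponding edge pixel, as the warp filters assume.
static void check_warp_borders(const uint8_t *ref, int width, int height,
                               int stride) {
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }
}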
By the time we get here, other @@ -42,23 +40,21 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height, for (j = 0; j < p_width; j += 8) { // (x, y) coordinates of the center of this block in the destination // image - int32_t dst_x = p_col + j + 4; - int32_t dst_y = p_row + i + 4; + const int32_t dst_x = p_col + j + 4; + const int32_t dst_y = p_row + i + 4; int32_t x4, y4, ix4, sx4, iy4, sy4; if (subsampling_x) - x4 = ROUND_POWER_OF_TWO_SIGNED( - mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] + - (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2, - 1); + x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 + + (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; else x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0]; if (subsampling_y) - y4 = ROUND_POWER_OF_TWO_SIGNED( - mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] + - (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2, - 1); + y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 + + (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; else y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1]; @@ -67,76 +63,104 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height, iy4 = y4 >> WARPEDMODEL_PREC_BITS; sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + // Horizontal filter - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - - // If the block is aligned such that, after clamping, every sample - // would be taken from the leftmost/rightmost column, then we can - // skip the expensive horizontal filter. - if (ix4 <= -7) { + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
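For reference (not part of the patch): when the shortcut described above applies, every clamped sample in a row is the same pixel, so the 8-tap horizontal filter collapses to that pixel times the tap sum plus the new bias term, pre-shifted by HORSHEAR_REDUCE_PREC_BITS. Assuming the warp filter taps sum to 1 << WARPEDPIXEL_FILTER_BITS (the usual normalization), the broadcast constant used throughout this diff is sketched below; the helper name is illustrative.

#include <emmintrin.h>
#include "av1/common/warped_motion.h"  // WARPEDPIXEL_FILTER_BITS, HORSHEAR_REDUCE_PREC_BITS

// Constant that fills one row of 'tmp' when every sample comes from a single
// clamped column; matches the _mm_set1_epi16 argument in this diff's hunks.
static __m128i constant_col_fill(int pixel, int bd) {
  return _mm_set1_epi16(
      (int16_t)((1 << (bd + WARPEDPIXEL_FILTER_BITS -
                       HORSHEAR_REDUCE_PREC_BITS - 1)) +
                pixel * (1 << (WARPEDPIXEL_FILTER_BITS -
                               HORSHEAR_REDUCE_PREC_BITS))));
}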
+ if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - + 1)) + ref[iy * stride] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); - } else if (ix4 >= width + 6) { + (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - + 1)) + ref[iy * stride + (width - 1)] * - (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); - } else { - int sx = sx4 + alpha * (-4) + beta * k + - // Include rounding and offset here - (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS))); + } + } else { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); // Load source pixels - __m128i zero = _mm_setzero_si128(); - __m128i src = + const __m128i zero = _mm_setzero_si128(); + const __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); // Filter even-index pixels - __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2 - __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6 - __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2 - __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6 - __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6 - __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6 - __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6 - __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 - __m128i 
coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - __m128i round_const = - _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1); + const __m128i round_const = + _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) + + ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1)); // Calculate filtered results - __m128i src_0 = _mm_unpacklo_epi8(src, zero); - __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); - __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero); - __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); - __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero); - __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); - __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero); - __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); + const __m128i src_0 = _mm_unpacklo_epi8(src, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); + const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); + const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); @@ -144,33 +168,37 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height, HORSHEAR_REDUCE_PREC_BITS); // Filter odd-index pixels - __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); - - __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); - __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero); - __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); - __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero); - __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); - __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero); - __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); - __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero); - __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = 
_mm_unpackhi_epi32(tmp_5, tmp_7); + + const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); + const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); + const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); @@ -186,109 +214,116 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height, // Vertical filter for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { - int sy = sy4 + gamma * (-4) + delta * k + - (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + int sy = sy4 + delta * (k + 4); // Load from tmp and rearrange pairs of consecutive rows into the // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 - __m128i *src = tmp + (k + 4); - __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); - __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); - __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); - __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); // Filter even-index pixels - __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); - - __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); - __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); - __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); - __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); - - __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); - __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); - __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); - __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - - __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); - __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); - __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); - __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 
= _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); // Filter odd-index pixels - __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); - __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); - __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); - __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); - - __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); - __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); - - __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); - __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); - __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); - __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); - __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); + const __m128i res_5 = _mm_madd_epi16(src_5, 
coeff_5); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 7 - __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); // Round and pack into 8 bits - __m128i round_const = - _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1); + const __m128i round_const = + _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) + + ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1)); - __m128i res_lo_round = _mm_srai_epi32( + const __m128i res_lo_round = _mm_srai_epi32( _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS); - __m128i res_hi_round = _mm_srai_epi32( + const __m128i res_hi_round = _mm_srai_epi32( _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS); - __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); // Store, blending with 'pred' if needed - __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; // Note: If we're outputting a 4x4 block, we need to be very careful // to only output 4 pixels at this point, to avoid encode/decode // mismatches when encoding with multiple threads. if (p_width == 4) { - if (ref_frm) { + if (comp_avg) { const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p); res_8bit = _mm_avg_epu8(res_8bit, orig); } *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); } else { - if (ref_frm) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p)); + if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p)); _mm_storel_epi64(p, res_8bit); } } diff --git a/third_party/aom/av1/common/x86/warp_plane_ssse3.c b/third_party/aom/av1/common/x86/warp_plane_ssse3.c new file mode 100644 index 000000000..494410e99 --- /dev/null +++ b/third_party/aom/av1/common/x86/warp_plane_ssse3.c @@ -0,0 +1,508 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tmmintrin.h> + +#include "./av1_rtcd.h" +#include "av1/common/warped_motion.h" + +/* This is a modified version of 'warped_filter' from warped_motion.c: + * Each coefficient is stored in 8 bits instead of 16 bits + * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 + + This is done in order to avoid overflow: Since the tap with the largest + coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation + order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular + convolve functions. + + Instead, we use the summation order + ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)). + The rearrangement of coefficients in this table is so that we can get the + coefficients into the correct order more quickly. 
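For reference (not part of the patch): a scalar model of the regrouped summation described above, with the taps f[0..7] written in natural order (the filter_8bit table below stores them reordered as 0, 2, 4, 6, 1, 3, 5, 7). Each parenthesised pair corresponds to one _mm_maddubs_epi16 product, and the grouping keeps the even and odd partial sums within int16 range, as the overflow note later in this file explains; the helper name is illustrative.

#include <stdint.h>

// ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)) summation order described above.
static int warp_hfilter_regrouped(const uint8_t px[8], const int8_t f[8]) {
  const int even =
      (f[0] * px[0] + f[2] * px[2]) + (f[4] * px[4] + f[6] * px[6]);
  const int odd =
      (f[1] * px[1] + f[3] * px[3]) + (f[5] * px[5] + f[7] * px[7]);
  return even + odd;
}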
+*/ +/* clang-format off */ +DECLARE_ALIGNED(8, static const int8_t, + filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { +#if WARPEDPIXEL_PREC_BITS == 6 + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, + { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0}, + { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0}, + { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0}, + { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0}, + { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0}, + { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0}, + { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0}, + { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0}, + { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0}, + { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0}, + { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0}, + { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0}, + { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0}, + { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0}, + { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0}, + { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0}, + { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0}, + { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1}, + {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1}, + {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1}, + {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2}, + {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2}, + {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2}, + {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2}, + {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2}, + {-2, -21, 84, 8, 8, 74, 
-21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2}, + {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2}, + {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2}, + {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2}, + {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2}, + {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1}, + {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0}, + { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0}, + { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1}, + { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1}, + { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2}, + { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3}, + { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3}, + { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4}, + { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4}, + { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4}, + { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4}, + { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4}, + { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4}, + { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3}, + { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2}, + { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2}, + { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1}, + { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0}, + // dummy (replicate row index 191) + { 0, 0, 2, -1, 0, 0, 127, 0}, + +#else + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0}, + { 
3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1}, + // dummy (replicate row index 95) + { 0, 0, 4, -3, 0, -1, 127, 1}, +#endif // WARPEDPIXEL_PREC_BITS == 6 +}; +/* clang-format on */ + +// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15 +// in an SSE register into two sequences: +// 0, 2, 2, 4, ..., 12, 12, 14, <don't care> +// 1, 3, 3, 5, ..., 13, 13, 15, <don't care> +static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 0 }; +static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9, + 9, 11, 11, 
13, 13, 15, 15, 0 }; + +void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int comp_avg, + int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta) { + __m128i tmp[15]; + int i, j, k; + const int bd = 8; + + /* Note: For this code to work, the left/right frame borders need to be + extended by at least 13 pixels each. By the time we get here, other + code will have set up this border, but we allow an explicit check + for debugging purposes. + */ + /*for (i = 0; i < height; ++i) { + for (j = 0; j < 13; ++j) { + assert(ref[i * stride - 13 + j] == ref[i * stride]); + assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); + } + }*/ + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + // (x, y) coordinates of the center of this block in the destination + // image + const int32_t dst_x = p_col + j + 4; + const int32_t dst_y = p_row + i + 4; + + int32_t x4, y4, ix4, sx4, iy4, sy4; + if (subsampling_x) + x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 + + (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; + else + x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0]; + + if (subsampling_y) + y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 + + (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; + else + y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1]; + + ix4 = x4 >> WARPEDMODEL_PREC_BITS; + sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + iy4 = y4 >> WARPEDMODEL_PREC_BITS; + sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
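For reference (not part of the patch): the even_mask/odd_mask tables defined just above feed the _mm_shuffle_epi8 calls in the horizontal filter that follows. Since none of the mask entries have their high bit set, no lanes are zeroed and the shuffle reduces to a plain byte gather; a scalar model with an illustrative name is shown below.

#include <stdint.h>

// dst[i] = src[mask[i] & 0x0f], producing the byte sequences
// 0,2,2,4,...,14,14 (even) and 1,3,3,5,...,15,15 (odd) used below.
static void shuffle_bytes_model(const uint8_t src[16], const uint8_t mask[16],
                                uint8_t dst[16]) {
  for (int i = 0; i < 16; ++i) dst[i] = src[mask[i] & 0x0f];
}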
+      if (ix4 <= -7) {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          tmp[k + 7] = _mm_set1_epi16(
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                     1)) +
+              ref[iy * stride] *
+                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+        }
+      } else if (ix4 >= width + 6) {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          tmp[k + 7] = _mm_set1_epi16(
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                     1)) +
+              ref[iy * stride + (width - 1)] *
+                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+        }
+      } else {
+        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+          int iy = iy4 + k;
+          if (iy < 0)
+            iy = 0;
+          else if (iy > height - 1)
+            iy = height - 1;
+          int sx = sx4 + beta * (k + 4);
+
+          // Load source pixels
+          const __m128i src =
+              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+          const __m128i src_even =
+              _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
+          const __m128i src_odd =
+              _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
+
+          // Filter even-index pixels
+          const __m128i tmp_0 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+          const __m128i tmp_1 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+          const __m128i tmp_2 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+          const __m128i tmp_3 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+          const __m128i tmp_4 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+          const __m128i tmp_5 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+          const __m128i tmp_6 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+          const __m128i tmp_7 = _mm_loadl_epi64((
+              __m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+
+          // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
+          const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
+          // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
+          const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
+          // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
+          const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
+          // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
+          const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+
+          // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
+          const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
+          // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
+          const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
+          // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
+          const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
+          // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
+          const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
+
+          // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
+          const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+          // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
+          const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+          // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
+          const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+          // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
+          const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+          // The pixel order we need for 'src' is:
+          // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
+          const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
+          const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02);
+          // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
+          const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
+                                                    _mm_srli_si128(src_odd, 4));
+          const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46);
+          // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
+          const __m128i src_13 =
+              _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
+          const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13);
+          // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
+          const __m128i src_57 = _mm_unpacklo_epi64(
+              _mm_srli_si128(src_odd, 4), _mm_srli_si128(src_even, 6));
+          const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
+
+          const __m128i round_const =
+              _mm_set1_epi16((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+                             ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
+
+          // Note: The values res_02 + res_46 and res_13 + res_57 both
+          // fit into int16s at this point, but their sum may be too wide to fit
+          // into an int16. However, once we also add round_const, the sum of
+          // all of these fits into a uint16.
+          //
+          // The wrapping behaviour of _mm_add_* is used here to make sure we
+          // get the correct result despite converting between different
+          // (implicit) types.
+          const __m128i res_even = _mm_add_epi16(res_02, res_46);
+          const __m128i res_odd = _mm_add_epi16(res_13, res_57);
+          const __m128i res =
+              _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
+          tmp[k + 7] = _mm_srli_epi16(res, HORSHEAR_REDUCE_PREC_BITS);
+        }
+      }
+
+      // Vertical filter
+      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+        int sy = sy4 + delta * (k + 4);
+
+        // Load from tmp and rearrange pairs of consecutive rows into the
+        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+        const __m128i *src = tmp + (k + 4);
+        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+        // Filter even-index pixels
+        const __m128i tmp_0 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_2 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_4 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_6 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+                                               _mm_add_epi32(res_4, res_6));
+
+        // Filter odd-index pixels
+        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+        const __m128i tmp_1 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_3 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_5 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+        const __m128i tmp_7 = _mm_loadu_si128(
+            (__m128i *)(warped_filter +
+                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+                                              _mm_add_epi32(res_5, res_7));
+
+        // Rearrange pixels back into the order 0 ... 7
+        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+        // Round and pack into 8 bits
+        const __m128i round_const =
+            _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+                           ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
+
+        const __m128i res_lo_round = _mm_srai_epi32(
+            _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
+        const __m128i res_hi_round = _mm_srai_epi32(
+            _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
+
+        const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+        __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+        // Store, blending with 'pred' if needed
+        __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+        // Note: If we're outputting a 4x4 block, we need to be very careful
+        // to only output 4 pixels at this point, to avoid encode/decode
+        // mismatches when encoding with multiple threads.
+        if (p_width == 4) {
+          if (comp_avg) {
+            const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p);
+            res_8bit = _mm_avg_epu8(res_8bit, orig);
+          }
+          *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
+        } else {
+          if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
+          _mm_storel_epi64(p, res_8bit);
+        }
+      }
+    }
+  }
+}
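The bias arithmetic is the subtle part of this kernel: the horizontal pass adds 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1) to every filtered sum so the reduced intermediates stay non-negative and fit in unsigned 16-bit lanes, and the vertical pass removes exactly that contribution through the negative term of its round constant. The standalone scalar sketch below is not taken from the patch; its constant values are assumptions for illustration only, chosen so that VERSHEAR_REDUCE_PREC_BITS equals 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS, which is the relationship that makes the bias cancel. The authoritative definitions live in av1/common/warped_motion.h.

/* Scalar sketch of the bias/rounding bookkeeping used by the SIMD kernel
 * above. Constants are illustrative assumptions, not the library's values. */
#include <stdint.h>
#include <stdio.h>

#define BD 8            /* bit depth */
#define FILTER_BITS 7   /* stands in for WARPEDPIXEL_FILTER_BITS */
#define HORSHEAR_BITS 5 /* stands in for HORSHEAR_REDUCE_PREC_BITS */
#define VERSHEAR_BITS (2 * FILTER_BITS - HORSHEAR_BITS) /* VERSHEAR_REDUCE_PREC_BITS */

/* Horizontal stage for one sample: bias + 8-tap sum, then reduce. With taps
 * summing to 1 << FILTER_BITS, the reduced value fits in a uint16 lane. */
static uint16_t horiz_sample(const uint8_t row[8], const int16_t fx[8]) {
  int32_t sum = 1 << (BD + FILTER_BITS - 1); /* bias keeps the sum >= 0 */
  for (int m = 0; m < 8; ++m) sum += fx[m] * row[m];
  return (uint16_t)((sum + ((1 << HORSHEAR_BITS) >> 1)) >> HORSHEAR_BITS);
}

/* Vertical stage: the negative round term cancels the accumulated bias,
 * since (1 << (BD + FILTER_BITS - HORSHEAR_BITS - 1)) * (1 << FILTER_BITS)
 * == 1 << (BD + VERSHEAR_BITS - 1). */
static uint8_t vert_sample(const uint16_t col[8], const int16_t fy[8]) {
  int32_t sum = -(1 << (BD + VERSHEAR_BITS - 1)) + ((1 << VERSHEAR_BITS) >> 1);
  for (int m = 0; m < 8; ++m) sum += fy[m] * col[m];
  sum >>= VERSHEAR_BITS;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}

int main(void) {
  /* All weight on one tap (taps sum to 1 << FILTER_BITS): the two-stage
   * round trip should give back the input pixel value. */
  const int16_t taps[8] = { 0, 0, 0, 1 << FILTER_BITS, 0, 0, 0, 0 };
  uint8_t row[8];
  uint16_t col[8];
  for (int m = 0; m < 8; ++m) row[m] = 100;
  for (int m = 0; m < 8; ++m) col[m] = horiz_sample(row, taps);
  printf("round trip: %d\n", vert_sample(col, taps)); /* prints 100 */
  return 0;
}

The SIMD version forms the same sums with _mm_maddubs_epi16 and _mm_madd_epi16; the horizontal-stage note about 16-bit wraparound applies because the biased sums are deliberately read back as unsigned values (via _mm_srli_epi16) after the adds, so only the final wrapped result has to fit in a uint16.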