From df9477dfa60ebb5d31bc142e58ce46535c17abce Mon Sep 17 00:00:00 2001 From: trav90 Date: Wed, 17 Oct 2018 05:59:08 -0500 Subject: Update aom to slightly newer commit ID --- .../aom/av1/encoder/x86/av1_quantize_sse2.c | 93 +++++-- .../aom/av1/encoder/x86/corner_match_sse4.c | 91 +++++++ .../aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c | 302 +++++++++++---------- .../aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c | 180 ++++++------ 4 files changed, 415 insertions(+), 251 deletions(-) create mode 100644 third_party/aom/av1/encoder/x86/corner_match_sse4.c (limited to 'third_party/aom/av1/encoder/x86') diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c index f9c95b6cb..190317389 100644 --- a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c +++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c @@ -15,13 +15,65 @@ #include "./av1_rtcd.h" #include "aom/aom_integer.h" -void av1_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, +static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, + __m128i *c0, __m128i *c1) { + const tran_low_t *addr = coeff + offset; +#if CONFIG_HIGHBITDEPTH + const __m128i x0 = _mm_load_si128((const __m128i *)addr); + const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1); + const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2); + const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3); + *c0 = _mm_packs_epi32(x0, x1); + *c1 = _mm_packs_epi32(x2, x3); +#else + *c0 = _mm_load_si128((const __m128i *)addr); + *c1 = _mm_load_si128((const __m128i *)addr + 1); +#endif +} + +static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1, + tran_low_t *qcoeff, intptr_t offset) { + tran_low_t *addr = qcoeff + offset; +#if CONFIG_HIGHBITDEPTH + const __m128i zero = _mm_setzero_si128(); + __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero); + __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits); + __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits); + 
_mm_store_si128((__m128i *)addr, y0); + _mm_store_si128((__m128i *)addr + 1, y1); + + sign_bits = _mm_cmplt_epi16(*qc1, zero); + y0 = _mm_unpacklo_epi16(*qc1, sign_bits); + y1 = _mm_unpackhi_epi16(*qc1, sign_bits); + _mm_store_si128((__m128i *)addr + 2, y0); + _mm_store_si128((__m128i *)addr + 3, y1); +#else + _mm_store_si128((__m128i *)addr, *qc0); + _mm_store_si128((__m128i *)addr + 1, *qc1); +#endif +} + +static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) { + const __m128i zero = _mm_setzero_si128(); + tran_low_t *addr = qcoeff + offset; +#if CONFIG_HIGHBITDEPTH + _mm_store_si128((__m128i *)addr, zero); + _mm_store_si128((__m128i *)addr + 1, zero); + _mm_store_si128((__m128i *)addr + 2, zero); + _mm_store_si128((__m128i *)addr + 3, zero); +#else + _mm_store_si128((__m128i *)addr, zero); + _mm_store_si128((__m128i *)addr + 1, zero); +#endif +} + +void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { __m128i zero; __m128i thr; int16_t nzflag; @@ -54,8 +106,7 @@ void av1_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, __m128i qcoeff0, qcoeff1; __m128i qtmp0, qtmp1; // Do DC and first 15 AC - coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); + read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -78,15 +129,13 @@ void av1_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = 
_mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); } { @@ -121,8 +170,7 @@ void av1_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, __m128i qcoeff0, qcoeff1; __m128i qtmp0, qtmp1; - coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); + read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -147,20 +195,15 @@ void av1_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); } else { - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); - - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); + write_zero(qcoeff_ptr, n_coeffs); + write_zero(dqcoeff_ptr, n_coeffs); } } 
@@ -200,10 +243,8 @@ void av1_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, } } else { do { - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + write_zero(dqcoeff_ptr, n_coeffs); + write_zero(qcoeff_ptr, n_coeffs); n_coeffs += 8 * 2; } while (n_coeffs < 0); *eob_ptr = 0; diff --git a/third_party/aom/av1/encoder/x86/corner_match_sse4.c b/third_party/aom/av1/encoder/x86/corner_match_sse4.c new file mode 100644 index 000000000..179da0d28 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/corner_match_sse4.c @@ -0,0 +1,91 @@ +#include <stdlib.h> +#include <memory.h> +#include <math.h> +#include <assert.h> + +#include <smmintrin.h> + +#include "./av1_rtcd.h" +#include "aom_ports/mem.h" +#include "av1/encoder/corner_match.h" + +DECLARE_ALIGNED(16, static const uint8_t, byte_mask[16]) = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0 +}; +#if MATCH_SZ != 13 +#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13" +#endif + +/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the + correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows + of each image, centered at (x1, y1) and (x2, y2) respectively. 
+*/ +double compute_cross_correlation_sse4_1(unsigned char *im1, int stride1, int x1, + int y1, unsigned char *im2, int stride2, + int x2, int y2) { + int i; + // 2 16-bit partial sums in lanes 0, 4 (== 2 32-bit partial sums in lanes 0, + // 2) + __m128i sum1_vec = _mm_setzero_si128(); + __m128i sum2_vec = _mm_setzero_si128(); + // 4 32-bit partial sums of squares + __m128i sumsq2_vec = _mm_setzero_si128(); + __m128i cross_vec = _mm_setzero_si128(); + + const __m128i mask = _mm_load_si128((__m128i *)byte_mask); + const __m128i zero = _mm_setzero_si128(); + + im1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); + im2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); + + for (i = 0; i < MATCH_SZ; ++i) { + const __m128i v1 = + _mm_and_si128(_mm_loadu_si128((__m128i *)&im1[i * stride1]), mask); + const __m128i v2 = + _mm_and_si128(_mm_loadu_si128((__m128i *)&im2[i * stride2]), mask); + + // Using the 'sad' intrinsic here is a bit faster than adding + // v1_l + v1_r and v2_l + v2_r, plus it avoids the need for a 16->32 bit + // conversion step later, for a net speedup of ~10% + sum1_vec = _mm_add_epi16(sum1_vec, _mm_sad_epu8(v1, zero)); + sum2_vec = _mm_add_epi16(sum2_vec, _mm_sad_epu8(v2, zero)); + + const __m128i v1_l = _mm_cvtepu8_epi16(v1); + const __m128i v1_r = _mm_cvtepu8_epi16(_mm_srli_si128(v1, 8)); + const __m128i v2_l = _mm_cvtepu8_epi16(v2); + const __m128i v2_r = _mm_cvtepu8_epi16(_mm_srli_si128(v2, 8)); + + sumsq2_vec = _mm_add_epi32( + sumsq2_vec, + _mm_add_epi32(_mm_madd_epi16(v2_l, v2_l), _mm_madd_epi16(v2_r, v2_r))); + cross_vec = _mm_add_epi32( + cross_vec, + _mm_add_epi32(_mm_madd_epi16(v1_l, v2_l), _mm_madd_epi16(v1_r, v2_r))); + } + + // Now we can treat the four registers (sum1_vec, sum2_vec, sumsq2_vec, + // cross_vec) + // as holding 4 32-bit elements each, which we want to sum horizontally. + // We do this by transposing and then summing vertically. 
+ __m128i tmp_0 = _mm_unpacklo_epi32(sum1_vec, sum2_vec); + __m128i tmp_1 = _mm_unpackhi_epi32(sum1_vec, sum2_vec); + __m128i tmp_2 = _mm_unpacklo_epi32(sumsq2_vec, cross_vec); + __m128i tmp_3 = _mm_unpackhi_epi32(sumsq2_vec, cross_vec); + + __m128i tmp_4 = _mm_unpacklo_epi64(tmp_0, tmp_2); + __m128i tmp_5 = _mm_unpackhi_epi64(tmp_0, tmp_2); + __m128i tmp_6 = _mm_unpacklo_epi64(tmp_1, tmp_3); + __m128i tmp_7 = _mm_unpackhi_epi64(tmp_1, tmp_3); + + __m128i res = + _mm_add_epi32(_mm_add_epi32(tmp_4, tmp_5), _mm_add_epi32(tmp_6, tmp_7)); + + int sum1 = _mm_extract_epi32(res, 0); + int sum2 = _mm_extract_epi32(res, 1); + int sumsq2 = _mm_extract_epi32(res, 2); + int cross = _mm_extract_epi32(res, 3); + + int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; + int cov = cross * MATCH_SZ_SQ - sum1 * sum2; + return cov / sqrt((double)var2); +} diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c index f201a29aa..b56eed518 100644 --- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c +++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c @@ -13,7 +13,7 @@ #include "./av1_rtcd.h" #include "./aom_config.h" -#include "av1/common/av1_fwd_txfm2d_cfg.h" +#include "av1/common/av1_fwd_txfm1d_cfg.h" #include "av1/common/av1_txfm.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" #include "aom_dsp/txfm_common.h" @@ -58,7 +58,7 @@ static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, // shift[1] is used in txfm_func_col() // shift[2] is used in txfm_func_row() static void fdct4x4_sse4_1(__m128i *in, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); @@ -133,7 +133,7 @@ void av1_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output, } static void fadst4x4_sse4_1(__m128i *in, 
int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); @@ -209,71 +209,81 @@ static void fadst4x4_sse4_1(__m128i *in, int bit) { void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff, int input_stride, int tx_type, int bd) { __m128i in[4]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG *row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &fwd_txfm_2d_cfg_dct_dct_4; - load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]); - fdct4x4_sse4_1(in, cfg->cos_bit_col[2]); - fdct4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_4; + col_cfg = &fwd_txfm_1d_col_cfg_dct_4; + load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); + fdct4x4_sse4_1(in, col_cfg->cos_bit[2]); + fdct4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; case ADST_DCT: - cfg = &fwd_txfm_2d_cfg_adst_dct_4; - load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]); - fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - fdct4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_4; + col_cfg = &fwd_txfm_1d_col_cfg_adst_4; + load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); + fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + fdct4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; case DCT_ADST: - cfg = &fwd_txfm_2d_cfg_dct_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]); - fdct4x4_sse4_1(in, cfg->cos_bit_col[2]); - fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_4; + col_cfg = &fwd_txfm_1d_col_cfg_dct_4; + load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); + fdct4x4_sse4_1(in, col_cfg->cos_bit[2]); + fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, 
coeff); break; case ADST_ADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]); - fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_4; + col_cfg = &fwd_txfm_1d_col_cfg_adst_4; + load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); + fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; #if CONFIG_EXT_TX case FLIPADST_DCT: - cfg = &fwd_txfm_2d_cfg_adst_dct_4; - load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]); - fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - fdct4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_4; + col_cfg = &fwd_txfm_1d_col_cfg_adst_4; + load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]); + fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + fdct4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; case DCT_FLIPADST: - cfg = &fwd_txfm_2d_cfg_dct_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]); - fdct4x4_sse4_1(in, cfg->cos_bit_col[2]); - fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_4; + col_cfg = &fwd_txfm_1d_col_cfg_dct_4; + load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]); + fdct4x4_sse4_1(in, col_cfg->cos_bit[2]); + fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; case FLIPADST_FLIPADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_4; - load_buffer_4x4(input, in, input_stride, 1, 1, cfg->shift[0]); - fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_4; + col_cfg = &fwd_txfm_1d_col_cfg_adst_4; + load_buffer_4x4(input, in, input_stride, 1, 1, row_cfg->shift[0]); + fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; case ADST_FLIPADST: - cfg = 
&fwd_txfm_2d_cfg_adst_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]); - fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_4; + col_cfg = &fwd_txfm_1d_col_cfg_adst_4; + load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]); + fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; case FLIPADST_ADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_4; - load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]); - fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_4; + col_cfg = &fwd_txfm_1d_col_cfg_adst_4; + load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]); + fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; #endif @@ -429,7 +439,7 @@ static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) { } static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); @@ -625,7 +635,7 @@ static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { } static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); @@ -930,97 +940,107 @@ static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, int tx_type, int bd) { __m128i in[16], out[16]; - const 
TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG *row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &fwd_txfm_2d_cfg_dct_dct_8; - load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); - fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_8; + col_cfg = &fwd_txfm_1d_col_cfg_dct_8; + load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); + fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_DCT: - cfg = &fwd_txfm_2d_cfg_adst_dct_8; - load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); - fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_8; + col_cfg = &fwd_txfm_1d_col_cfg_adst_8; + load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); + fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case DCT_ADST: - cfg = &fwd_txfm_2d_cfg_dct_adst_8; - load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); - fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_8; + col_cfg = &fwd_txfm_1d_col_cfg_dct_8; + load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); + fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; 
case ADST_ADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_8; - load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); - fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_8; + col_cfg = &fwd_txfm_1d_col_cfg_adst_8; + load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); + fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; #if CONFIG_EXT_TX case FLIPADST_DCT: - cfg = &fwd_txfm_2d_cfg_adst_dct_8; - load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]); - fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_8; + col_cfg = &fwd_txfm_1d_col_cfg_adst_8; + load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]); + fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case DCT_FLIPADST: - cfg = &fwd_txfm_2d_cfg_dct_adst_8; - load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]); - fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_8; + col_cfg = &fwd_txfm_1d_col_cfg_dct_8; + load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]); + fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_FLIPADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_8; - 
load_buffer_8x8(input, in, stride, 1, 1, cfg->shift[0]); - fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_8; + col_cfg = &fwd_txfm_1d_col_cfg_adst_8; + load_buffer_8x8(input, in, stride, 1, 1, row_cfg->shift[0]); + fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_FLIPADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_8; - load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]); - fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_8; + col_cfg = &fwd_txfm_1d_col_cfg_adst_8; + load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]); + fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_ADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_8; - load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]); - fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_8; + col_cfg = &fwd_txfm_1d_col_cfg_adst_8; + load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]); + fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; @@ -1107,7 +1127,7 @@ static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out, } static void fdct16x16_sse4_1(__m128i *in, 
__m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); @@ -1393,7 +1413,7 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { } static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); const __m128i cospi10 = _mm_set1_epi32(cospi[10]); @@ -1794,97 +1814,107 @@ static void write_buffer_16x16(const __m128i *in, tran_low_t *output) { void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, int stride, int tx_type, int bd) { __m128i in[64], out[64]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG *row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &fwd_txfm_2d_cfg_dct_dct_16; - load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]); - fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_16; + col_cfg = &fwd_txfm_1d_col_cfg_dct_16; + load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); + fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_DCT: - cfg = &fwd_txfm_2d_cfg_adst_dct_16; - load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]); - fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_16; + col_cfg = &fwd_txfm_1d_col_cfg_adst_16; + 
load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); + fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case DCT_ADST: - cfg = &fwd_txfm_2d_cfg_dct_adst_16; - load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]); - fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_16; + col_cfg = &fwd_txfm_1d_col_cfg_dct_16; + load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); + fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_ADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_16; - load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]); - fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_16; + col_cfg = &fwd_txfm_1d_col_cfg_adst_16; + load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); + fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; #if CONFIG_EXT_TX case FLIPADST_DCT: - cfg = &fwd_txfm_2d_cfg_adst_dct_16; - load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]); - fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_16; + col_cfg = &fwd_txfm_1d_col_cfg_adst_16; + 
load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]); + fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case DCT_FLIPADST: - cfg = &fwd_txfm_2d_cfg_dct_adst_16; - load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]); - fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_16; + col_cfg = &fwd_txfm_1d_col_cfg_dct_16; + load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]); + fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_FLIPADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_16; - load_buffer_16x16(input, in, stride, 1, 1, cfg->shift[0]); - fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_16; + col_cfg = &fwd_txfm_1d_col_cfg_adst_16; + load_buffer_16x16(input, in, stride, 1, 1, row_cfg->shift[0]); + fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_FLIPADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_16; - load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]); - fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_16; + col_cfg = &fwd_txfm_1d_col_cfg_adst_16; + 
load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]); + fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_ADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_16; - load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]); - fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_16; + col_cfg = &fwd_txfm_1d_col_cfg_adst_16; + load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]); + fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c index 198e4e4c4..8495ad1aa 100644 --- a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c +++ b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c @@ -269,8 +269,8 @@ static void fdct16_avx2(__m256i *in) { x0 = _mm256_unpacklo_epi16(v0, v1); x1 = _mm256_unpackhi_epi16(v0, v1); - t0 = butter_fly(x0, x1, cospi_p16_p16); - t1 = butter_fly(x0, x1, cospi_p16_m16); + t0 = butter_fly(&x0, &x1, &cospi_p16_p16); + t1 = butter_fly(&x0, &x1, &cospi_p16_m16); // 4, 12 v0 = _mm256_sub_epi16(s1, s2); @@ -279,8 +279,8 @@ static void fdct16_avx2(__m256i *in) { x0 = _mm256_unpacklo_epi16(v0, v1); x1 = _mm256_unpackhi_epi16(v0, v1); - t2 = butter_fly(x0, x1, cospi_p24_p08); - t3 = butter_fly(x0, x1, cospi_m08_p24); + t2 = butter_fly(&x0, &x1, &cospi_p24_p08); + t3 = butter_fly(&x0, &x1, &cospi_m08_p24); // 2, 6, 10, 14 s0 = _mm256_sub_epi16(u3, u4); 
@@ -294,8 +294,8 @@ static void fdct16_avx2(__m256i *in) { x0 = _mm256_unpacklo_epi16(s2, s1); x1 = _mm256_unpackhi_epi16(s2, s1); - v2 = butter_fly(x0, x1, cospi_p16_p16); // output[5] - v1 = butter_fly(x0, x1, cospi_p16_m16); // output[6] + v2 = butter_fly(&x0, &x1, &cospi_p16_p16); // output[5] + v1 = butter_fly(&x0, &x1, &cospi_p16_m16); // output[6] s0 = _mm256_add_epi16(v0, v1); // step[4] s1 = _mm256_sub_epi16(v0, v1); // step[5] @@ -306,14 +306,14 @@ static void fdct16_avx2(__m256i *in) { x0 = _mm256_unpacklo_epi16(s0, s3); x1 = _mm256_unpackhi_epi16(s0, s3); - t4 = butter_fly(x0, x1, cospi_p28_p04); - t5 = butter_fly(x0, x1, cospi_m04_p28); + t4 = butter_fly(&x0, &x1, &cospi_p28_p04); + t5 = butter_fly(&x0, &x1, &cospi_m04_p28); // 10, 6 x0 = _mm256_unpacklo_epi16(s1, s2); x1 = _mm256_unpackhi_epi16(s1, s2); - t6 = butter_fly(x0, x1, cospi_p12_p20); - t7 = butter_fly(x0, x1, cospi_m20_p12); + t6 = butter_fly(&x0, &x1, &cospi_p12_p20); + t7 = butter_fly(&x0, &x1, &cospi_m20_p12); // 1, 3, 5, 7, 9, 11, 13, 15 s0 = _mm256_sub_epi16(in[7], in[8]); // step[8] @@ -337,14 +337,14 @@ static void fdct16_avx2(__m256i *in) { x0 = _mm256_unpacklo_epi16(u5, u2); x1 = _mm256_unpackhi_epi16(u5, u2); - s2 = butter_fly(x0, x1, cospi_p16_p16); // step[13] - s5 = butter_fly(x0, x1, cospi_p16_m16); // step[10] + s2 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[13] + s5 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[10] x0 = _mm256_unpacklo_epi16(u4, u3); x1 = _mm256_unpackhi_epi16(u4, u3); - s3 = butter_fly(x0, x1, cospi_p16_p16); // step[12] - s4 = butter_fly(x0, x1, cospi_p16_m16); // step[11] + s3 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[12] + s4 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[11] u0 = _mm256_add_epi16(s0, s4); // output[8] u1 = _mm256_add_epi16(s1, s5); @@ -364,14 +364,14 @@ static void fdct16_avx2(__m256i *in) { x0 = _mm256_unpacklo_epi16(u1, u6); x1 = _mm256_unpackhi_epi16(u1, u6); - s1 = butter_fly(x0, x1, cospi_m08_p24); - s6 = 
butter_fly(x0, x1, cospi_p24_p08); + s1 = butter_fly(&x0, &x1, &cospi_m08_p24); + s6 = butter_fly(&x0, &x1, &cospi_p24_p08); x0 = _mm256_unpacklo_epi16(u2, u5); x1 = _mm256_unpackhi_epi16(u2, u5); - s2 = butter_fly(x0, x1, cospi_m24_m08); - s5 = butter_fly(x0, x1, cospi_m08_p24); + s2 = butter_fly(&x0, &x1, &cospi_m24_m08); + s5 = butter_fly(&x0, &x1, &cospi_m08_p24); // stage 5 u0 = _mm256_add_epi16(s0, s1); @@ -386,23 +386,23 @@ static void fdct16_avx2(__m256i *in) { // stage 6 x0 = _mm256_unpacklo_epi16(u0, u7); x1 = _mm256_unpackhi_epi16(u0, u7); - in[1] = butter_fly(x0, x1, cospi_p30_p02); - in[15] = butter_fly(x0, x1, cospi_m02_p30); + in[1] = butter_fly(&x0, &x1, &cospi_p30_p02); + in[15] = butter_fly(&x0, &x1, &cospi_m02_p30); x0 = _mm256_unpacklo_epi16(u1, u6); x1 = _mm256_unpackhi_epi16(u1, u6); - in[9] = butter_fly(x0, x1, cospi_p14_p18); - in[7] = butter_fly(x0, x1, cospi_m18_p14); + in[9] = butter_fly(&x0, &x1, &cospi_p14_p18); + in[7] = butter_fly(&x0, &x1, &cospi_m18_p14); x0 = _mm256_unpacklo_epi16(u2, u5); x1 = _mm256_unpackhi_epi16(u2, u5); - in[5] = butter_fly(x0, x1, cospi_p22_p10); - in[11] = butter_fly(x0, x1, cospi_m10_p22); + in[5] = butter_fly(&x0, &x1, &cospi_p22_p10); + in[11] = butter_fly(&x0, &x1, &cospi_m10_p22); x0 = _mm256_unpacklo_epi16(u3, u4); x1 = _mm256_unpackhi_epi16(u3, u4); - in[13] = butter_fly(x0, x1, cospi_p06_p26); - in[3] = butter_fly(x0, x1, cospi_m26_p06); + in[13] = butter_fly(&x0, &x1, &cospi_p06_p26); + in[3] = butter_fly(&x0, &x1, &cospi_m26_p06); } void fadst16_avx2(__m256i *in) { @@ -953,7 +953,9 @@ void fadst16_avx2(__m256i *in) { } #if CONFIG_EXT_TX -static void fidtx16_avx2(__m256i *in) { txfm_scaling16_avx2(Sqrt2, in); } +static void fidtx16_avx2(__m256i *in) { + txfm_scaling16_avx2((int16_t)Sqrt2, in); +} #endif void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, @@ -964,28 +966,28 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, case DCT_DCT: 
load_buffer_16x16(input, stride, 0, 0, in); fdct16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fdct16_avx2(in); break; case ADST_DCT: load_buffer_16x16(input, stride, 0, 0, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fdct16_avx2(in); break; case DCT_ADST: load_buffer_16x16(input, stride, 0, 0, in); fdct16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; case ADST_ADST: load_buffer_16x16(input, stride, 0, 0, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; @@ -993,91 +995,91 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, case FLIPADST_DCT: load_buffer_16x16(input, stride, 1, 0, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fdct16_avx2(in); break; case DCT_FLIPADST: load_buffer_16x16(input, stride, 0, 1, in); fdct16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; case FLIPADST_FLIPADST: load_buffer_16x16(input, stride, 1, 1, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; case ADST_FLIPADST: load_buffer_16x16(input, stride, 0, 1, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; case FLIPADST_ADST: load_buffer_16x16(input, stride, 1, 0, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; case IDTX: load_buffer_16x16(input, stride, 0, 0, in); fidtx16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fidtx16_avx2(in); break; case V_DCT: 
load_buffer_16x16(input, stride, 0, 0, in); fdct16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fidtx16_avx2(in); break; case H_DCT: load_buffer_16x16(input, stride, 0, 0, in); fidtx16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fdct16_avx2(in); break; case V_ADST: load_buffer_16x16(input, stride, 0, 0, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fidtx16_avx2(in); break; case H_ADST: load_buffer_16x16(input, stride, 0, 0, in); fidtx16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; case V_FLIPADST: load_buffer_16x16(input, stride, 1, 0, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fidtx16_avx2(in); break; case H_FLIPADST: load_buffer_16x16(input, stride, 0, 1, in); fidtx16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; #endif // CONFIG_EXT_TX default: assert(0); break; } - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); write_buffer_16x16(in, output); _mm256_zeroupper(); } @@ -1110,10 +1112,10 @@ static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) { } static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) { - mm256_transpose_16x16(in0); - mm256_transpose_16x16(&in0[16]); - mm256_transpose_16x16(in1); - mm256_transpose_16x16(&in1[16]); + mm256_transpose_16x16(in0, in0); + mm256_transpose_16x16(&in0[16], &in0[16]); + mm256_transpose_16x16(in1, in1); + mm256_transpose_16x16(&in1[16], &in1[16]); mm256_vectors_swap(&in0[16], in1, 16); } @@ -1247,23 +1249,23 @@ static void fdct16_odd_avx2(__m256i *in) { u0 = _mm256_unpacklo_epi16(in[4], in[11]); u1 = _mm256_unpackhi_epi16(in[4], in[11]); - y4 = butter_fly(u0, u1, cospi_m16_p16); - y11 = butter_fly(u0, u1, 
cospi_p16_p16); + y4 = butter_fly(&u0, &u1, &cospi_m16_p16); + y11 = butter_fly(&u0, &u1, &cospi_p16_p16); u0 = _mm256_unpacklo_epi16(in[5], in[10]); u1 = _mm256_unpackhi_epi16(in[5], in[10]); - y5 = butter_fly(u0, u1, cospi_m16_p16); - y10 = butter_fly(u0, u1, cospi_p16_p16); + y5 = butter_fly(&u0, &u1, &cospi_m16_p16); + y10 = butter_fly(&u0, &u1, &cospi_p16_p16); u0 = _mm256_unpacklo_epi16(in[6], in[9]); u1 = _mm256_unpackhi_epi16(in[6], in[9]); - y6 = butter_fly(u0, u1, cospi_m16_p16); - y9 = butter_fly(u0, u1, cospi_p16_p16); + y6 = butter_fly(&u0, &u1, &cospi_m16_p16); + y9 = butter_fly(&u0, &u1, &cospi_p16_p16); u0 = _mm256_unpacklo_epi16(in[7], in[8]); u1 = _mm256_unpackhi_epi16(in[7], in[8]); - y7 = butter_fly(u0, u1, cospi_m16_p16); - y8 = butter_fly(u0, u1, cospi_p16_p16); + y7 = butter_fly(&u0, &u1, &cospi_m16_p16); + y8 = butter_fly(&u0, &u1, &cospi_p16_p16); y12 = in[12]; y13 = in[13]; @@ -1300,23 +1302,23 @@ static void fdct16_odd_avx2(__m256i *in) { u0 = _mm256_unpacklo_epi16(x2, x13); u1 = _mm256_unpackhi_epi16(x2, x13); - y2 = butter_fly(u0, u1, cospi_m08_p24); - y13 = butter_fly(u0, u1, cospi_p24_p08); + y2 = butter_fly(&u0, &u1, &cospi_m08_p24); + y13 = butter_fly(&u0, &u1, &cospi_p24_p08); u0 = _mm256_unpacklo_epi16(x3, x12); u1 = _mm256_unpackhi_epi16(x3, x12); - y3 = butter_fly(u0, u1, cospi_m08_p24); - y12 = butter_fly(u0, u1, cospi_p24_p08); + y3 = butter_fly(&u0, &u1, &cospi_m08_p24); + y12 = butter_fly(&u0, &u1, &cospi_p24_p08); u0 = _mm256_unpacklo_epi16(x4, x11); u1 = _mm256_unpackhi_epi16(x4, x11); - y4 = butter_fly(u0, u1, cospi_m24_m08); - y11 = butter_fly(u0, u1, cospi_m08_p24); + y4 = butter_fly(&u0, &u1, &cospi_m24_m08); + y11 = butter_fly(&u0, &u1, &cospi_m08_p24); u0 = _mm256_unpacklo_epi16(x5, x10); u1 = _mm256_unpackhi_epi16(x5, x10); - y5 = butter_fly(u0, u1, cospi_m24_m08); - y10 = butter_fly(u0, u1, cospi_m08_p24); + y5 = butter_fly(&u0, &u1, &cospi_m24_m08); + y10 = butter_fly(&u0, &u1, &cospi_m08_p24); // stage 5 x0 = 
_mm256_add_epi16(y0, y3); @@ -1349,23 +1351,23 @@ static void fdct16_odd_avx2(__m256i *in) { u0 = _mm256_unpacklo_epi16(x1, x14); u1 = _mm256_unpackhi_epi16(x1, x14); - y1 = butter_fly(u0, u1, cospi_m04_p28); - y14 = butter_fly(u0, u1, cospi_p28_p04); + y1 = butter_fly(&u0, &u1, &cospi_m04_p28); + y14 = butter_fly(&u0, &u1, &cospi_p28_p04); u0 = _mm256_unpacklo_epi16(x2, x13); u1 = _mm256_unpackhi_epi16(x2, x13); - y2 = butter_fly(u0, u1, cospi_m28_m04); - y13 = butter_fly(u0, u1, cospi_m04_p28); + y2 = butter_fly(&u0, &u1, &cospi_m28_m04); + y13 = butter_fly(&u0, &u1, &cospi_m04_p28); u0 = _mm256_unpacklo_epi16(x5, x10); u1 = _mm256_unpackhi_epi16(x5, x10); - y5 = butter_fly(u0, u1, cospi_m20_p12); - y10 = butter_fly(u0, u1, cospi_p12_p20); + y5 = butter_fly(&u0, &u1, &cospi_m20_p12); + y10 = butter_fly(&u0, &u1, &cospi_p12_p20); u0 = _mm256_unpacklo_epi16(x6, x9); u1 = _mm256_unpackhi_epi16(x6, x9); - y6 = butter_fly(u0, u1, cospi_m12_m20); - y9 = butter_fly(u0, u1, cospi_m20_p12); + y6 = butter_fly(&u0, &u1, &cospi_m12_m20); + y9 = butter_fly(&u0, &u1, &cospi_m20_p12); // stage 7 x0 = _mm256_add_epi16(y0, y1); @@ -1389,43 +1391,43 @@ static void fdct16_odd_avx2(__m256i *in) { // stage 8 u0 = _mm256_unpacklo_epi16(x0, x15); u1 = _mm256_unpackhi_epi16(x0, x15); - in[0] = butter_fly(u0, u1, cospi_p31_p01); - in[15] = butter_fly(u0, u1, cospi_m01_p31); + in[0] = butter_fly(&u0, &u1, &cospi_p31_p01); + in[15] = butter_fly(&u0, &u1, &cospi_m01_p31); u0 = _mm256_unpacklo_epi16(x1, x14); u1 = _mm256_unpackhi_epi16(x1, x14); - in[1] = butter_fly(u0, u1, cospi_p15_p17); - in[14] = butter_fly(u0, u1, cospi_m17_p15); + in[1] = butter_fly(&u0, &u1, &cospi_p15_p17); + in[14] = butter_fly(&u0, &u1, &cospi_m17_p15); u0 = _mm256_unpacklo_epi16(x2, x13); u1 = _mm256_unpackhi_epi16(x2, x13); - in[2] = butter_fly(u0, u1, cospi_p23_p09); - in[13] = butter_fly(u0, u1, cospi_m09_p23); + in[2] = butter_fly(&u0, &u1, &cospi_p23_p09); + in[13] = butter_fly(&u0, &u1, &cospi_m09_p23); u0 = 
_mm256_unpacklo_epi16(x3, x12); u1 = _mm256_unpackhi_epi16(x3, x12); - in[3] = butter_fly(u0, u1, cospi_p07_p25); - in[12] = butter_fly(u0, u1, cospi_m25_p07); + in[3] = butter_fly(&u0, &u1, &cospi_p07_p25); + in[12] = butter_fly(&u0, &u1, &cospi_m25_p07); u0 = _mm256_unpacklo_epi16(x4, x11); u1 = _mm256_unpackhi_epi16(x4, x11); - in[4] = butter_fly(u0, u1, cospi_p27_p05); - in[11] = butter_fly(u0, u1, cospi_m05_p27); + in[4] = butter_fly(&u0, &u1, &cospi_p27_p05); + in[11] = butter_fly(&u0, &u1, &cospi_m05_p27); u0 = _mm256_unpacklo_epi16(x5, x10); u1 = _mm256_unpackhi_epi16(x5, x10); - in[5] = butter_fly(u0, u1, cospi_p11_p21); - in[10] = butter_fly(u0, u1, cospi_m21_p11); + in[5] = butter_fly(&u0, &u1, &cospi_p11_p21); + in[10] = butter_fly(&u0, &u1, &cospi_m21_p11); u0 = _mm256_unpacklo_epi16(x6, x9); u1 = _mm256_unpackhi_epi16(x6, x9); - in[6] = butter_fly(u0, u1, cospi_p19_p13); - in[9] = butter_fly(u0, u1, cospi_m13_p19); + in[6] = butter_fly(&u0, &u1, &cospi_p19_p13); + in[9] = butter_fly(&u0, &u1, &cospi_m13_p19); u0 = _mm256_unpacklo_epi16(x7, x8); u1 = _mm256_unpackhi_epi16(x7, x8); - in[7] = butter_fly(u0, u1, cospi_p03_p29); - in[8] = butter_fly(u0, u1, cospi_m29_p03); + in[7] = butter_fly(&u0, &u1, &cospi_p03_p29); + in[8] = butter_fly(&u0, &u1, &cospi_m29_p03); } static void fdct32_avx2(__m256i *in0, __m256i *in1) { @@ -1464,7 +1466,7 @@ static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1, static void fhalfright32_16col_avx2(__m256i *in) { int i = 0; const __m256i zero = _mm256_setzero_si256(); - const __m256i sqrt2 = _mm256_set1_epi16(Sqrt2); + const __m256i sqrt2 = _mm256_set1_epi16((int16_t)Sqrt2); const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); __m256i x0, x1; -- cgit v1.2.3