diff options
Diffstat (limited to 'third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c')
-rw-r--r-- | third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c | 384 |
1 files changed, 207 insertions, 177 deletions
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c index 24b2760b9..a93699f0b 100644 --- a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c @@ -13,7 +13,7 @@ #include "./av1_rtcd.h" #include "./aom_config.h" -#include "av1/common/av1_inv_txfm2d_cfg.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { @@ -24,7 +24,7 @@ static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { } static void idct4x4_sse4_1(__m128i *in, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); @@ -72,7 +72,7 @@ static void idct4x4_sse4_1(__m128i *in, int bit) { } static void iadst4x4_sse4_1(__m128i *in, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); @@ -232,72 +232,82 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output, int stride, int tx_type, int bd) { __m128i in[4]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG *row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &inv_txfm_2d_cfg_dct_dct_4; + row_cfg = &inv_txfm_1d_row_cfg_dct_4; + col_cfg = &inv_txfm_1d_col_cfg_dct_4; load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, cfg->cos_bit_row[2]); - idct4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct4x4_sse4_1(in, row_cfg->cos_bit[2]); + idct4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_4; + row_cfg = &inv_txfm_1d_row_cfg_dct_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case DCT_ADST: - cfg = &inv_txfm_2d_cfg_dct_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_dct_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - idct4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + idct4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; #if CONFIG_EXT_TX case FLIPADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_4; + row_cfg = &inv_txfm_1d_row_cfg_dct_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd); + idct4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; case DCT_FLIPADST: - cfg = &inv_txfm_2d_cfg_dct_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_dct_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - idct4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + idct4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case FLIPADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 1, 1, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 1, 1, -row_cfg->shift[1], bd); break; case ADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case FLIPADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_4; + row_cfg = &inv_txfm_1d_row_cfg_adst_4; + col_cfg = &inv_txfm_1d_col_cfg_adst_4; load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); - iadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd); + iadst4x4_sse4_1(in, row_cfg->cos_bit[2]); + iadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; #endif // CONFIG_EXT_TX default: assert(0); @@ -325,7 +335,7 @@ static void load_buffer_8x8(const int32_t *coeff, __m128i *in) { } static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); @@ -439,7 +449,7 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { } static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); @@ -698,90 +708,100 @@ static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride, void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output, int stride, int tx_type, int bd) { __m128i in[16], out[16]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG *row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &inv_txfm_2d_cfg_dct_dct_8; + row_cfg = &inv_txfm_1d_row_cfg_dct_8; + col_cfg = &inv_txfm_1d_col_cfg_dct_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case DCT_ADST: - cfg = &inv_txfm_2d_cfg_dct_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_dct_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_8; + row_cfg = &inv_txfm_1d_row_cfg_dct_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; #if CONFIG_EXT_TX case FLIPADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_8; + row_cfg = &inv_txfm_1d_row_cfg_dct_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; case DCT_FLIPADST: - cfg = &inv_txfm_2d_cfg_dct_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_dct_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd); + idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case ADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case FLIPADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 1, 1, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 1, 1, -row_cfg->shift[1], bd); break; case FLIPADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_8; + row_cfg = &inv_txfm_1d_row_cfg_adst_8; + col_cfg = &inv_txfm_1d_col_cfg_adst_8; load_buffer_8x8(coeff, in); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]); + iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]); transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd); + iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; #endif // CONFIG_EXT_TX default: assert(0); @@ -849,7 +869,7 @@ static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride, } static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); @@ -907,24 +927,24 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[6] = u[6]; v[7] = u[7]; - v[8] = half_btf_sse4_1(cospi60, u[8], cospim4, u[15], rnding, bit); - v[9] = half_btf_sse4_1(cospi28, u[9], cospim36, u[14], rnding, bit); - v[10] = half_btf_sse4_1(cospi44, u[10], cospim20, u[13], rnding, bit); - v[11] = half_btf_sse4_1(cospi12, u[11], cospim52, u[12], rnding, bit); - v[12] = half_btf_sse4_1(cospi52, u[11], cospi12, u[12], rnding, bit); - v[13] = half_btf_sse4_1(cospi20, u[10], cospi44, u[13], rnding, bit); - v[14] = half_btf_sse4_1(cospi36, u[9], cospi28, u[14], rnding, bit); - v[15] = half_btf_sse4_1(cospi4, u[8], cospi60, u[15], rnding, bit); + v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); // stage 3 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; - u[4] = half_btf_sse4_1(cospi56, v[4], cospim8, v[7], rnding, bit); - u[5] = half_btf_sse4_1(cospi24, v[5], cospim40, v[6], rnding, bit); - u[6] = half_btf_sse4_1(cospi40, v[5], cospi24, v[6], rnding, bit); - u[7] = half_btf_sse4_1(cospi8, v[4], cospi56, v[7], rnding, bit); + u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); + u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); + u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); u[8] = _mm_add_epi32(v[8], v[9]); u[9] = _mm_sub_epi32(v[8], v[9]); u[10] = _mm_sub_epi32(v[11], v[10]); @@ -945,19 +965,19 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[1] = _mm_add_epi32(v[1], rnding); v[1] = _mm_srai_epi32(v[1], bit); - v[2] = half_btf_sse4_1(cospi48, u[2], cospim16, u[3], rnding, bit); - v[3] = half_btf_sse4_1(cospi16, u[2], cospi48, u[3], rnding, bit); + v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); v[4] = _mm_add_epi32(u[4], u[5]); v[5] = _mm_sub_epi32(u[4], u[5]); v[6] = _mm_sub_epi32(u[7], u[6]); v[7] = _mm_add_epi32(u[6], u[7]); v[8] = u[8]; - v[9] = half_btf_sse4_1(cospim16, u[9], cospi48, u[14], rnding, bit); - v[10] = half_btf_sse4_1(cospim48, u[10], cospim16, u[13], rnding, bit); + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); v[11] = u[11]; v[12] = u[12]; - v[13] = half_btf_sse4_1(cospim16, u[10], cospi48, u[13], rnding, bit); - v[14] = half_btf_sse4_1(cospi48, u[9], cospi16, u[14], rnding, bit); + v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); v[15] = u[15]; // stage 5 @@ -1043,7 +1063,7 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { } static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); @@ -1183,18 +1203,18 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; - v[4] = half_btf_sse4_1(cospi16, u[4], cospi48, u[5], rnding, bit); - v[5] = half_btf_sse4_1(cospi48, u[4], cospim16, u[5], rnding, bit); - v[6] = half_btf_sse4_1(cospim48, u[6], cospi16, u[7], rnding, bit); - v[7] = half_btf_sse4_1(cospi16, u[6], cospi48, u[7], rnding, bit); + v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); v[8] = u[8]; v[9] = u[9]; v[10] = u[10]; v[11] = u[11]; - v[12] = half_btf_sse4_1(cospi16, u[12], cospi48, u[13], rnding, bit); - v[13] = half_btf_sse4_1(cospi48, u[12], cospim16, u[13], rnding, bit); - v[14] = half_btf_sse4_1(cospim48, u[14], cospi16, u[15], rnding, bit); - v[15] = half_btf_sse4_1(cospi16, u[14], cospi48, u[15], rnding, bit); + v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); // stage 5 u[0] = _mm_add_epi32(v[0], v[4]); @@ -1223,14 +1243,14 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { v[5] = u[5]; v[6] = u[6]; v[7] = u[7]; - v[8] = half_btf_sse4_1(cospi8, u[8], cospi56, u[9], rnding, bit); - v[9] = half_btf_sse4_1(cospi56, u[8], cospim8, u[9], rnding, bit); - v[10] = half_btf_sse4_1(cospi40, u[10], cospi24, u[11], rnding, bit); - v[11] = half_btf_sse4_1(cospi24, u[10], cospim40, u[11], rnding, bit); - v[12] = half_btf_sse4_1(cospim56, u[12], cospi8, u[13], rnding, bit); - v[13] = half_btf_sse4_1(cospi8, u[12], cospi56, u[13], rnding, bit); - v[14] = half_btf_sse4_1(cospim24, u[14], cospi40, u[15], rnding, bit); - v[15] = half_btf_sse4_1(cospi40, u[14], cospi24, u[15], rnding, bit); + v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); // stage 7 u[0] = _mm_add_epi32(v[0], v[8]); @@ -1251,22 +1271,22 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { u[15] = _mm_sub_epi32(v[7], v[15]); // stage 8 - v[0] = half_btf_sse4_1(cospi2, u[0], cospi62, u[1], rnding, bit); - v[1] = half_btf_sse4_1(cospi62, u[0], cospim2, u[1], rnding, bit); - v[2] = half_btf_sse4_1(cospi10, u[2], cospi54, u[3], rnding, bit); - v[3] = half_btf_sse4_1(cospi54, u[2], cospim10, u[3], rnding, bit); - v[4] = half_btf_sse4_1(cospi18, u[4], cospi46, u[5], rnding, bit); - v[5] = half_btf_sse4_1(cospi46, u[4], cospim18, u[5], rnding, bit); - v[6] = half_btf_sse4_1(cospi26, u[6], cospi38, u[7], rnding, bit); - v[7] = half_btf_sse4_1(cospi38, u[6], cospim26, u[7], rnding, bit); - v[8] = half_btf_sse4_1(cospi34, u[8], cospi30, u[9], rnding, bit); - v[9] = half_btf_sse4_1(cospi30, u[8], cospim34, u[9], rnding, bit); - v[10] = half_btf_sse4_1(cospi42, u[10], cospi22, u[11], rnding, bit); - v[11] = half_btf_sse4_1(cospi22, u[10], cospim42, u[11], rnding, bit); - v[12] = half_btf_sse4_1(cospi50, u[12], cospi14, u[13], rnding, bit); - v[13] = half_btf_sse4_1(cospi14, u[12], cospim50, u[13], rnding, bit); - v[14] = half_btf_sse4_1(cospi58, u[14], cospi6, u[15], rnding, bit); - v[15] = half_btf_sse4_1(cospi6, u[14], cospim58, u[15], rnding, bit); + v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); + v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); + v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); + v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); + v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); // stage 9 out[0 * 4 + col] = v[1]; @@ -1298,99 +1318,109 @@ static void round_shift_16x16(__m128i *in, int shift) { void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output, int stride, int tx_type, int bd) { __m128i in[64], out[64]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG *row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &inv_txfm_2d_cfg_dct_dct_16; + row_cfg = &inv_txfm_1d_row_cfg_dct_16; + col_cfg = &inv_txfm_1d_col_cfg_dct_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case DCT_ADST: - cfg = &inv_txfm_2d_cfg_dct_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_dct_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_16; + row_cfg = &inv_txfm_1d_row_cfg_dct_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; case ADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd); break; #if CONFIG_EXT_TX case FLIPADST_DCT: - cfg = &inv_txfm_2d_cfg_adst_dct_16; + row_cfg = &inv_txfm_1d_row_cfg_dct_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; case DCT_FLIPADST: - cfg = &inv_txfm_2d_cfg_dct_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_dct_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd); + idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case ADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd); break; case FLIPADST_FLIPADST: - cfg = &inv_txfm_2d_cfg_adst_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 1, 1, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 1, 1, -row_cfg->shift[1], bd); break; case FLIPADST_ADST: - cfg = &inv_txfm_2d_cfg_adst_adst_16; + row_cfg = &inv_txfm_1d_row_cfg_adst_16; + col_cfg = &inv_txfm_1d_col_cfg_adst_16; load_buffer_16x16(coeff, in); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]); - round_shift_16x16(in, -cfg->shift[0]); + iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]); + round_shift_16x16(in, -row_cfg->shift[0]); transpose_16x16(in, out); - iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]); - write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd); + iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]); + write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd); break; #endif default: assert(0); |