diff options
Diffstat (limited to 'third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c')
-rw-r--r-- | third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c | 75 |
1 files changed, 67 insertions, 8 deletions
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c index abb95f31e..8ec0256eb 100644 --- a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c @@ -14,6 +14,7 @@ #include "av1/common/enums.h" #include "av1/common/av1_txfm.h" #include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" #include "av1/encoder/av1_fwd_txfm1d_cfg.h" #include "av1/encoder/x86/av1_txfm1d_sse4.h" #include "av1/encoder/x86/av1_fwd_txfm_sse2.h" @@ -52,9 +53,22 @@ static void fdct32_new_sse4_1(const __m128i *input, __m128i *output, } } +static void fdct64_new_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 64; + const int num_per_128 = 4; + int col_num = txfm_size / num_per_128; + (void)stage_range; + for (int col = 0; col < col_num; col++) { + av1_fdct64_new_sse4_1((input + col), (output + col), cos_bit, col_num, + col_num); + } +} + static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; break; + case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; break; default: assert(0); } return NULL; @@ -95,6 +109,42 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, transpose_32(txfm_size, buf_128, out_128); } +static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input, + int32_t *output, const int stride, + const TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf) { + assert(cfg->tx_size < TX_SIZES); + const int txfm_size = tx_size_wide[cfg->tx_size]; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + + const int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + int col_num = txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, output, + txfm_size); + /*col wise transform*/ + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + + /*row wise transform*/ + for (int col = 0; col < (col_num >> 1); col++) { + av1_fdct64_new_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row, + col_num, (col_num >> 1)); + } + + txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1); + av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]); + transpose_32x32(buf_128, out_128); +} + void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); @@ -104,6 +154,15 @@ void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); } +void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); + (void)bd; + fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf); +} + static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA, const __m128i *inputB, __m128i *output) { __m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]); @@ -162,8 +221,8 @@ static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, bufA[j] = _mm_cvtepi16_epi32(buf[j]); bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); } - av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row); - av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row); + av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1); av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]); av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]); @@ -209,10 +268,10 @@ static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output, bufA[j] = _mm_cvtepi16_epi32(buf[j]); bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); } - av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row); - av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row); - av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]); - av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); int32_t *output8 = output + 8 * 32 * i; for (int j = 0; j < width_div8; ++j) { @@ -260,8 +319,8 @@ static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output, } av1_fdct32_new_sse4_1(bufA, bufA, cos_bit_row); av1_fdct32_new_sse4_1(bufB, bufB, cos_bit_row); - av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2]); - av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); int32_t *output8 = output + 8 * 32 * i; for (int j = 0; j < (32 / 4); ++j) { |