From 7369c7d7a5eed32963d8af37658286617919f91c Mon Sep 17 00:00:00 2001 From: trav90 Date: Thu, 18 Oct 2018 06:04:57 -0500 Subject: Update aom to commit id f5bdeac22930ff4c6b219be49c843db35970b918 --- .../aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c | 73 +++------------------- 1 file changed, 10 insertions(+), 63 deletions(-) (limited to 'third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c') diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c index 8495ad1aa..af8e9a5f4 100644 --- a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c +++ b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c @@ -18,51 +18,6 @@ #include "aom_dsp/txfm_common.h" #include "aom_dsp/x86/txfm_common_avx2.h" -static int32_t get_16x16_sum(const int16_t *input, int stride) { - __m256i r0, r1, r2, r3, u0, u1; - __m256i zero = _mm256_setzero_si256(); - __m256i sum = _mm256_setzero_si256(); - const int16_t *blockBound = input + (stride << 4); - __m128i v0, v1; - - while (input < blockBound) { - r0 = _mm256_loadu_si256((__m256i const *)input); - r1 = _mm256_loadu_si256((__m256i const *)(input + stride)); - r2 = _mm256_loadu_si256((__m256i const *)(input + 2 * stride)); - r3 = _mm256_loadu_si256((__m256i const *)(input + 3 * stride)); - - u0 = _mm256_add_epi16(r0, r1); - u1 = _mm256_add_epi16(r2, r3); - sum = _mm256_add_epi16(sum, u0); - sum = _mm256_add_epi16(sum, u1); - - input += stride << 2; - } - - // unpack 16 int16_t into 2x8 int32_t - u0 = _mm256_unpacklo_epi16(zero, sum); - u1 = _mm256_unpackhi_epi16(zero, sum); - u0 = _mm256_srai_epi32(u0, 16); - u1 = _mm256_srai_epi32(u1, 16); - sum = _mm256_add_epi32(u0, u1); - - u0 = _mm256_srli_si256(sum, 8); - u1 = _mm256_add_epi32(sum, u0); - - v0 = _mm_add_epi32(_mm256_extracti128_si256(u1, 1), - _mm256_castsi256_si128(u1)); - v1 = _mm_srli_si128(v0, 4); - v0 = _mm_add_epi32(v0, v1); - return (int32_t)_mm_extract_epi32(v0, 0); -} - -void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output, - int stride) { - int32_t dc = get_16x16_sum(input, stride); - output[0] = (tran_low_t)(dc >> 1); - _mm256_zeroupper(); -} - static INLINE void load_buffer_16x16(const int16_t *input, int stride, int flipud, int fliplr, __m256i *in) { if (!flipud) { @@ -959,8 +914,12 @@ static void fidtx16_avx2(__m256i *in) { #endif void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { __m256i in[16]; + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif switch (tx_type) { case DCT_DCT: @@ -1084,22 +1043,6 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, _mm256_zeroupper(); } -void aom_fdct32x32_1_avx2(const int16_t *input, tran_low_t *output, - int stride) { - // left and upper corner - int32_t sum = get_16x16_sum(input, stride); - // right and upper corner - sum += get_16x16_sum(input + 16, stride); - // left and lower corner - sum += get_16x16_sum(input + (stride << 4), stride); - // right and lower corner - sum += get_16x16_sum(input + (stride << 4) + 16, stride); - - sum >>= 3; - output[0] = (tran_low_t)sum; - _mm256_zeroupper(); -} - static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) { int i = 0; __m256i temp; @@ -1570,9 +1513,13 @@ static void fidtx32_avx2(__m256i *in0, __m256i *in1) { #endif void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { __m256i in0[32]; // left 32 columns __m256i in1[32]; // right 32 columns + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "No avx2 32x32 implementation of MRC_DCT"); +#endif switch (tx_type) { case DCT_DCT: -- cgit v1.2.3