author | trav90 <travawine@palemoon.org> | 2018-10-18 06:04:57 -0500
committer | trav90 <travawine@palemoon.org> | 2018-10-18 06:04:57 -0500
commit | 7369c7d7a5eed32963d8af37658286617919f91c (patch)
tree | 5397ce7ee9bca1641118fdc3187bd9e2b24fdc9c /third_party/aom/av1/encoder/x86
parent | 77887af9c4ad1420bbdb33984af4f74b55ca59db (diff)
download | UXP-7369c7d7a5eed32963d8af37658286617919f91c.tar UXP-7369c7d7a5eed32963d8af37658286617919f91c.tar.gz UXP-7369c7d7a5eed32963d8af37658286617919f91c.tar.lz UXP-7369c7d7a5eed32963d8af37658286617919f91c.tar.xz UXP-7369c7d7a5eed32963d8af37658286617919f91c.zip
Update aom to commit id f5bdeac22930ff4c6b219be49c843db35970b918
Diffstat (limited to 'third_party/aom/av1/encoder/x86')
7 files changed, 520 insertions, 99 deletions
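
The new AVX2 files below vectorize libaom's "fast path" (FP) quantizer. For orientation only, here is a scalar sketch of roughly what those kernels compute per coefficient; it is not part of the commit, the helper name is hypothetical, it assumes log_scale == 0, and it omits the dequant-based skip threshold the SIMD code uses.

#include <stdint.h>
#include <stdlib.h>

/* Scalar model of the FP quantizer (hypothetical helper, log_scale == 0,
 * no skip threshold): q = (|c| + round) * quant >> 16, dq = q * dequant,
 * eob = last nonzero position in scan order. */
static void quantize_fp_sketch(const int16_t *coeff, int n,
                               const int16_t round[2], const int16_t quant[2],
                               const int16_t dequant[2], int16_t *qcoeff,
                               int16_t *dqcoeff, const int16_t *iscan,
                               uint16_t *eob_ptr) {
  uint16_t eob = 0;
  for (int i = 0; i < n; i++) {
    const int band = (i == 0) ? 0 : 1;       /* index 0 = DC, 1 = AC params */
    const int32_t abs_c = abs(coeff[i]);
    int32_t q = ((abs_c + round[band]) * quant[band]) >> 16;  /* Q1.16 mulhi */
    if (coeff[i] < 0) q = -q;                /* reapply the coefficient sign */
    qcoeff[i] = (int16_t)q;
    dqcoeff[i] = (int16_t)(q * dequant[band]);
    if (q != 0 && (uint16_t)(iscan[i] + 1) > eob)
      eob = (uint16_t)(iscan[i] + 1);        /* track end-of-block */
  }
  *eob_ptr = eob;
}
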
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
new file mode 100644
index 000000000..c8d4ccb70
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "./av1_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc = _mm_unpacklo_epi16(*p, zero);
+  const __m128i ac = _mm_unpackhi_epi16(*p, zero);
+  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static INLINE void update_qp(__m256i *qp) {
+  qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+  qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+  qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+                           const int16_t *dequant_ptr, int log_scale,
+                           __m256i *qp) {
+  __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+  round = _mm_srai_epi16(round, log_scale);
+  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+  init_one_qp(&round, &qp[0]);
+  init_one_qp(&quant, &qp[1]);
+  init_one_qp(&dequant, &qp[2]);
+}
+
+static INLINE void quantize(const __m256i *qp, __m256i *c,
+                            const int16_t *iscan_ptr, int log_scale,
+                            tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                            __m256i *eob) {
+  const __m256i abs = _mm256_abs_epi32(*c);
+  __m256i q = _mm256_add_epi32(abs, qp[0]);
+
+  __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
+  __m256i q_hi = _mm256_srli_epi64(q, 32);
+  const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
+  q_hi = _mm256_mul_epi32(q_hi, qp_hi);
+  q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
+  q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
+  q_hi = _mm256_slli_epi64(q_hi, 32);
+  q = _mm256_or_si256(q_lo, q_hi);
+
+  __m256i dq = _mm256_mullo_epi32(q, qp[2]);
+  dq = _mm256_srai_epi32(dq, log_scale);
+  q = _mm256_sign_epi32(q, *c);
+  dq = _mm256_sign_epi32(dq, *c);
+
+  _mm256_storeu_si256((__m256i *)qcoeff, q);
+  _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+  const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
+  const __m128i zr = _mm_setzero_si128();
+  const __m128i lo = _mm_unpacklo_epi16(isc, zr);
+  const __m128i hi = _mm_unpackhi_epi16(isc, zr);
+  const __m256i iscan =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
+  const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
+  __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
+  cur_eob = _mm256_and_si256(cur_eob, nz);
+  *eob = _mm256_max_epi32(cur_eob, *eob);
+}
+
+void av1_highbd_quantize_fp_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, int log_scale) {
+  (void)scan;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  const unsigned int step = 8;
+
+  if (LIKELY(!skip_block)) {
+    __m256i qp[3], coeff;
+
+    init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp);
+    coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+
+    __m256i eob = _mm256_setzero_si256();
+    quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan += step;
+    n_coeffs -= step;
+
+    update_qp(qp);
+    while (n_coeffs > 0) {
+      coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+      quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+      coeff_ptr += step;
+      qcoeff_ptr += step;
+      dqcoeff_ptr += step;
+      iscan += step;
+      n_coeffs -= step;
+    }
+    {
+      __m256i eob_s;
+      eob_s = _mm256_shuffle_epi32(eob, 0xe);
+      eob = _mm256_max_epi16(eob, eob_s);
+      eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+      eob = _mm256_max_epi16(eob, eob_s);
+      eob_s = _mm256_shufflelo_epi16(eob, 1);
+      eob = _mm256_max_epi16(eob, eob_s);
+      const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+                                              _mm256_extractf128_si256(eob, 1));
+      *eob_ptr = _mm_extract_epi16(final_eob, 0);
+    }
+  } else {
+    do {
+      const __m256i zero = _mm256_setzero_si256();
+      _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero);
+      _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero);
+      qcoeff_ptr += step;
+      dqcoeff_ptr += step;
+      n_coeffs -= step;
+    } while (n_coeffs > 0);
+    *eob_ptr = 0;
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
index fa5626002..8d717a083 100644
--- a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -133,9 +133,10 @@ void av1_highbd_quantize_fp_sse4_1(
     coeff[0] = _mm_loadu_si128((__m128i const *)src);
 
     qparam[0] =
-        _mm_set_epi32(round_ptr[1], round_ptr[1], round_ptr[1], round_ptr[0]);
-    qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[0]);
-    qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[0]);
+        _mm_set_epi32(round_ptr[1] >> log_scale, round_ptr[1] >> log_scale,
+                      round_ptr[1] >> log_scale, round_ptr[0] >> log_scale);
+    qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[0]);
+    qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[0]);
 
     // DC and first 3 AC
     quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
@@ -143,8 +144,8 @@ void av1_highbd_quantize_fp_sse4_1(
 
     // update round/quan/dquan for AC
     qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
-    qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[1]);
-    qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[1]);
+    qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[1]);
+    qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[1]);
     quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
                           log_scale, quanAddr, dquanAddr);
 
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
new file mode 100644
index 000000000..1c0a120ca
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "./av1_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, __m256i *c) {
+#if CONFIG_HIGHBITDEPTH
+  const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff);
+  const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1);
+  *c = _mm256_packs_epi32(x0, x1);
+  *c = _mm256_permute4x64_epi64(*c, 0xD8);
+#else
+  *c = _mm256_loadu_si256((const __m256i *)coeff);
+#endif
+}
+
+static INLINE void write_zero(tran_low_t *qcoeff) {
+  const __m256i zero = _mm256_setzero_si256();
+#if CONFIG_HIGHBITDEPTH
+  _mm256_storeu_si256((__m256i *)qcoeff, zero);
+  _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
+#else
+  _mm256_storeu_si256((__m256i *)qcoeff, zero);
+#endif
+}
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i ac = _mm_unpackhi_epi64(*p, *p);
+  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+                           const int16_t *dequant_ptr, int log_scale,
+                           __m256i *thr, __m256i *qp) {
+  __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+  if (log_scale > 0) {
+    const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1));
+    round = _mm_add_epi16(round, rnd);
+    round = _mm_srai_epi16(round, log_scale);
+  }
+
+  init_one_qp(&round, &qp[0]);
+  init_one_qp(&quant, &qp[1]);
+
+  if (log_scale > 0) {
+    qp[1] = _mm256_slli_epi16(qp[1], log_scale);
+  }
+
+  init_one_qp(&dequant, &qp[2]);
+  *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+}
+
+static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) {
+  qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+  qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+  qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+  *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+}
+
+#define store_quan(q, addr)                                       \
+  do {                                                            \
+    __m256i sign_bits = _mm256_srai_epi16(q, 15);                 \
+    __m256i y0 = _mm256_unpacklo_epi16(q, sign_bits);             \
+    __m256i y1 = _mm256_unpackhi_epi16(q, sign_bits);             \
+    __m256i x0 = _mm256_permute2x128_si256(y0, y1, 0x20);         \
+    __m256i x1 = _mm256_permute2x128_si256(y0, y1, 0x31);         \
+    _mm256_storeu_si256((__m256i *)addr, x0);                     \
+    _mm256_storeu_si256((__m256i *)addr + 1, x1);                 \
+  } while (0)
+
+#if CONFIG_HIGHBITDEPTH
+#define store_two_quan(q, addr1, dq, addr2) \
+  do {                                      \
+    store_quan(q, addr1);                   \
+    store_quan(dq, addr2);                  \
+  } while (0)
+#else
+#define store_two_quan(q, addr1, dq, addr2)       \
+  do {                                            \
+    _mm256_storeu_si256((__m256i *)addr1, q);     \
+    _mm256_storeu_si256((__m256i *)addr2, dq);    \
+  } while (0)
+#endif
+
+static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
+                            const int16_t *iscan_ptr, tran_low_t *qcoeff,
+                            tran_low_t *dqcoeff, __m256i *eob) {
+  const __m256i abs = _mm256_abs_epi16(*c);
+  __m256i mask = _mm256_cmpgt_epi16(abs, *thr);
+  mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr));
+  const int nzflag = _mm256_movemask_epi8(mask);
+
+  if (nzflag) {
+    __m256i q = _mm256_adds_epi16(abs, qp[0]);
+    q = _mm256_mulhi_epi16(q, qp[1]);
+    q = _mm256_sign_epi16(q, *c);
+    const __m256i dq = _mm256_mullo_epi16(q, qp[2]);
+
+    store_two_quan(q, qcoeff, dq, dqcoeff);
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+    const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+    const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+    __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+    cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+    *eob = _mm256_max_epi16(*eob, cur_eob);
+  } else {
+    write_zero(qcoeff);
+    write_zero(dqcoeff);
+  }
+}
+
+void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          int skip_block, const int16_t *zbin_ptr,
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  const unsigned int step = 16;
+
+  if (LIKELY(!skip_block)) {
+    __m256i qp[3];
+    __m256i coeff, thr;
+    const int log_scale = 0;
+
+    init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+    read_coeff(coeff_ptr, &coeff);
+
+    __m256i eob = _mm256_setzero_si256();
+    quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan_ptr += step;
+    n_coeffs -= step;
+
+    update_qp(log_scale, &thr, qp);
+
+    while (n_coeffs > 0) {
+      read_coeff(coeff_ptr, &coeff);
+      quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+      coeff_ptr += step;
+      qcoeff_ptr += step;
+      dqcoeff_ptr += step;
+      iscan_ptr += step;
+      n_coeffs -= step;
+    }
+    {
+      __m256i eob_s;
+      eob_s = _mm256_shuffle_epi32(eob, 0xe);
+      eob = _mm256_max_epi16(eob, eob_s);
+      eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+      eob = _mm256_max_epi16(eob, eob_s);
+      eob_s = _mm256_shufflelo_epi16(eob, 1);
+      eob = _mm256_max_epi16(eob, eob_s);
+      const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+                                              _mm256_extractf128_si256(eob, 1));
+      *eob_ptr = _mm_extract_epi16(final_eob, 0);
+    }
+  } else {
+    do {
+      write_zero(qcoeff_ptr);
+      write_zero(dqcoeff_ptr);
+      qcoeff_ptr += step;
+      dqcoeff_ptr += step;
+      n_coeffs -= step;
+    } while (n_coeffs > 0);
+    *eob_ptr = 0;
+  }
+}
+
+static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp,
+                                  __m256i *c, const int16_t *iscan_ptr,
+                                  tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                                  __m256i *eob) {
+  const __m256i abs = _mm256_abs_epi16(*c);
+  __m256i mask = _mm256_cmpgt_epi16(abs, *thr);
+  mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr));
+  const int nzflag = _mm256_movemask_epi8(mask);
+
+  if (nzflag) {
+    __m256i q = _mm256_adds_epi16(abs, qp[0]);
+    q = _mm256_mulhi_epu16(q, qp[1]);
+
+    __m256i dq = _mm256_mullo_epi16(q, qp[2]);
+    dq = _mm256_srli_epi16(dq, 1);
+
+    q = _mm256_sign_epi16(q, *c);
+    dq = _mm256_sign_epi16(dq, *c);
+
+    store_two_quan(q, qcoeff, dq, dqcoeff);
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+    const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+    const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+    __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+    cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+    *eob = _mm256_max_epi16(*eob, cur_eob);
+  } else {
+    write_zero(qcoeff);
+    write_zero(dqcoeff);
+  }
+}
+
+void av1_quantize_fp_32x32_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  const unsigned int step = 16;
+
+  if (LIKELY(!skip_block)) {
+    __m256i qp[3];
+    __m256i coeff, thr;
+    const int log_scale = 1;
+
+    init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+    read_coeff(coeff_ptr, &coeff);
+
+    __m256i eob = _mm256_setzero_si256();
+    quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan_ptr += step;
+    n_coeffs -= step;
+
+    update_qp(log_scale, &thr, qp);
+
+    while (n_coeffs > 0) {
+      read_coeff(coeff_ptr, &coeff);
+      quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+                     &eob);
+
+      coeff_ptr += step;
+      qcoeff_ptr += step;
+      dqcoeff_ptr += step;
+      iscan_ptr += step;
+      n_coeffs -= step;
+    }
+    {
+      __m256i eob_s;
+      eob_s = _mm256_shuffle_epi32(eob, 0xe);
+      eob = _mm256_max_epi16(eob, eob_s);
+      eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+      eob = _mm256_max_epi16(eob, eob_s);
+      eob_s = _mm256_shufflelo_epi16(eob, 1);
+      eob = _mm256_max_epi16(eob, eob_s);
+      const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+                                              _mm256_extractf128_si256(eob, 1));
+      *eob_ptr = _mm_extract_epi16(final_eob, 0);
+    }
+  } else {
+    do {
+      write_zero(qcoeff_ptr);
+      write_zero(dqcoeff_ptr);
+      qcoeff_ptr += step;
+      dqcoeff_ptr += step;
+      n_coeffs -= step;
+    } while (n_coeffs > 0);
+    *eob_ptr = 0;
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
index 37c4b0d88..496c33395 100644
--- a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
+++ b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
@@ -203,8 +203,12 @@ static void fidtx4_sse2(__m128i *in) {
 #endif  // CONFIG_EXT_TX
 
 void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
-                     int tx_type) {
+                     TxfmParam *txfm_param) {
   __m128i in[4];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   switch (tx_type) {
     case DCT_DCT: aom_fdct4x4_sse2(input, output, stride); break;
@@ -1301,8 +1305,12 @@ static void fidtx8_sse2(__m128i *in) {
 #endif  // CONFIG_EXT_TX
 
 void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
-                     int tx_type) {
+                     TxfmParam *txfm_param) {
   __m128i in[8];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   switch (tx_type) {
     case DCT_DCT: aom_fdct8x8_sse2(input, output, stride); break;
@@ -2334,8 +2342,12 @@ static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
 #endif  // CONFIG_EXT_TX
 
 void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
-                       int tx_type) {
+                       TxfmParam *txfm_param) {
   __m128i in0[16], in1[16];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   switch (tx_type) {
     case DCT_DCT:
@@ -2550,8 +2562,12 @@ static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) {
 }
 
 void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
-                     int tx_type) {
+                     TxfmParam *txfm_param) {
   __m128i in[8];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   switch (tx_type) {
     case DCT_DCT:
@@ -2724,8 +2740,12 @@ static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) {
 }
 
 void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride,
-                     int tx_type) {
+                     TxfmParam *txfm_param) {
   __m128i in[8];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   switch (tx_type) {
     case DCT_DCT:
@@ -2864,8 +2884,12 @@ static void row_8x16_rounding(__m128i *in, int bits) {
 }
 
 void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
-                      int tx_type) {
+                      TxfmParam *txfm_param) {
   __m128i in[16];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   __m128i *const t = in;      // Alias to top 8x8 sub block
   __m128i *const b = in + 8;  // Alias to bottom 8x8 sub block
@@ -3045,8 +3069,12 @@ static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in,
 #define col_16x8_rounding row_8x16_rounding
 
 void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
-                      int tx_type) {
+                      TxfmParam *txfm_param) {
   __m128i in[16];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   __m128i *const l = in;      // Alias to left 8x8 sub block
   __m128i *const r = in + 8;  // Alias to right 8x8 sub block, which we store
@@ -3355,8 +3383,12 @@ static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
 // For 16x32, this means the input is a 2x2 grid of such blocks.
 // For 32x16, it means the input is a 4x1 grid.
 void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
-                       int tx_type) {
+                       TxfmParam *txfm_param) {
   __m128i intl[16], intr[16], inbl[16], inbr[16];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   switch (tx_type) {
     case DCT_DCT:
@@ -3544,8 +3576,12 @@ static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0,
 }
 
 void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
-                       int tx_type) {
+                       TxfmParam *txfm_param) {
   __m128i in0[16], in1[16], in2[16], in3[16];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
   switch (tx_type) {
@@ -3784,8 +3820,12 @@ static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
 }
 
 void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride,
-                       int tx_type) {
+                       TxfmParam *txfm_param) {
   __m128i in0[32], in1[32], in2[32], in3[32];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "No 32x32 sse2 MRC_DCT implementation");
+#endif
 
   load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 0);
   switch (tx_type) {
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
index ae733a1ce..20ba4149c 100644
--- a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
+++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
@@ -14,7 +14,20 @@
 #include "./av1_rtcd.h"
 #include "aom/aom_integer.h"
 
-int64_t av1_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+                              __m256i *c) {
+  const tran_low_t *addr = coeff + offset;
+#if CONFIG_HIGHBITDEPTH
+  const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
+  const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
+  const __m256i y = _mm256_packs_epi32(x0, x1);
+  *c = _mm256_permute4x64_epi64(y, 0xD8);
+#else
+  *c = _mm256_loadu_si256((const __m256i *)addr);
+#endif
+}
+
+int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                              intptr_t block_size, int64_t *ssz) {
   __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
   __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
@@ -22,16 +35,16 @@ int64_t av1_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
   __m128i sse_reg128, ssz_reg128;
   int64_t sse;
   int i;
-  const __m256i zero_reg = _mm256_set1_epi16(0);
+  const __m256i zero_reg = _mm256_setzero_si256();
 
   // init sse and ssz registerd to zero
-  sse_reg = _mm256_set1_epi16(0);
-  ssz_reg = _mm256_set1_epi16(0);
+  sse_reg = _mm256_setzero_si256();
+  ssz_reg = _mm256_setzero_si256();
 
   for (i = 0; i < block_size; i += 16) {
     // load 32 bytes from coeff and dqcoeff
-    coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
-    dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
+    read_coeff(coeff, i, &coeff_reg);
+    read_coeff(dqcoeff, i, &dqcoeff_reg);
     // dqcoeff - coeff
     dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
     // madd (dqcoeff - coeff)
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index b56eed518..cab36f2bd 100644
--- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -113,25 +113,13 @@
   in[3] = _mm_unpackhi_epi64(v1, v3);
 }
 
-static INLINE void write_buffer_4x4(__m128i *res, tran_low_t *output) {
+static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
   _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
   _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
   _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
   _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
 }
 
-// Note:
-// We implement av1_fwd_txfm2d_4x4(). This function is kept here since
-// av1_highbd_fht4x4_c() is not removed yet
-void av1_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output,
-                              int stride, int tx_type) {
-  (void)input;
-  (void)output;
-  (void)stride;
-  (void)tx_type;
-  assert(0);
-}
-
 static void fadst4x4_sse4_1(__m128i *in, int bit) {
   const int32_t *cospi = cospi_arr(bit);
   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
@@ -416,7 +404,7 @@ static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
   in[15] = _mm_srai_epi32(in[15], shift);
 }
 
-static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) {
+static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) {
   _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
   _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
   _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
@@ -1800,7 +1788,7 @@ static void col_txfm_16x16_rounding(__m128i *in, int shift) {
   col_txfm_8x8_rounding(&in[48], shift);
 }
 
-static void write_buffer_16x16(const __m128i *in, tran_low_t *output) {
+static void write_buffer_16x16(const __m128i *in, int32_t *output) {
   const int size_8x8 = 16 * 4;
   write_buffer_8x8(&in[0], output);
   output += size_8x8;
diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index 8495ad1aa..af8e9a5f4 100644
--- a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -18,51 +18,6 @@
 #include "aom_dsp/txfm_common.h"
 #include "aom_dsp/x86/txfm_common_avx2.h"
 
-static int32_t get_16x16_sum(const int16_t *input, int stride) {
-  __m256i r0, r1, r2, r3, u0, u1;
-  __m256i zero = _mm256_setzero_si256();
-  __m256i sum = _mm256_setzero_si256();
-  const int16_t *blockBound = input + (stride << 4);
-  __m128i v0, v1;
-
-  while (input < blockBound) {
-    r0 = _mm256_loadu_si256((__m256i const *)input);
-    r1 = _mm256_loadu_si256((__m256i const *)(input + stride));
-    r2 = _mm256_loadu_si256((__m256i const *)(input + 2 * stride));
-    r3 = _mm256_loadu_si256((__m256i const *)(input + 3 * stride));
-
-    u0 = _mm256_add_epi16(r0, r1);
-    u1 = _mm256_add_epi16(r2, r3);
-    sum = _mm256_add_epi16(sum, u0);
-    sum = _mm256_add_epi16(sum, u1);
-
-    input += stride << 2;
-  }
-
-  // unpack 16 int16_t into 2x8 int32_t
-  u0 = _mm256_unpacklo_epi16(zero, sum);
-  u1 = _mm256_unpackhi_epi16(zero, sum);
-  u0 = _mm256_srai_epi32(u0, 16);
-  u1 = _mm256_srai_epi32(u1, 16);
-  sum = _mm256_add_epi32(u0, u1);
-
-  u0 = _mm256_srli_si256(sum, 8);
-  u1 = _mm256_add_epi32(sum, u0);
-
-  v0 = _mm_add_epi32(_mm256_extracti128_si256(u1, 1),
-                     _mm256_castsi256_si128(u1));
-  v1 = _mm_srli_si128(v0, 4);
-  v0 = _mm_add_epi32(v0, v1);
-  return (int32_t)_mm_extract_epi32(v0, 0);
-}
-
-void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output,
-                          int stride) {
-  int32_t dc = get_16x16_sum(input, stride);
-  output[0] = (tran_low_t)(dc >> 1);
-  _mm256_zeroupper();
-}
-
 static INLINE void load_buffer_16x16(const int16_t *input, int stride,
                                      int flipud, int fliplr, __m256i *in) {
   if (!flipud) {
@@ -959,8 +914,12 @@ static void fidtx16_avx2(__m256i *in) {
 #endif
 
 void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
-                       int tx_type) {
+                       TxfmParam *txfm_param) {
   __m256i in[16];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   switch (tx_type) {
     case DCT_DCT:
@@ -1084,22 +1043,6 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
   _mm256_zeroupper();
 }
 
-void aom_fdct32x32_1_avx2(const int16_t *input, tran_low_t *output,
-                          int stride) {
-  // left and upper corner
-  int32_t sum = get_16x16_sum(input, stride);
-  // right and upper corner
-  sum += get_16x16_sum(input + 16, stride);
-  // left and lower corner
-  sum += get_16x16_sum(input + (stride << 4), stride);
-  // right and lower corner
-  sum += get_16x16_sum(input + (stride << 4) + 16, stride);
-
-  sum >>= 3;
-  output[0] = (tran_low_t)sum;
-  _mm256_zeroupper();
-}
-
 static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
   int i = 0;
   __m256i temp;
@@ -1570,9 +1513,13 @@ static void fidtx32_avx2(__m256i *in0, __m256i *in1) {
 #endif
 
 void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
-                       int tx_type) {
+                       TxfmParam *txfm_param) {
   __m256i in0[32];  // left 32 columns
   __m256i in1[32];  // right 32 columns
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "No avx2 32x32 implementation of MRC_DCT");
+#endif
 
   switch (tx_type) {
     case DCT_DCT: