summaryrefslogtreecommitdiffstats
path: root/third_party/aom/av1/encoder/x86
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/av1/encoder/x86')
-rw-r--r--third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c143
-rw-r--r--third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c11
-rw-r--r--third_party/aom/av1/encoder/x86/av1_quantize_avx2.c289
-rw-r--r--third_party/aom/av1/encoder/x86/dct_intrin_sse2.c60
-rw-r--r--third_party/aom/av1/encoder/x86/error_intrin_avx2.c25
-rw-r--r--third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c18
-rw-r--r--third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c73
7 files changed, 520 insertions, 99 deletions
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
new file mode 100644
index 000000000..c8d4ccb70
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "./av1_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc = _mm_unpacklo_epi16(*p, zero);
+ const __m128i ac = _mm_unpackhi_epi16(*p, zero);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static INLINE void update_qp(__m256i *qp) {
+ qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+ qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+ qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr, int log_scale,
+ __m256i *qp) {
+ __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ round = _mm_srai_epi16(round, log_scale);
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+ init_one_qp(&round, &qp[0]);
+ init_one_qp(&quant, &qp[1]);
+ init_one_qp(&dequant, &qp[2]);
+}
+
+static INLINE void quantize(const __m256i *qp, __m256i *c,
+ const int16_t *iscan_ptr, int log_scale,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i abs = _mm256_abs_epi32(*c);
+ __m256i q = _mm256_add_epi32(abs, qp[0]);
+
+ __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
+ __m256i q_hi = _mm256_srli_epi64(q, 32);
+ const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
+ q_hi = _mm256_mul_epi32(q_hi, qp_hi);
+ q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
+ q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
+ q_hi = _mm256_slli_epi64(q_hi, 32);
+ q = _mm256_or_si256(q_lo, q_hi);
+
+ __m256i dq = _mm256_mullo_epi32(q, qp[2]);
+ dq = _mm256_srai_epi32(dq, log_scale);
+ q = _mm256_sign_epi32(q, *c);
+ dq = _mm256_sign_epi32(dq, *c);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
+ const __m128i zr = _mm_setzero_si128();
+ const __m128i lo = _mm_unpacklo_epi16(isc, zr);
+ const __m128i hi = _mm_unpackhi_epi16(isc, zr);
+ const __m256i iscan =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
+ const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
+ __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
+ cur_eob = _mm256_and_si256(cur_eob, nz);
+ *eob = _mm256_max_epi32(cur_eob, *eob);
+}
+
+void av1_highbd_quantize_fp_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale) {
+ (void)scan;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ const unsigned int step = 8;
+
+ if (LIKELY(!skip_block)) {
+ __m256i qp[3], coeff;
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp);
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+ while (n_coeffs > 0) {
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+ {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+ const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+ _mm256_extractf128_si256(eob, 1));
+ *eob_ptr = _mm_extract_epi16(final_eob, 0);
+ }
+ } else {
+ do {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero);
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ n_coeffs -= step;
+ } while (n_coeffs > 0);
+ *eob_ptr = 0;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
index fa5626002..8d717a083 100644
--- a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -133,9 +133,10 @@ void av1_highbd_quantize_fp_sse4_1(
coeff[0] = _mm_loadu_si128((__m128i const *)src);
qparam[0] =
- _mm_set_epi32(round_ptr[1], round_ptr[1], round_ptr[1], round_ptr[0]);
- qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[0]);
- qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[0]);
+ _mm_set_epi32(round_ptr[1] >> log_scale, round_ptr[1] >> log_scale,
+ round_ptr[1] >> log_scale, round_ptr[0] >> log_scale);
+ qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[0]);
+ qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[0]);
// DC and first 3 AC
quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
@@ -143,8 +144,8 @@ void av1_highbd_quantize_fp_sse4_1(
// update round/quan/dquan for AC
qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
- qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[1]);
- qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[1]);
+ qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[1]);
+ qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[1]);
quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
log_scale, quanAddr, dquanAddr);
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
new file mode 100644
index 000000000..1c0a120ca
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "./av1_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, __m256i *c) {
+#if CONFIG_HIGHBITDEPTH
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1);
+ *c = _mm256_packs_epi32(x0, x1);
+ *c = _mm256_permute4x64_epi64(*c, 0xD8);
+#else
+ *c = _mm256_loadu_si256((const __m256i *)coeff);
+#endif
+}
+
+static INLINE void write_zero(tran_low_t *qcoeff) {
+ const __m256i zero = _mm256_setzero_si256();
+#if CONFIG_HIGHBITDEPTH
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
+#else
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+#endif
+}
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i ac = _mm_unpackhi_epi64(*p, *p);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr, int log_scale,
+ __m256i *thr, __m256i *qp) {
+ __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+ if (log_scale > 0) {
+ const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1));
+ round = _mm_add_epi16(round, rnd);
+ round = _mm_srai_epi16(round, log_scale);
+ }
+
+ init_one_qp(&round, &qp[0]);
+ init_one_qp(&quant, &qp[1]);
+
+ if (log_scale > 0) {
+ qp[1] = _mm256_slli_epi16(qp[1], log_scale);
+ }
+
+ init_one_qp(&dequant, &qp[2]);
+ *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+}
+
+static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) {
+ qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+ qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+ qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+ *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+}
+
+#define store_quan(q, addr) \
+ do { \
+ __m256i sign_bits = _mm256_srai_epi16(q, 15); \
+ __m256i y0 = _mm256_unpacklo_epi16(q, sign_bits); \
+ __m256i y1 = _mm256_unpackhi_epi16(q, sign_bits); \
+ __m256i x0 = _mm256_permute2x128_si256(y0, y1, 0x20); \
+ __m256i x1 = _mm256_permute2x128_si256(y0, y1, 0x31); \
+ _mm256_storeu_si256((__m256i *)addr, x0); \
+ _mm256_storeu_si256((__m256i *)addr + 1, x1); \
+ } while (0)
+
+#if CONFIG_HIGHBITDEPTH
+#define store_two_quan(q, addr1, dq, addr2) \
+ do { \
+ store_quan(q, addr1); \
+ store_quan(dq, addr2); \
+ } while (0)
+#else
+#define store_two_quan(q, addr1, dq, addr2) \
+ do { \
+ _mm256_storeu_si256((__m256i *)addr1, q); \
+ _mm256_storeu_si256((__m256i *)addr2, dq); \
+ } while (0)
+#endif
+
+static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
+ const int16_t *iscan_ptr, tran_low_t *qcoeff,
+ tran_low_t *dqcoeff, __m256i *eob) {
+ const __m256i abs = _mm256_abs_epi16(*c);
+ __m256i mask = _mm256_cmpgt_epi16(abs, *thr);
+ mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr));
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
+ __m256i q = _mm256_adds_epi16(abs, qp[0]);
+ q = _mm256_mulhi_epi16(q, qp[1]);
+ q = _mm256_sign_epi16(q, *c);
+ const __m256i dq = _mm256_mullo_epi16(q, qp[2]);
+
+ store_two_quan(q, qcoeff, dq, dqcoeff);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+ const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+ const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+ __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+ cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+ *eob = _mm256_max_epi16(*eob, cur_eob);
+ } else {
+ write_zero(qcoeff);
+ write_zero(dqcoeff);
+ }
+}
+
+void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ const unsigned int step = 16;
+
+ if (LIKELY(!skip_block)) {
+ __m256i qp[3];
+ __m256i coeff, thr;
+ const int log_scale = 0;
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+ read_coeff(coeff_ptr, &coeff);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(log_scale, &thr, qp);
+
+ while (n_coeffs > 0) {
+ read_coeff(coeff_ptr, &coeff);
+ quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+ }
+ {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+ const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+ _mm256_extractf128_si256(eob, 1));
+ *eob_ptr = _mm_extract_epi16(final_eob, 0);
+ }
+ } else {
+ do {
+ write_zero(qcoeff_ptr);
+ write_zero(dqcoeff_ptr);
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ n_coeffs -= step;
+ } while (n_coeffs > 0);
+ *eob_ptr = 0;
+ }
+}
+
+static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp,
+ __m256i *c, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i abs = _mm256_abs_epi16(*c);
+ __m256i mask = _mm256_cmpgt_epi16(abs, *thr);
+ mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr));
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
+ __m256i q = _mm256_adds_epi16(abs, qp[0]);
+ q = _mm256_mulhi_epu16(q, qp[1]);
+
+ __m256i dq = _mm256_mullo_epi16(q, qp[2]);
+ dq = _mm256_srli_epi16(dq, 1);
+
+ q = _mm256_sign_epi16(q, *c);
+ dq = _mm256_sign_epi16(dq, *c);
+
+ store_two_quan(q, qcoeff, dq, dqcoeff);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+ const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+ const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+ __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+ cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+ *eob = _mm256_max_epi16(*eob, cur_eob);
+ } else {
+ write_zero(qcoeff);
+ write_zero(dqcoeff);
+ }
+}
+
+void av1_quantize_fp_32x32_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ const unsigned int step = 16;
+
+ if (LIKELY(!skip_block)) {
+ __m256i qp[3];
+ __m256i coeff, thr;
+ const int log_scale = 1;
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+ read_coeff(coeff_ptr, &coeff);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(log_scale, &thr, qp);
+
+ while (n_coeffs > 0) {
+ read_coeff(coeff_ptr, &coeff);
+ quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+ &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+ }
+ {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+ const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+ _mm256_extractf128_si256(eob, 1));
+ *eob_ptr = _mm_extract_epi16(final_eob, 0);
+ }
+ } else {
+ do {
+ write_zero(qcoeff_ptr);
+ write_zero(dqcoeff_ptr);
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ n_coeffs -= step;
+ } while (n_coeffs > 0);
+ *eob_ptr = 0;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
index 37c4b0d88..496c33395 100644
--- a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
+++ b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
@@ -203,8 +203,12 @@ static void fidtx4_sse2(__m128i *in) {
#endif // CONFIG_EXT_TX
void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
- int tx_type) {
+ TxfmParam *txfm_param) {
__m128i in[4];
+ int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+ assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
switch (tx_type) {
case DCT_DCT: aom_fdct4x4_sse2(input, output, stride); break;
@@ -1301,8 +1305,12 @@ static void fidtx8_sse2(__m128i *in) {
#endif // CONFIG_EXT_TX
void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
- int tx_type) {
+ TxfmParam *txfm_param) {
__m128i in[8];
+ int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+ assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
switch (tx_type) {
case DCT_DCT: aom_fdct8x8_sse2(input, output, stride); break;
@@ -2334,8 +2342,12 @@ static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
#endif // CONFIG_EXT_TX
void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
- int tx_type) {
+ TxfmParam *txfm_param) {
__m128i in0[16], in1[16];
+ int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+ assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
switch (tx_type) {
case DCT_DCT:
@@ -2550,8 +2562,12 @@ static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) {
}
void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
- int tx_type) {
+ TxfmParam *txfm_param) {
__m128i in[8];
+ int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+ assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
switch (tx_type) {
case DCT_DCT:
@@ -2724,8 +2740,12 @@ static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) {
}
void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride,
- int tx_type) {
+ TxfmParam *txfm_param) {
__m128i in[8];
+ int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+ assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
switch (tx_type) {
case DCT_DCT:
@@ -2864,8 +2884,12 @@ static void row_8x16_rounding(__m128i *in, int bits) {
}
void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
- int tx_type) {
+ TxfmParam *txfm_param) {
__m128i in[16];
+ int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+ assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
__m128i *const t = in; // Alias to top 8x8 sub block
__m128i *const b = in + 8; // Alias to bottom 8x8 sub block
@@ -3045,8 +3069,12 @@ static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in,
#define col_16x8_rounding row_8x16_rounding
void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
- int tx_type) {
+ TxfmParam *txfm_param) {
__m128i in[16];
+ int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+ assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
__m128i *const l = in; // Alias to left 8x8 sub block
__m128i *const r = in + 8; // Alias to right 8x8 sub block, which we store
@@ -3355,8 +3383,12 @@ static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
// For 16x32, this means the input is a 2x2 grid of such blocks.
// For 32x16, it means the input is a 4x1 grid.
void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
- int tx_type) {
+ TxfmParam *txfm_param) {
__m128i intl[16], intr[16], inbl[16], inbr[16];
+ int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+ assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
switch (tx_type) {
case DCT_DCT:
@@ -3544,8 +3576,12 @@ static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0,
}
void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
- int tx_type) {
+ TxfmParam *txfm_param) {
__m128i in0[16], in1[16], in2[16], in3[16];
+ int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+ assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
switch (tx_type) {
@@ -3784,8 +3820,12 @@ static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
}
void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride,
- int tx_type) {
+ TxfmParam *txfm_param) {
__m128i in0[32], in1[32], in2[32], in3[32];
+ int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+ assert(tx_type != MRC_DCT && "No 32x32 sse2 MRC_DCT implementation");
+#endif
load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 0);
switch (tx_type) {
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
index ae733a1ce..20ba4149c 100644
--- a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
+++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
@@ -14,7 +14,20 @@
#include "./av1_rtcd.h"
#include "aom/aom_integer.h"
-int64_t av1_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+ __m256i *c) {
+ const tran_low_t *addr = coeff + offset;
+#if CONFIG_HIGHBITDEPTH
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
+ const __m256i y = _mm256_packs_epi32(x0, x1);
+ *c = _mm256_permute4x64_epi64(y, 0xD8);
+#else
+ *c = _mm256_loadu_si256((const __m256i *)addr);
+#endif
+}
+
+int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
intptr_t block_size, int64_t *ssz) {
__m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
__m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
@@ -22,16 +35,16 @@ int64_t av1_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
__m128i sse_reg128, ssz_reg128;
int64_t sse;
int i;
- const __m256i zero_reg = _mm256_set1_epi16(0);
+ const __m256i zero_reg = _mm256_setzero_si256();
// init sse and ssz registerd to zero
- sse_reg = _mm256_set1_epi16(0);
- ssz_reg = _mm256_set1_epi16(0);
+ sse_reg = _mm256_setzero_si256();
+ ssz_reg = _mm256_setzero_si256();
for (i = 0; i < block_size; i += 16) {
// load 32 bytes from coeff and dqcoeff
- coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
- dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
+ read_coeff(coeff, i, &coeff_reg);
+ read_coeff(dqcoeff, i, &dqcoeff_reg);
// dqcoeff - coeff
dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
// madd (dqcoeff - coeff)
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index b56eed518..cab36f2bd 100644
--- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -113,25 +113,13 @@ static void fdct4x4_sse4_1(__m128i *in, int bit) {
in[3] = _mm_unpackhi_epi64(v1, v3);
}
-static INLINE void write_buffer_4x4(__m128i *res, tran_low_t *output) {
+static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
_mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
_mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
_mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
_mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
}
-// Note:
-// We implement av1_fwd_txfm2d_4x4(). This function is kept here since
-// av1_highbd_fht4x4_c() is not removed yet
-void av1_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output,
- int stride, int tx_type) {
- (void)input;
- (void)output;
- (void)stride;
- (void)tx_type;
- assert(0);
-}
-
static void fadst4x4_sse4_1(__m128i *in, int bit) {
const int32_t *cospi = cospi_arr(bit);
const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
@@ -416,7 +404,7 @@ static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
in[15] = _mm_srai_epi32(in[15], shift);
}
-static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) {
+static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) {
_mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
_mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
_mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
@@ -1800,7 +1788,7 @@ static void col_txfm_16x16_rounding(__m128i *in, int shift) {
col_txfm_8x8_rounding(&in[48], shift);
}
-static void write_buffer_16x16(const __m128i *in, tran_low_t *output) {
+static void write_buffer_16x16(const __m128i *in, int32_t *output) {
const int size_8x8 = 16 * 4;
write_buffer_8x8(&in[0], output);
output += size_8x8;
diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index 8495ad1aa..af8e9a5f4 100644
--- a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -18,51 +18,6 @@
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/txfm_common_avx2.h"
-static int32_t get_16x16_sum(const int16_t *input, int stride) {
- __m256i r0, r1, r2, r3, u0, u1;
- __m256i zero = _mm256_setzero_si256();
- __m256i sum = _mm256_setzero_si256();
- const int16_t *blockBound = input + (stride << 4);
- __m128i v0, v1;
-
- while (input < blockBound) {
- r0 = _mm256_loadu_si256((__m256i const *)input);
- r1 = _mm256_loadu_si256((__m256i const *)(input + stride));
- r2 = _mm256_loadu_si256((__m256i const *)(input + 2 * stride));
- r3 = _mm256_loadu_si256((__m256i const *)(input + 3 * stride));
-
- u0 = _mm256_add_epi16(r0, r1);
- u1 = _mm256_add_epi16(r2, r3);
- sum = _mm256_add_epi16(sum, u0);
- sum = _mm256_add_epi16(sum, u1);
-
- input += stride << 2;
- }
-
- // unpack 16 int16_t into 2x8 int32_t
- u0 = _mm256_unpacklo_epi16(zero, sum);
- u1 = _mm256_unpackhi_epi16(zero, sum);
- u0 = _mm256_srai_epi32(u0, 16);
- u1 = _mm256_srai_epi32(u1, 16);
- sum = _mm256_add_epi32(u0, u1);
-
- u0 = _mm256_srli_si256(sum, 8);
- u1 = _mm256_add_epi32(sum, u0);
-
- v0 = _mm_add_epi32(_mm256_extracti128_si256(u1, 1),
- _mm256_castsi256_si128(u1));
- v1 = _mm_srli_si128(v0, 4);
- v0 = _mm_add_epi32(v0, v1);
- return (int32_t)_mm_extract_epi32(v0, 0);
-}
-
-void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output,
- int stride) {
- int32_t dc = get_16x16_sum(input, stride);
- output[0] = (tran_low_t)(dc >> 1);
- _mm256_zeroupper();
-}
-
static INLINE void load_buffer_16x16(const int16_t *input, int stride,
int flipud, int fliplr, __m256i *in) {
if (!flipud) {
@@ -959,8 +914,12 @@ static void fidtx16_avx2(__m256i *in) {
#endif
void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
- int tx_type) {
+ TxfmParam *txfm_param) {
__m256i in[16];
+ int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+ assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
switch (tx_type) {
case DCT_DCT:
@@ -1084,22 +1043,6 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
_mm256_zeroupper();
}
-void aom_fdct32x32_1_avx2(const int16_t *input, tran_low_t *output,
- int stride) {
- // left and upper corner
- int32_t sum = get_16x16_sum(input, stride);
- // right and upper corner
- sum += get_16x16_sum(input + 16, stride);
- // left and lower corner
- sum += get_16x16_sum(input + (stride << 4), stride);
- // right and lower corner
- sum += get_16x16_sum(input + (stride << 4) + 16, stride);
-
- sum >>= 3;
- output[0] = (tran_low_t)sum;
- _mm256_zeroupper();
-}
-
static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
int i = 0;
__m256i temp;
@@ -1570,9 +1513,13 @@ static void fidtx32_avx2(__m256i *in0, __m256i *in1) {
#endif
void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
- int tx_type) {
+ TxfmParam *txfm_param) {
__m256i in0[32]; // left 32 columns
__m256i in1[32]; // right 32 columns
+ int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+ assert(tx_type != MRC_DCT && "No avx2 32x32 implementation of MRC_DCT");
+#endif
switch (tx_type) {
case DCT_DCT: