7 files changed, 520 insertions, 99 deletions
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
new file mode 100644
index 000000000..c8d4ccb70
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "./av1_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc = _mm_unpacklo_epi16(*p, zero);
+  const __m128i ac = _mm_unpackhi_epi16(*p, zero);
+  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static INLINE void update_qp(__m256i *qp) {
+  qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+  qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+  qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+                           const int16_t *dequant_ptr, int log_scale,
+                           __m256i *qp) {
+  __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+  round = _mm_srai_epi16(round, log_scale);
+  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+  init_one_qp(&round, &qp[0]);
+  init_one_qp(&quant, &qp[1]);
+  init_one_qp(&dequant, &qp[2]);
+}
+
+static INLINE void quantize(const __m256i *qp, __m256i *c,
+                            const int16_t *iscan_ptr, int log_scale,
+                            tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                            __m256i *eob) {
+  const __m256i abs = _mm256_abs_epi32(*c);
+  __m256i q = _mm256_add_epi32(abs, qp[0]);
+
+  __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
+  __m256i q_hi = _mm256_srli_epi64(q, 32);
+  const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
+  q_hi = _mm256_mul_epi32(q_hi, qp_hi);
+  q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
+  q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
+  q_hi = _mm256_slli_epi64(q_hi, 32);
+  q = _mm256_or_si256(q_lo, q_hi);
+
+  __m256i dq = _mm256_mullo_epi32(q, qp[2]);
+  dq = _mm256_srai_epi32(dq, log_scale);
+  q = _mm256_sign_epi32(q, *c);
+  dq = _mm256_sign_epi32(dq, *c);
+
+  _mm256_storeu_si256((__m256i *)qcoeff, q);
+  _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+  const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
+  const __m128i zr = _mm_setzero_si128();
+  const __m128i lo = _mm_unpacklo_epi16(isc, zr);
+  const __m128i hi = _mm_unpackhi_epi16(isc, zr);
+  const __m256i iscan =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
+  const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
+  __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
+  cur_eob = _mm256_and_si256(cur_eob, nz);
+  *eob = _mm256_max_epi32(cur_eob, *eob);
+}
+
+void av1_highbd_quantize_fp_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, int log_scale) {
+  (void)scan;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  const unsigned int step = 8;
+
+  if (LIKELY(!skip_block)) {
+    __m256i qp[3], coeff;
+
+    init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp);
+    coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+
+    __m256i eob = _mm256_setzero_si256();
+    quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan += step;
+    n_coeffs -= step;
+
+    update_qp(qp);
+    while (n_coeffs > 0) {
+      coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+      quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+      coeff_ptr += step;
+      qcoeff_ptr += step;
+      dqcoeff_ptr += step;
+      iscan += step;
+      n_coeffs -= step;
+    }
+    {
+      __m256i eob_s;
+      eob_s = _mm256_shuffle_epi32(eob, 0xe);
+      eob = _mm256_max_epi16(eob, eob_s);
+      eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+      eob = _mm256_max_epi16(eob, eob_s);
+      eob_s = _mm256_shufflelo_epi16(eob, 1);
+      eob = _mm256_max_epi16(eob, eob_s);
+      const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+                                              _mm256_extractf128_si256(eob, 1));
+      *eob_ptr = _mm_extract_epi16(final_eob, 0);
+    }
+  } else {
+    do {
+      const __m256i zero = _mm256_setzero_si256();
+      _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero);
+      _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero);
+      qcoeff_ptr += step;
+      dqcoeff_ptr += step;
+      n_coeffs -= step;
+    } while (n_coeffs > 0);
+    *eob_ptr = 0;
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
index fa5626002..8d717a083 100644
--- a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -133,9 +133,10 @@ void av1_highbd_quantize_fp_sse4_1(
     coeff[0] = _mm_loadu_si128((__m128i const *)src);
 
     qparam[0] =
-        _mm_set_epi32(round_ptr[1], round_ptr[1], round_ptr[1], round_ptr[0]);
-    qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[0]);
-    qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[0]);
+        _mm_set_epi32(round_ptr[1] >> log_scale, round_ptr[1] >> log_scale,
+                      round_ptr[1] >> log_scale, round_ptr[0] >> log_scale);
+    qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[0]);
+    qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[0]);
 
     // DC and first 3 AC
     quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
@@ -143,8 +144,8 @@ void av1_highbd_quantize_fp_sse4_1(
 
     // update round/quan/dquan for AC
     qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
-    qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[1]);
-    qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[1]);
+    qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[1]);
+    qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[1]);
 
     quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
                           log_scale, quanAddr, dquanAddr);
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
new file mode 100644
index 000000000..1c0a120ca
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "./av1_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, __m256i *c) {
+#if CONFIG_HIGHBITDEPTH
+  const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff);
+  const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1);
+  *c = _mm256_packs_epi32(x0, x1);
+  *c = _mm256_permute4x64_epi64(*c, 0xD8);
+#else
+  *c = _mm256_loadu_si256((const __m256i *)coeff);
+#endif
+}
+
+static INLINE void write_zero(tran_low_t *qcoeff) {
+  const __m256i zero = _mm256_setzero_si256();
+#if CONFIG_HIGHBITDEPTH
+  _mm256_storeu_si256((__m256i *)qcoeff, zero);
+  _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
+#else
+  _mm256_storeu_si256((__m256i *)qcoeff, zero);
+#endif
+}
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i ac = _mm_unpackhi_epi64(*p, *p);
+  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+                           const int16_t *dequant_ptr, int log_scale,
+                           __m256i *thr, __m256i *qp) {
+  __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+  if (log_scale > 0) {
+    const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1));
+    round = _mm_add_epi16(round, rnd);
+    round = _mm_srai_epi16(round, log_scale);
+  }
+
+  init_one_qp(&round, &qp[0]);
+  init_one_qp(&quant, &qp[1]);
+
+  if (log_scale > 0) {
+    qp[1] = _mm256_slli_epi16(qp[1], log_scale);
+  }
+
+  init_one_qp(&dequant, &qp[2]);
+  *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+}
+
+static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) {
+  qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+  qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+  qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+  *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+}
+
+#define store_quan(q, addr)                               \
+  do {                                                    \
+    __m256i sign_bits = _mm256_srai_epi16(q, 15);         \
+    __m256i y0 = _mm256_unpacklo_epi16(q, sign_bits);     \
+    __m256i y1 = _mm256_unpackhi_epi16(q, sign_bits);     \
+    __m256i x0 = _mm256_permute2x128_si256(y0, y1, 0x20); \
+    __m256i x1 = _mm256_permute2x128_si256(y0, y1, 0x31); \
+    _mm256_storeu_si256((__m256i *)addr, x0);             \
+    _mm256_storeu_si256((__m256i *)addr + 1, x1);         \
+  } while (0)
+
+#if CONFIG_HIGHBITDEPTH
+#define store_two_quan(q, addr1, dq, addr2) \
+  do {                                      \
+    store_quan(q, addr1);                   \
+    store_quan(dq, addr2);                  \
+  } while (0)
+#else
+#define store_two_quan(q, addr1, dq, addr2)    \
+  do {                                         \
+    _mm256_storeu_si256((__m256i *)addr1, q);  \
+    _mm256_storeu_si256((__m256i *)addr2, dq); \
+  } while (0)
+#endif
+
+static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
+                            const int16_t *iscan_ptr, tran_low_t *qcoeff,
+                            tran_low_t *dqcoeff, __m256i *eob) {
+  const __m256i abs = _mm256_abs_epi16(*c);
+  __m256i mask = _mm256_cmpgt_epi16(abs, *thr);
+  mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr));
+  const int nzflag = _mm256_movemask_epi8(mask);
+
+  if (nzflag) {
+    __m256i q = _mm256_adds_epi16(abs, qp[0]);
+    q = _mm256_mulhi_epi16(q, qp[1]);
+    q = _mm256_sign_epi16(q, *c);
+    const __m256i dq = _mm256_mullo_epi16(q, qp[2]);
+
+    store_two_quan(q, qcoeff, dq, dqcoeff);
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+    const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+    const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+    __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+    cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+    *eob = _mm256_max_epi16(*eob, cur_eob);
+  } else {
+    write_zero(qcoeff);
+    write_zero(dqcoeff);
+  }
+}
+
+void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          int skip_block, const int16_t *zbin_ptr,
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  const unsigned int step = 16;
+
+  if (LIKELY(!skip_block)) {
+    __m256i qp[3];
+    __m256i coeff, thr;
+    const int log_scale = 0;
+
+    init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+    read_coeff(coeff_ptr, &coeff);
+
+    __m256i eob = _mm256_setzero_si256();
+    quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan_ptr += step;
+    n_coeffs -= step;
+
+    update_qp(log_scale, &thr, qp);
+
+    while (n_coeffs > 0) {
+      read_coeff(coeff_ptr, &coeff);
+      quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+      coeff_ptr += step;
+      qcoeff_ptr += step;
+      dqcoeff_ptr += step;
+      iscan_ptr += step;
+      n_coeffs -= step;
+    }
+    {
+      __m256i eob_s;
+      eob_s = _mm256_shuffle_epi32(eob, 0xe);
+      eob = _mm256_max_epi16(eob, eob_s);
+      eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+      eob = _mm256_max_epi16(eob, eob_s);
+      eob_s = _mm256_shufflelo_epi16(eob, 1);
+      eob = _mm256_max_epi16(eob, eob_s);
+      const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+                                              _mm256_extractf128_si256(eob, 1));
+      *eob_ptr = _mm_extract_epi16(final_eob, 0);
+    }
+  } else {
+    do {
+      write_zero(qcoeff_ptr);
+      write_zero(dqcoeff_ptr);
+      qcoeff_ptr += step;
+      dqcoeff_ptr += step;
+      n_coeffs -= step;
+    } while (n_coeffs > 0);
+    *eob_ptr = 0;
+  }
+}
+
+static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp,
+                                  __m256i *c, const int16_t *iscan_ptr,
+                                  tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                                  __m256i *eob) {
+  const __m256i abs = _mm256_abs_epi16(*c);
+  __m256i mask = _mm256_cmpgt_epi16(abs, *thr);
+  mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr));
+  const int nzflag = _mm256_movemask_epi8(mask);
+
+  if (nzflag) {
+    __m256i q = _mm256_adds_epi16(abs, qp[0]);
+    q = _mm256_mulhi_epu16(q, qp[1]);
+
+    __m256i dq = _mm256_mullo_epi16(q, qp[2]);
+    dq = _mm256_srli_epi16(dq, 1);
+
+    q = _mm256_sign_epi16(q, *c);
+    dq = _mm256_sign_epi16(dq, *c);
+
+    store_two_quan(q, qcoeff, dq, dqcoeff);
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+    const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+    const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+    __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+    cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+    *eob = _mm256_max_epi16(*eob, cur_eob);
+  } else {
+    write_zero(qcoeff);
+    write_zero(dqcoeff);
+  }
+}
+
+void av1_quantize_fp_32x32_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  const unsigned int step = 16;
+
+  if (LIKELY(!skip_block)) {
+    __m256i qp[3];
+    __m256i coeff, thr;
+    const int log_scale = 1;
+
+    init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+    read_coeff(coeff_ptr, &coeff);
+
+    __m256i eob = _mm256_setzero_si256();
+    quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan_ptr += step;
+    n_coeffs -= step;
+
+    update_qp(log_scale, &thr, qp);
+
+    while (n_coeffs > 0) {
+      read_coeff(coeff_ptr, &coeff);
+      quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+                     &eob);
+
+      coeff_ptr += step;
+      qcoeff_ptr += step;
+      dqcoeff_ptr += step;
+      iscan_ptr += step;
+      n_coeffs -= step;
+    }
+    {
+      __m256i eob_s;
+      eob_s = _mm256_shuffle_epi32(eob, 0xe);
+      eob = _mm256_max_epi16(eob, eob_s);
+      eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+      eob = _mm256_max_epi16(eob, eob_s);
+      eob_s = _mm256_shufflelo_epi16(eob, 1);
+      eob = _mm256_max_epi16(eob, eob_s);
+      const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+                                              _mm256_extractf128_si256(eob, 1));
+      *eob_ptr = _mm_extract_epi16(final_eob, 0);
+    }
+  } else {
+    do {
+      write_zero(qcoeff_ptr);
+      write_zero(dqcoeff_ptr);
+      qcoeff_ptr += step;
+      dqcoeff_ptr += step;
+      n_coeffs -= step;
+    } while (n_coeffs > 0);
+    *eob_ptr = 0;
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
index 37c4b0d88..496c33395 100644
--- a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
+++ b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
@@ -203,8 +203,12 @@ static void fidtx4_sse2(__m128i *in) {
 #endif  // CONFIG_EXT_TX
 
 void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
-                     int tx_type) {
+                     TxfmParam *txfm_param) {
   __m128i in[4];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   switch (tx_type) {
     case DCT_DCT: aom_fdct4x4_sse2(input, output, stride); break;
@@ -1301,8 +1305,12 @@ static void fidtx8_sse2(__m128i *in) {
 #endif  // CONFIG_EXT_TX
 
 void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
-                     int tx_type) {
+                     TxfmParam *txfm_param) {
   __m128i in[8];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   switch (tx_type) {
     case DCT_DCT: aom_fdct8x8_sse2(input, output, stride); break;
@@ -2334,8 +2342,12 @@ static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
 #endif  // CONFIG_EXT_TX
 
 void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
-                       int tx_type) {
+                       TxfmParam *txfm_param) {
   __m128i in0[16], in1[16];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   switch (tx_type) {
     case DCT_DCT:
@@ -2550,8 +2562,12 @@ static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) {
 }
 
 void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
-                     int tx_type) {
+                     TxfmParam *txfm_param) {
   __m128i in[8];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   switch (tx_type) {
     case DCT_DCT:
@@ -2724,8 +2740,12 @@ static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) {
 }
 
 void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride,
-                     int tx_type) {
+                     TxfmParam *txfm_param) {
   __m128i in[8];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   switch (tx_type) {
     case DCT_DCT:
@@ -2864,8 +2884,12 @@ static void row_8x16_rounding(__m128i *in, int bits) {
 }
 
 void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
-                      int tx_type) {
+                      TxfmParam *txfm_param) {
   __m128i in[16];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   __m128i *const t = in;      // Alias to top 8x8 sub block
   __m128i *const b = in + 8;  // Alias to bottom 8x8 sub block
@@ -3045,8 +3069,12 @@ static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in,
 #define col_16x8_rounding row_8x16_rounding
 
 void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
-                      int tx_type) {
+                      TxfmParam *txfm_param) {
   __m128i in[16];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   __m128i *const l = in;      // Alias to left 8x8 sub block
   __m128i *const r = in + 8;  // Alias to right 8x8 sub block, which we store
@@ -3355,8 +3383,12 @@ static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
 // For 16x32, this means the input is a 2x2 grid of such blocks.
 // For 32x16, it means the input is a 4x1 grid.
 void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
-                       int tx_type) {
+                       TxfmParam *txfm_param) {
   __m128i intl[16], intr[16], inbl[16], inbr[16];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   switch (tx_type) {
     case DCT_DCT:
@@ -3544,8 +3576,12 @@ static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0,
 }
 
 void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
-                       int tx_type) {
+                       TxfmParam *txfm_param) {
   __m128i in0[16], in1[16], in2[16], in3[16];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
   switch (tx_type) {
@@ -3784,8 +3820,12 @@ static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
 }
 
 void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride,
-                       int tx_type) {
+                       TxfmParam *txfm_param) {
   __m128i in0[32], in1[32], in2[32], in3[32];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "No 32x32 sse2 MRC_DCT implementation");
+#endif
 
   load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 0);
   switch (tx_type) {
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
index ae733a1ce..20ba4149c 100644
--- a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
+++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
@@ -14,7 +14,20 @@
 #include "./av1_rtcd.h"
 #include "aom/aom_integer.h"
 
-int64_t av1_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+                              __m256i *c) {
+  const tran_low_t *addr = coeff + offset;
+#if CONFIG_HIGHBITDEPTH
+  const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
+  const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
+  const __m256i y = _mm256_packs_epi32(x0, x1);
+  *c = _mm256_permute4x64_epi64(y, 0xD8);
+#else
+  *c = _mm256_loadu_si256((const __m256i *)addr);
+#endif
+}
+
+int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                              intptr_t block_size, int64_t *ssz) {
   __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
   __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
@@ -22,16 +35,16 @@ int64_t av1_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
   __m128i sse_reg128, ssz_reg128;
   int64_t sse;
   int i;
-  const __m256i zero_reg = _mm256_set1_epi16(0);
+  const __m256i zero_reg = _mm256_setzero_si256();
 
   // init sse and ssz registerd to zero
-  sse_reg = _mm256_set1_epi16(0);
-  ssz_reg = _mm256_set1_epi16(0);
+  sse_reg = _mm256_setzero_si256();
+  ssz_reg = _mm256_setzero_si256();
 
   for (i = 0; i < block_size; i += 16) {
     // load 32 bytes from coeff and dqcoeff
-    coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
-    dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
+    read_coeff(coeff, i, &coeff_reg);
+    read_coeff(dqcoeff, i, &dqcoeff_reg);
     // dqcoeff - coeff
     dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
     // madd (dqcoeff - coeff)
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index b56eed518..cab36f2bd 100644
--- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -113,25 +113,13 @@ static void fdct4x4_sse4_1(__m128i *in, int bit) {
   in[3] = _mm_unpackhi_epi64(v1, v3);
 }
 
-static INLINE void write_buffer_4x4(__m128i *res, tran_low_t *output) {
+static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
   _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
   _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
   _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
   _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
 }
 
-// Note:
-//  We implement av1_fwd_txfm2d_4x4(). This function is kept here since
-//  av1_highbd_fht4x4_c() is not removed yet
-void av1_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output,
-                              int stride, int tx_type) {
-  (void)input;
-  (void)output;
-  (void)stride;
-  (void)tx_type;
-  assert(0);
-}
-
 static void fadst4x4_sse4_1(__m128i *in, int bit) {
   const int32_t *cospi = cospi_arr(bit);
   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
@@ -416,7 +404,7 @@ static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
   in[15] = _mm_srai_epi32(in[15], shift);
 }
 
-static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) {
+static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) {
   _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
   _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
   _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
@@ -1800,7 +1788,7 @@ static void col_txfm_16x16_rounding(__m128i *in, int shift) {
   col_txfm_8x8_rounding(&in[48], shift);
 }
 
-static void write_buffer_16x16(const __m128i *in, tran_low_t *output) {
+static void write_buffer_16x16(const __m128i *in, int32_t *output) {
   const int size_8x8 = 16 * 4;
   write_buffer_8x8(&in[0], output);
   output += size_8x8;
diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index 8495ad1aa..af8e9a5f4 100644
--- a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -18,51 +18,6 @@
 #include "aom_dsp/txfm_common.h"
 #include "aom_dsp/x86/txfm_common_avx2.h"
 
-static int32_t get_16x16_sum(const int16_t *input, int stride) {
-  __m256i r0, r1, r2, r3, u0, u1;
-  __m256i zero = _mm256_setzero_si256();
-  __m256i sum = _mm256_setzero_si256();
-  const int16_t *blockBound = input + (stride << 4);
-  __m128i v0, v1;
-
-  while (input < blockBound) {
-    r0 = _mm256_loadu_si256((__m256i const *)input);
-    r1 = _mm256_loadu_si256((__m256i const *)(input + stride));
-    r2 = _mm256_loadu_si256((__m256i const *)(input + 2 * stride));
-    r3 = _mm256_loadu_si256((__m256i const *)(input + 3 * stride));
-
-    u0 = _mm256_add_epi16(r0, r1);
-    u1 = _mm256_add_epi16(r2, r3);
-    sum = _mm256_add_epi16(sum, u0);
-    sum = _mm256_add_epi16(sum, u1);
-
-    input += stride << 2;
-  }
-
-  // unpack 16 int16_t into 2x8 int32_t
-  u0 = _mm256_unpacklo_epi16(zero, sum);
-  u1 = _mm256_unpackhi_epi16(zero, sum);
-  u0 = _mm256_srai_epi32(u0, 16);
-  u1 = _mm256_srai_epi32(u1, 16);
-  sum = _mm256_add_epi32(u0, u1);
-
-  u0 = _mm256_srli_si256(sum, 8);
-  u1 = _mm256_add_epi32(sum, u0);
-
-  v0 = _mm_add_epi32(_mm256_extracti128_si256(u1, 1),
-                     _mm256_castsi256_si128(u1));
-  v1 = _mm_srli_si128(v0, 4);
-  v0 = _mm_add_epi32(v0, v1);
-  return (int32_t)_mm_extract_epi32(v0, 0);
-}
-
-void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output,
-                          int stride) {
-  int32_t dc = get_16x16_sum(input, stride);
-  output[0] = (tran_low_t)(dc >> 1);
-  _mm256_zeroupper();
-}
-
 static INLINE void load_buffer_16x16(const int16_t *input, int stride,
                                      int flipud, int fliplr, __m256i *in) {
   if (!flipud) {
@@ -959,8 +914,12 @@ static void fidtx16_avx2(__m256i *in) {
 #endif
 
 void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
-                       int tx_type) {
+                       TxfmParam *txfm_param) {
   __m256i in[16];
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif
 
   switch (tx_type) {
     case DCT_DCT:
@@ -1084,22 +1043,6 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
   _mm256_zeroupper();
 }
 
-void aom_fdct32x32_1_avx2(const int16_t *input, tran_low_t *output,
-                          int stride) {
-  // left and upper corner
-  int32_t sum = get_16x16_sum(input, stride);
-  // right and upper corner
-  sum += get_16x16_sum(input + 16, stride);
-  // left and lower corner
-  sum += get_16x16_sum(input + (stride << 4), stride);
-  // right and lower corner
-  sum += get_16x16_sum(input + (stride << 4) + 16, stride);
-
-  sum >>= 3;
-  output[0] = (tran_low_t)sum;
-  _mm256_zeroupper();
-}
-
 static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
   int i = 0;
   __m256i temp;
@@ -1570,9 +1513,13 @@ static void fidtx32_avx2(__m256i *in0, __m256i *in1) {
 #endif
 
 void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
-                       int tx_type) {
+                       TxfmParam *txfm_param) {
   __m256i in0[32];  // left 32 columns
   __m256i in1[32];  // right 32 columns
+  int tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "No avx2 32x32 implementation of MRC_DCT");
+#endif
 
   switch (tx_type) {
     case DCT_DCT: