63 files changed, 14138 insertions, 8836 deletions
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
index 05aa28c9f..8f61c7eb8 100644
--- a/third_party/aom/av1/encoder/aq_cyclicrefresh.c
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
@@ -39,7 +39,7 @@ struct CYCLIC_REFRESH {
   // RD mult. parameters for segment 1.
   int rdmult;
   // Cyclic refresh map.
-  signed char *map;
+  int8_t *map;
   // Map of the last q a block was coded at.
   uint8_t *last_coded_q_map;
   // Thresholds applied to the projected rate/distortion of the coding block,
@@ -397,6 +397,7 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
   // Set the segmentation map: cycle through the superblocks, starting at
   // cr->mb_index, and stopping when either block_count blocks have been found
   // to be refreshed, or we have passed through whole frame.
+  if (cr->sb_index >= sbs_in_frame) cr->sb_index = 0;
   assert(cr->sb_index < sbs_in_frame);
   i = cr->sb_index;
   cr->target_num_seg_blocks = 0;
diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c
index ab9b3790b..84d967215 100644
--- a/third_party/aom/av1/encoder/aq_variance.c
+++ b/third_party/aom/av1/encoder/aq_variance.c
@@ -151,8 +151,8 @@ static unsigned int block_variance(const AV1_COMP *const cpi, MACROBLOCK *x,
       (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
 
   if (right_overflow || bottom_overflow) {
-    const int bw = 8 * mi_size_wide[bs] - right_overflow;
-    const int bh = 8 * mi_size_high[bs] - bottom_overflow;
+    const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+    const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
     int avg;
 #if CONFIG_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
diff --git a/third_party/aom/av1/encoder/arm/neon/dct_neon.c b/third_party/aom/av1/encoder/arm/neon/dct_neon.c
deleted file mode 100644
index f6ce24a3d..000000000
--- a/third_party/aom/av1/encoder/arm/neon/dct_neon.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-
-#include "av1/common/blockd.h"
-#include "aom_dsp/txfm_common.h"
-
-void av1_fdct8x8_quant_neon(const int16_t *input, int stride,
-                            int16_t *coeff_ptr, intptr_t n_coeffs,
-                            int skip_block, const int16_t *zbin_ptr,
-                            const int16_t *round_ptr, const int16_t *quant_ptr,
-                            const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
-                            int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                            uint16_t *eob_ptr, const int16_t *scan_ptr,
-                            const int16_t *iscan_ptr) {
-  int16_t temp_buffer[64];
-  (void)coeff_ptr;
-
-  aom_fdct8x8_neon(input, temp_buffer, stride);
-  av1_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr,
-                       quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
-                       dequant_ptr, eob_ptr, scan_ptr, iscan_ptr);
-}
diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c
index dd53d4223..033b4ba1a 100644
--- a/third_party/aom/av1/encoder/av1_quantize.c
+++ b/third_party/aom/av1/encoder/av1_quantize.c
@@ -443,11 +443,8 @@ static void quantize_fp_helper_c(
     const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan,
-#if CONFIG_AOM_QM
-    const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
-#endif
-    int log_scale) {
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, int log_scale) {
   int i, eob = -1;
   // TODO(jingning) Decide the need of these arguments after the
   // quantization process is completed.
@@ -464,35 +461,22 @@ static void quantize_fp_helper_c(
     for (i = 0; i < n_coeffs; i++) {
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
-#if CONFIG_AOM_QM
-      const qm_val_t wt = qm_ptr[rc];
-      const qm_val_t iwt = iqm_ptr[rc];
+      const qm_val_t wt = qm_ptr ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+      const qm_val_t iwt = iqm_ptr ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
       const int dequant =
           (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
           AOM_QM_BITS;
-#endif
       const int coeff_sign = (coeff >> 31);
-      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
       int tmp32 = 0;
-#if CONFIG_AOM_QM
       if (abs_coeff * wt >=
           (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
-#else
-      if (abs_coeff >= (dequant_ptr[rc != 0] >> (1 + log_scale))) {
-#endif
         abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
-        abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
-#if CONFIG_AOM_QM
+        abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX);
         tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >>
-                      ((16 - log_scale) + AOM_QM_BITS));
+                      (16 - log_scale + AOM_QM_BITS));
         qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
         dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale);
-#else
-        tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
-        qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-        dqcoeff_ptr[rc] =
-            qcoeff_ptr[rc] * dequant_ptr[rc != 0] / (1 << log_scale);
-#endif
       }
 
       if (tmp32) eob = i;
@@ -501,25 +485,60 @@ static void quantize_fp_helper_c(
   *eob_ptr = eob + 1;
 }
 
+static void highbd_quantize_fp_helper_c(
+    const tran_low_t *coeff_ptr, intptr_t count, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr, int log_scale) {
+  int i;
+  int eob = -1;
+  const int scale = 1 << log_scale;
+  const int shift = 16 - log_scale;
+  // TODO(jingning) Decide the need of these arguments after the
+  // quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+  // Zero both outputs first: with skip_block set they are left all-zero.
+  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Quantization pass: every coefficient is visited in scan order; a NULL
+    // qm_ptr/iqm_ptr selects the flat weight (1 << AOM_QM_BITS).
+    for (i = 0; i < count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+      const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp = abs_coeff + (round_ptr[rc != 0] >> log_scale);
+      const int abs_qcoeff =
+          (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS));
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / scale;
+      if (abs_qcoeff) eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
 void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        int skip_block, const int16_t *zbin_ptr,
                        const int16_t *round_ptr, const int16_t *quant_ptr,
                        const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                        uint16_t *eob_ptr, const int16_t *scan,
-                       const int16_t *iscan
-#if CONFIG_AOM_QM
-                       ,
-                       const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
-#endif
-                       ) {
+                       const int16_t *iscan) {
   quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
                        quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
-                       dequant_ptr, eob_ptr, scan, iscan,
-#if CONFIG_AOM_QM
-                       qm_ptr, iqm_ptr,
-#endif
-                       0);
+                       dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0);
 }
 
 void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -528,19 +547,10 @@ void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              const int16_t *quant_shift_ptr,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                             const int16_t *scan, const int16_t *iscan
-#if CONFIG_AOM_QM
-                             ,
-                             const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
-#endif
-                             ) {
+                             const int16_t *scan, const int16_t *iscan) {
   quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
                        quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
-                       dequant_ptr, eob_ptr, scan, iscan,
-#if CONFIG_AOM_QM
-                       qm_ptr, iqm_ptr,
-#endif
-                       1);
+                       dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1);
 }
 
 #if CONFIG_TX64X64
@@ -550,19 +560,10 @@ void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              const int16_t *quant_shift_ptr,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                             const int16_t *scan, const int16_t *iscan
-#if CONFIG_AOM_QM
-                             ,
-                             const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
-#endif
-                             ) {
+                             const int16_t *scan, const int16_t *iscan) {
   quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr,
                        quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
-                       dequant_ptr, eob_ptr, scan, iscan,
-#if CONFIG_AOM_QM
-                       qm_ptr, iqm_ptr,
-#endif
-                       2);
+                       dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2);
 }
 #endif  // CONFIG_TX64X64
 
@@ -576,58 +577,47 @@ void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 #if CONFIG_AOM_QM
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
-#endif  // CONFIG_AOM_QM
-
-  switch (qparam->log_scale) {
-    case 0:
-      if (n_coeffs < 16) {
-        // TODO(jingning): Need SIMD implementation for smaller block size
-        // quantization.
-        quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                             p->round_fp, p->quant_fp, p->quant_shift,
-                             qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
-                             sc->scan, sc->iscan,
-#if CONFIG_AOM_QM
-                             qm_ptr, iqm_ptr,
-#endif
-                             qparam->log_scale);
-      } else {
-        av1_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
-                        p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                        pd->dequant, eob_ptr, sc->scan, sc->iscan
-#if CONFIG_AOM_QM
-                        ,
-                        qm_ptr, iqm_ptr
-#endif
-                        );
-      }
-      break;
-    case 1:
-      av1_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                            p->round_fp, p->quant_fp, p->quant_shift,
-                            qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
-                            sc->scan, sc->iscan
-#if CONFIG_AOM_QM
-                            ,
-                            qm_ptr, iqm_ptr
+  if (qm_ptr != NULL && iqm_ptr != NULL) {
+    quantize_fp_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+                         p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                         pd->dequant, eob_ptr, sc->scan, sc->iscan, qm_ptr,
+                         iqm_ptr, qparam->log_scale);
+  } else {
 #endif
-                            );
-      break;
+    switch (qparam->log_scale) {
+      case 0:
+        if (n_coeffs < 16) {
+          // TODO(jingning): Need SIMD implementation for smaller block size
+          // quantization.
+          quantize_fp_helper_c(
+              coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+              p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant,
+              eob_ptr, sc->scan, sc->iscan, NULL, NULL, qparam->log_scale);
+        } else {
+          av1_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+                          p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                          pd->dequant, eob_ptr, sc->scan, sc->iscan);
+        }
+        break;
+      case 1:
+        av1_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                              p->round_fp, p->quant_fp, p->quant_shift,
+                              qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+                              sc->scan, sc->iscan);
+        break;
 #if CONFIG_TX64X64
-    case 2:
-      av1_quantize_fp_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                            p->round_fp, p->quant_fp, p->quant_shift,
-                            qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
-                            sc->scan, sc->iscan
-#if CONFIG_AOM_QM
-                            ,
-                            qm_ptr, iqm_ptr
-#endif
-                            );
-      break;
+      case 2:
+        av1_quantize_fp_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                              p->round_fp, p->quant_fp, p->quant_shift,
+                              qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+                              sc->scan, sc->iscan);
+        break;
 #endif  // CONFIG_TX64X64
-    default: assert(0);
+      default: assert(0);
+    }
+#if CONFIG_AOM_QM
   }
+#endif
 }
 
 void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -640,43 +630,69 @@ void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 #if CONFIG_AOM_QM
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
+  if (qm_ptr != NULL && iqm_ptr != NULL) {
+    quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+                        p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                        pd->dequant, eob_ptr, sc->scan, sc->iscan, qm_ptr,
+                        iqm_ptr, qparam->log_scale);
+  } else {
 #endif  // CONFIG_AOM_QM
 
-  switch (qparam->log_scale) {
-    case 0:
-      aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
-                     p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                     pd->dequant, eob_ptr, sc->scan, sc->iscan
-#if CONFIG_AOM_QM
-                     ,
-                     qm_ptr, iqm_ptr
-#endif
-                     );
-      break;
-    case 1:
-      aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
-                           p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                           pd->dequant, eob_ptr, sc->scan, sc->iscan
-#if CONFIG_AOM_QM
-                           ,
-                           qm_ptr, iqm_ptr
-#endif
-                           );
-      break;
+    switch (qparam->log_scale) {
+      case 0:
+        aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+                       p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                       pd->dequant, eob_ptr, sc->scan, sc->iscan);
+        break;
+      case 1:
+        aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+                             p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                             pd->dequant, eob_ptr, sc->scan, sc->iscan);
+        break;
 #if CONFIG_TX64X64
-    case 2:
-      aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
-                           p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                           pd->dequant, eob_ptr, sc->scan, sc->iscan
+      case 2:
+        aom_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+                             p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                             pd->dequant, eob_ptr, sc->scan, sc->iscan);
+        break;
+#endif  // CONFIG_TX64X64
+      default: assert(0);
+    }
 #if CONFIG_AOM_QM
-                           ,
-                           qm_ptr, iqm_ptr
+  }
 #endif
-                           );
-      break;
-#endif  // CONFIG_TX64X64
-    default: assert(0);
+}
+
+// DC-only quantizer: zeroes both output buffers, then quantizes coefficient 0
+// alone. NULL qm_ptr/iqm_ptr select the flat weight (1 << AOM_QM_BITS).
+static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+                        int skip_block, const int16_t *round_ptr,
+                        const int16_t quant, tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+                        uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+                        const qm_val_t *iqm_ptr, const int log_scale) {
+  const int dc = coeff_ptr[0];
+  const int sign = (dc >> 31);
+  const int abs_dc = (dc ^ sign) - sign;
+  int eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    const int wt = qm_ptr != NULL ? qm_ptr[0] : (1 << AOM_QM_BITS);
+    const int iwt = iqm_ptr != NULL ? iqm_ptr[0] : (1 << AOM_QM_BITS);
+    const int64_t rounded =
+        clamp(abs_dc + ROUND_POWER_OF_TWO(round_ptr[0], log_scale), INT16_MIN,
+              INT16_MAX);
+    const int32_t abs_q =
+        (int32_t)((rounded * wt * quant) >> (16 - log_scale + AOM_QM_BITS));
+    const int dq_step =
+        (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+    qcoeff_ptr[0] = (abs_q ^ sign) - sign;
+    dqcoeff_ptr[0] = (qcoeff_ptr[0] * dq_step) / (1 << log_scale);
+    if (abs_q) eob = 0;
   }
+  *eob_ptr = eob + 1;
 }
 
 void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -686,45 +702,18 @@ void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
   // obsolete skip_block
   const int skip_block = 0;
+  (void)sc;
+  assert(qparam->log_scale >= 0 && qparam->log_scale < (2 + CONFIG_TX64X64));
 #if CONFIG_AOM_QM
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
-#endif  // CONFIG_AOM_QM
-
-  (void)sc;
-
-  switch (qparam->log_scale) {
-    case 0:
-      aom_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
-                      p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
-                      eob_ptr
-#if CONFIG_AOM_QM
-                      ,
-                      qm_ptr, iqm_ptr
-#endif
-                      );
-      break;
-    case 1:
-      aom_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0],
-                            qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr
-#if CONFIG_AOM_QM
-                            ,
-                            qm_ptr, iqm_ptr
-#endif
-                            );
-      break;
-#if CONFIG_TX64X64
-      aom_quantize_dc_64x64(coeff_ptr, skip_block, p->round, p->quant_fp[0],
-                            qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr
-#if CONFIG_AOM_QM
-                            ,
-                            qm_ptr, iqm_ptr
+#else
+  const qm_val_t *qm_ptr = NULL;
+  const qm_val_t *iqm_ptr = NULL;
 #endif
-                            );
-    case 2: break;
-#endif  // CONFIG_TX64X64
-    default: assert(0);
-  }
+  quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round, p->quant_fp[0],
+              qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr, qm_ptr, iqm_ptr,
+              qparam->log_scale);
 }
 
 #if CONFIG_NEW_QUANT
@@ -857,29 +846,31 @@ void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
 #if CONFIG_AOM_QM
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
+  if (qm_ptr != NULL && iqm_ptr != NULL) {
+    highbd_quantize_fp_helper_c(
+        coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp, p->quant_fp,
+        p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan,
+        sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+  } else {
 #endif  // CONFIG_AOM_QM
 
-  if (n_coeffs < 16) {
-    // TODO(jingning): Need SIMD implementation for smaller block size
-    // quantization.
-    av1_highbd_quantize_fp_c(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                             p->round_fp, p->quant_fp, p->quant_shift,
-                             qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
-                             sc->scan, sc->iscan,
-#if CONFIG_AOM_QM
-                             qm_ptr, iqm_ptr,
-#endif
-                             qparam->log_scale);
-    return;
-  }
+    if (n_coeffs < 16) {
+      // TODO(jingning): Need SIMD implementation for smaller block size
+      // quantization.
+      av1_highbd_quantize_fp_c(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                               p->round_fp, p->quant_fp, p->quant_shift,
+                               qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+                               sc->scan, sc->iscan, qparam->log_scale);
+      return;
+    }
 
-  av1_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
-                         p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
-                         pd->dequant, eob_ptr, sc->scan, sc->iscan,
+    av1_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                           p->round_fp, p->quant_fp, p->quant_shift, qcoeff_ptr,
+                           dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan,
+                           sc->iscan, qparam->log_scale);
 #if CONFIG_AOM_QM
-                         qm_ptr, iqm_ptr,
+  }
 #endif
-                         qparam->log_scale);
 }
 
 void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
@@ -894,86 +885,76 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
 #if CONFIG_AOM_QM
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
+  if (qm_ptr != NULL && iqm_ptr != NULL) {
+    highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                               p->round, p->quant, p->quant_shift, qcoeff_ptr,
+                               dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan,
+                               sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+  } else {
 #endif  // CONFIG_AOM_QM
 
-  switch (qparam->log_scale) {
-    case 0:
-      if (LIKELY(n_coeffs >= 8)) {
-        aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                              p->round, p->quant, p->quant_shift, qcoeff_ptr,
-                              dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan,
-                              sc->iscan
-#if CONFIG_AOM_QM
-                              ,
-                              qm_ptr, iqm_ptr
-#endif
-                              );
-      } else {
-        // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
-        // quantization
-        aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, skip_block, p->zbin,
+    switch (qparam->log_scale) {
+      case 0:
+        if (LIKELY(n_coeffs >= 8)) {
+          aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin,
                                 p->round, p->quant, p->quant_shift, qcoeff_ptr,
                                 dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan,
-                                sc->iscan
-#if CONFIG_AOM_QM
-                                ,
-                                qm_ptr, iqm_ptr
-#endif
-                                );
-      }
-      break;
-    case 1:
-      aom_highbd_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                                sc->iscan);
+        } else {
+          // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
+          // quantization
+          aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, skip_block, p->zbin,
                                   p->round, p->quant, p->quant_shift,
                                   qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
-                                  sc->scan, sc->iscan
-#if CONFIG_AOM_QM
-                                  ,
-                                  qm_ptr, iqm_ptr
-#endif
-                                  );
-      break;
+                                  sc->scan, sc->iscan);
+        }
+        break;
+      case 1:
+        aom_highbd_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                                    p->round, p->quant, p->quant_shift,
+                                    qcoeff_ptr, dqcoeff_ptr, pd->dequant,
+                                    eob_ptr, sc->scan, sc->iscan);
+        break;
 #if CONFIG_TX64X64
-    case 2:
-      aom_highbd_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin,
-                                  p->round, p->quant, p->quant_shift,
-                                  qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
-                                  sc->scan, sc->iscan
-#if CONFIG_AOM_QM
-                                  ,
-                                  qm_ptr, iqm_ptr
-#endif
-                                  );
-      break;
+      case 2:
+        aom_highbd_quantize_b_64x64(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                                    p->round, p->quant, p->quant_shift,
+                                    qcoeff_ptr, dqcoeff_ptr, pd->dequant,
+                                    eob_ptr, sc->scan, sc->iscan);
+        break;
 #endif  // CONFIG_TX64X64
-    default: assert(0);
+      default: assert(0);
+    }
+#if CONFIG_AOM_QM
   }
+#endif
 }
 
 static INLINE void highbd_quantize_dc(
     const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
     const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr,
-#if CONFIG_AOM_QM
-    const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
-#endif
-    const int log_scale) {
+    const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) {
   int eob = -1;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-#if CONFIG_AOM_QM
-  (void)qm_ptr;
-  (void)iqm_ptr;
-#endif
+  // A NULL qm_ptr/iqm_ptr selects the flat weight (1 << AOM_QM_BITS).
   if (!skip_block) {
+    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[0] : (1 << AOM_QM_BITS);
+    const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[0] : (1 << AOM_QM_BITS);
     const int coeff = coeff_ptr[0];
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp = abs_coeff + round_ptr[0];
-    const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale));
+    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+    const int64_t tmpw = tmp * wt;
+    const int abs_qcoeff =
+        (int)((tmpw * quant) >> (16 - log_scale + AOM_QM_BITS));
     qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / (1 << log_scale);
+    const int dequant =
+        (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+    // Reconstruction: level times the iwt-scaled step, divided by 2^log_scale.
+    dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / (1 << log_scale);
     if (abs_qcoeff) eob = 0;
   }
   *eob_ptr = eob + 1;
@@ -991,17 +972,16 @@ void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
 #if CONFIG_AOM_QM
   const qm_val_t *qm_ptr = qparam->qmatrix;
   const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#else
+  const qm_val_t *qm_ptr = NULL;
+  const qm_val_t *iqm_ptr = NULL;
 #endif  // CONFIG_AOM_QM
 
   (void)sc;
 
   highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
                      p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
-                     eob_ptr,
-#if CONFIG_AOM_QM
-                     qm_ptr, iqm_ptr,
-#endif
-                     qparam->log_scale);
+                     eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale);
 }
 
 #if CONFIG_NEW_QUANT
@@ -1517,61 +1497,16 @@ void av1_highbd_quantize_dc_nuq_facade(
 }
 #endif  // CONFIG_NEW_QUANT
 
-void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
-                              int skip_block, const int16_t *zbin_ptr,
-                              const int16_t *round_ptr,
-                              const int16_t *quant_ptr,
-                              const int16_t *quant_shift_ptr,
-                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                              const int16_t *scan, const int16_t *iscan,
-#if CONFIG_AOM_QM
-                              const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
-#endif
-                              int log_scale) {
-  int i;
-  int eob = -1;
-  const int scale = 1 << log_scale;
-  const int shift = 16 - log_scale;
-  // TODO(jingning) Decide the need of these arguments after the
-  // quantization process is completed.
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < count; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-#if CONFIG_AOM_QM
-      const qm_val_t wt = qm_ptr[rc];
-      const qm_val_t iwt = iqm_ptr[rc];
-      const int dequant =
-          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
-          AOM_QM_BITS;
-#endif
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      const int64_t tmp = abs_coeff + (round_ptr[rc != 0] >> log_scale);
-#if CONFIG_AOM_QM
-      const int abs_qcoeff =
-          (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS));
-      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / scale;
-#else
-      const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> shift);
-      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / scale;
-#endif
-      if (abs_qcoeff) eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
+void av1_highbd_quantize_fp_c(
+    const tran_low_t *coeff_ptr, intptr_t count, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, int log_scale) {
+  highbd_quantize_fp_helper_c(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
+                              quant_ptr, quant_shift_ptr, qcoeff_ptr,
+                              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+                              NULL, NULL, log_scale);
 }
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
@@ -1682,22 +1617,19 @@ void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
   MACROBLOCKD *const xd = &x->e_mbd;
   const QUANTS *const quants = &cpi->quants;
 
-#if CONFIG_DELTA_Q
 #if CONFIG_EXT_DELTA_Q
-  int current_q_index = AOMMAX(
-      0, AOMMIN(QINDEX_RANGE - 1, cpi->oxcf.deltaq_mode != NO_DELTA_Q
-                                      ? cm->base_qindex + xd->delta_qindex
-                                      : cm->base_qindex));
+  int current_q_index =
+      AOMMAX(0, AOMMIN(QINDEX_RANGE - 1,
+                       cpi->oxcf.deltaq_mode != NO_DELTA_Q
+                           ? cm->base_qindex + xd->delta_qindex
+                           : cm->base_qindex));
 #else
   int current_q_index = AOMMAX(
-      0, AOMMIN(QINDEX_RANGE - 1, cm->delta_q_present_flag
-                                      ? cm->base_qindex + xd->delta_qindex
-                                      : cm->base_qindex));
+      0, AOMMIN(QINDEX_RANGE - 1,
+                cm->delta_q_present_flag ? cm->base_qindex + xd->delta_qindex
+                                         : cm->base_qindex));
 #endif
   const int qindex = av1_get_qindex(&cm->seg, segment_id, current_q_index);
-#else
-  const int qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
-#endif
   const int rdmult = av1_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
   int i;
 #if CONFIG_AOM_QM
diff --git a/third_party/aom/av1/encoder/bgsprite.c b/third_party/aom/av1/encoder/bgsprite.c
index 64deade06..ae2cb1d40 100644
--- a/third_party/aom/av1/encoder/bgsprite.c
+++ b/third_party/aom/av1/encoder/bgsprite.c
@@ -34,13 +34,28 @@
  */
 #define BGSPRITE_BLENDING_MODE 1
 
+// Enable removal of outliers from mean blending mode.
+#if BGSPRITE_BLENDING_MODE == 1
+#define BGSPRITE_MEAN_REMOVE_OUTLIERS 0
+#endif  // BGSPRITE_BLENDING_MODE == 1
+
 /* Interpolation for panorama alignment sampling:
  * 0 = Nearest neighbor
  * 1 = Bilinear
  */
 #define BGSPRITE_INTERPOLATION 0
 
-#define TRANSFORM_MAT_DIM 3
+// Enable turning off bgsprite from firstpass metrics in define_gf_group.
+#define BGSPRITE_ENABLE_METRICS 1
+
+// Enable foreground/background segmentation and combine with temporal filter.
+#define BGSPRITE_ENABLE_SEGMENTATION 1
+
+// Enable alignment using global motion.
+#define BGSPRITE_ENABLE_GME 0
+
+// Block size for foreground mask.
+#define BGSPRITE_MASK_BLOCK_SIZE 4
 
 typedef struct {
 #if CONFIG_HIGHBITDEPTH
@@ -52,8 +67,29 @@ typedef struct {
   uint8_t u;
   uint8_t v;
 #endif  // CONFIG_HIGHBITDEPTH
+  uint8_t exists;
 } YuvPixel;
 
+typedef struct {
+  int curr_model;
+  double mean[2];
+  double var[2];
+  int age[2];
+  double u_mean[2];
+  double v_mean[2];
+
+#if CONFIG_HIGHBITDEPTH
+  uint16_t y;
+  uint16_t u;
+  uint16_t v;
+#else
+  uint8_t y;
+  uint8_t u;
+  uint8_t v;
+#endif  // CONFIG_HIGHBITDEPTH
+  double final_var;
+} YuvPixelGaussian;
+
 // Maps to convert from matrix form to param vector form.
 static const int params_to_matrix_map[] = { 2, 3, 0, 4, 5, 1, 6, 7 };
 static const int matrix_to_params_map[] = { 2, 5, 0, 1, 3, 4, 6, 7 };
@@ -75,6 +111,8 @@ static void matrix_to_params(const double *const matrix, double *target) {
   }
 }
 
+#define TRANSFORM_MAT_DIM 3
+
 // Do matrix multiplication on params.
 static void multiply_params(double *const m1, double *const m2,
                             double *target) {
@@ -124,20 +162,20 @@ static void find_frame_limit(int width, int height,
   *y_max = (int)ceil(uv_matrix[1]);
   *y_min = (int)floor(uv_matrix[1]);
 
-  xy_matrix[0] = width;
+  xy_matrix[0] = width - 1;
   xy_matrix[1] = 0;
   multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM,
                TRANSFORM_MAT_DIM, 1);
   UPDATELIMITS(uv_matrix[0], uv_matrix[1], x_min, x_max, y_min, y_max);
 
-  xy_matrix[0] = width;
-  xy_matrix[1] = height;
+  xy_matrix[0] = width - 1;
+  xy_matrix[1] = height - 1;
   multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM,
                TRANSFORM_MAT_DIM, 1);
   UPDATELIMITS(uv_matrix[0], uv_matrix[1], x_min, x_max, y_min, y_max);
 
   xy_matrix[0] = 0;
-  xy_matrix[1] = height;
+  xy_matrix[1] = height - 1;
   multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM,
                TRANSFORM_MAT_DIM, 1);
   UPDATELIMITS(uv_matrix[0], uv_matrix[1], x_min, x_max, y_min, y_max);
@@ -198,79 +236,13 @@ static void invert_params(const double *const params, double *target) {
   matrix_to_params(inverse, target);
 }
 
-#if BGSPRITE_BLENDING_MODE == 0
-// swaps two YuvPixels.
-static void swap_yuv(YuvPixel *a, YuvPixel *b) {
-  const YuvPixel temp = *b;
-  *b = *a;
-  *a = temp;
-}
-
-// Partitions array to find pivot index in qselect.
-static int partition(YuvPixel arr[], int left, int right, int pivot_idx) {
-  YuvPixel pivot = arr[pivot_idx];
-
-  // Move pivot to the end.
-  swap_yuv(&arr[pivot_idx], &arr[right]);
-
-  int p_idx = left;
-  for (int i = left; i < right; ++i) {
-    if (arr[i].y <= pivot.y) {
-      swap_yuv(&arr[i], &arr[p_idx]);
-      p_idx++;
-    }
-  }
-
-  swap_yuv(&arr[p_idx], &arr[right]);
-
-  return p_idx;
-}
-
-// Returns the kth element in array, partially sorted in place (quickselect).
-static YuvPixel qselect(YuvPixel arr[], int left, int right, int k) {
-  if (left >= right) {
-    return arr[left];
-  }
-  unsigned int seed = (int)time(NULL);
-  int pivot_idx = left + rand_r(&seed) % (right - left + 1);
-  pivot_idx = partition(arr, left, right, pivot_idx);
-
-  if (k == pivot_idx) {
-    return arr[k];
-  } else if (k < pivot_idx) {
-    return qselect(arr, left, pivot_idx - 1, k);
-  } else {
-    return qselect(arr, pivot_idx + 1, right, k);
-  }
-}
-#endif  // BGSPRITE_BLENDING_MODE == 0
-
-// Stitches images together to create ARF and stores it in 'panorama'.
-static void stitch_images(YV12_BUFFER_CONFIG **const frames,
-                          const int num_frames, const int center_idx,
-                          const double **const params, const int *const x_min,
-                          const int *const x_max, const int *const y_min,
-                          const int *const y_max, int pano_x_min,
-                          int pano_x_max, int pano_y_min, int pano_y_max,
-                          YV12_BUFFER_CONFIG *panorama) {
-  const int width = pano_x_max - pano_x_min + 1;
-  const int height = pano_y_max - pano_y_min + 1;
-
-  // Create temp_pano[y][x][num_frames] stack of pixel values
-  YuvPixel ***temp_pano = aom_malloc(height * sizeof(*temp_pano));
-  for (int i = 0; i < height; ++i) {
-    temp_pano[i] = aom_malloc(width * sizeof(**temp_pano));
-    for (int j = 0; j < width; ++j) {
-      temp_pano[i][j] = aom_malloc(num_frames * sizeof(***temp_pano));
-    }
-  }
-  // Create count[y][x] to count how many values in stack for median filtering
-  int **count = aom_malloc(height * sizeof(*count));
-  for (int i = 0; i < height; ++i) {
-    count[i] = aom_calloc(width, sizeof(**count));  // counts initialized to 0
-  }
-
-  // Re-sample images onto panorama (pre-median filtering).
+static void build_image_stack(YV12_BUFFER_CONFIG **const frames,
+                              const int num_frames, const double **const params,
+                              const int *const x_min, const int *const x_max,
+                              const int *const y_min, const int *const y_max,
+                              int pano_x_min, int pano_y_min,
+                              YuvPixel ***img_stack) {
+  // Re-sample images onto panorama (pre-filtering).
   const int x_offset = -pano_x_min;
   const int y_offset = -pano_y_min;
   const int frame_width = frames[0]->y_width;
@@ -376,24 +348,19 @@ static void stitch_images(YV12_BUFFER_CONFIG **const frames,
 
 #if CONFIG_HIGHBITDEPTH
           if (frames[i]->flags & YV12_FLAG_HIGHBITDEPTH) {
-            temp_pano[pano_y][pano_x][count[pano_y][pano_x]].y =
-                (uint16_t)interpolated_yvalue;
-            temp_pano[pano_y][pano_x][count[pano_y][pano_x]].u =
-                (uint16_t)interpolated_uvalue;
-            temp_pano[pano_y][pano_x][count[pano_y][pano_x]].v =
-                (uint16_t)interpolated_vvalue;
+            img_stack[pano_y][pano_x][i].y = (uint16_t)interpolated_yvalue;
+            img_stack[pano_y][pano_x][i].u = (uint16_t)interpolated_uvalue;
+            img_stack[pano_y][pano_x][i].v = (uint16_t)interpolated_vvalue;
+            img_stack[pano_y][pano_x][i].exists = 1;
           } else {
 #endif  // CONFIG_HIGHBITDEPTH
-            temp_pano[pano_y][pano_x][count[pano_y][pano_x]].y =
-                (uint8_t)interpolated_yvalue;
-            temp_pano[pano_y][pano_x][count[pano_y][pano_x]].u =
-                (uint8_t)interpolated_uvalue;
-            temp_pano[pano_y][pano_x][count[pano_y][pano_x]].v =
-                (uint8_t)interpolated_vvalue;
+            img_stack[pano_y][pano_x][i].y = (uint8_t)interpolated_yvalue;
+            img_stack[pano_y][pano_x][i].u = (uint8_t)interpolated_uvalue;
+            img_stack[pano_y][pano_x][i].v = (uint8_t)interpolated_vvalue;
+            img_stack[pano_y][pano_x][i].exists = 1;
 #if CONFIG_HIGHBITDEPTH
           }
 #endif  // CONFIG_HIGHBITDEPTH
-          ++count[pano_y][pano_x];
         } else if (image_x >= 0 && image_x < frame_width && image_y >= 0 &&
                    image_y < frame_height) {
           // Place in panorama stack.
@@ -406,104 +373,405 @@ static void stitch_images(YV12_BUFFER_CONFIG **const frames,
               (image_x >> frames[i]->subsampling_x);
 #if CONFIG_HIGHBITDEPTH
           if (frames[i]->flags & YV12_FLAG_HIGHBITDEPTH) {
-            temp_pano[pano_y][pano_x][count[pano_y][pano_x]].y =
-                y_buffer16[ychannel_idx];
-            temp_pano[pano_y][pano_x][count[pano_y][pano_x]].u =
-                u_buffer16[uvchannel_idx];
-            temp_pano[pano_y][pano_x][count[pano_y][pano_x]].v =
-                v_buffer16[uvchannel_idx];
+            img_stack[pano_y][pano_x][i].y = y_buffer16[ychannel_idx];
+            img_stack[pano_y][pano_x][i].u = u_buffer16[uvchannel_idx];
+            img_stack[pano_y][pano_x][i].v = v_buffer16[uvchannel_idx];
+            img_stack[pano_y][pano_x][i].exists = 1;
           } else {
 #endif  // CONFIG_HIGHBITDEPTH
-            temp_pano[pano_y][pano_x][count[pano_y][pano_x]].y =
-                frames[i]->y_buffer[ychannel_idx];
-            temp_pano[pano_y][pano_x][count[pano_y][pano_x]].u =
-                frames[i]->u_buffer[uvchannel_idx];
-            temp_pano[pano_y][pano_x][count[pano_y][pano_x]].v =
-                frames[i]->v_buffer[uvchannel_idx];
+            img_stack[pano_y][pano_x][i].y = frames[i]->y_buffer[ychannel_idx];
+            img_stack[pano_y][pano_x][i].u = frames[i]->u_buffer[uvchannel_idx];
+            img_stack[pano_y][pano_x][i].v = frames[i]->v_buffer[uvchannel_idx];
+            img_stack[pano_y][pano_x][i].exists = 1;
 #if CONFIG_HIGHBITDEPTH
           }
 #endif  // CONFIG_HIGHBITDEPTH
-          ++count[pano_y][pano_x];
         }
       }
     }
   }
+}
 
-#if BGSPRITE_BLENDING_MODE == 1
-  // Apply mean filtering and store result in temp_pano[y][x][0].
+#if BGSPRITE_BLENDING_MODE == 0
+// swaps two YuvPixels.
+static void swap_yuv(YuvPixel *a, YuvPixel *b) {
+  const YuvPixel temp = *b;
+  *b = *a;
+  *a = temp;
+}
+
+// Partitions array to find pivot index in qselect.
+static int partition(YuvPixel arr[], int left, int right, int pivot_idx) {
+  YuvPixel pivot = arr[pivot_idx];
+
+  // Move pivot to the end.
+  swap_yuv(&arr[pivot_idx], &arr[right]);
+
+  int p_idx = left;
+  for (int i = left; i < right; ++i) {
+    if (arr[i].y <= pivot.y) {
+      swap_yuv(&arr[i], &arr[p_idx]);
+      p_idx++;
+    }
+  }
+
+  swap_yuv(&arr[p_idx], &arr[right]);
+
+  return p_idx;
+}
+
+// Returns the kth element in array, partially sorted in place (quickselect).
+static YuvPixel qselect(YuvPixel arr[], int left, int right, int k) {
+  if (left >= right) {
+    return arr[left];
+  }
+  unsigned int seed = (int)time(NULL);
+  int pivot_idx = left + rand_r(&seed) % (right - left + 1);
+  pivot_idx = partition(arr, left, right, pivot_idx);
+
+  if (k == pivot_idx) {
+    return arr[k];
+  } else if (k < pivot_idx) {
+    return qselect(arr, left, pivot_idx - 1, k);
+  } else {
+    return qselect(arr, pivot_idx + 1, right, k);
+  }
+}
+
+// Blends image stack together using a temporal median.
+static void blend_median(const int width, const int height,
+                         const int num_frames, const YuvPixel ***image_stack,
+                         YuvPixel **blended_img) {
+  // Allocate stack of pixels
+  YuvPixel *pixel_stack = aom_calloc(num_frames, sizeof(*pixel_stack));
+
+  // Apply median filtering using quickselect.
   for (int y = 0; y < height; ++y) {
     for (int x = 0; x < width; ++x) {
-      if (count[y][x] == 0) {
+      int count = 0;
+      for (int i = 0; i < num_frames; ++i) {
+        if (image_stack[y][x][i].exists) {
+          pixel_stack[count] = image_stack[y][x][i];
+          ++count;
+        }
+      }
+      if (count == 0) {
         // Just make the pixel black.
         // TODO(toddnguyen): Color the pixel with nearest neighbor
+        blended_img[y][x].exists = 0;
       } else {
-        // Find
-        uint32_t y_sum = 0;
-        uint32_t u_sum = 0;
-        uint32_t v_sum = 0;
-        for (int i = 0; i < count[y][x]; ++i) {
-          y_sum += temp_pano[y][x][i].y;
-          u_sum += temp_pano[y][x][i].u;
-          v_sum += temp_pano[y][x][i].v;
+        const int median_idx = (int)floor(count / 2);
+        YuvPixel median = qselect(pixel_stack, 0, count - 1, median_idx);
+
+        // Store the median pixel in blended_img for UV subsampling later.
+        blended_img[y][x] = median;
+        blended_img[y][x].exists = 1;
+      }
+    }
+  }
+
+  aom_free(pixel_stack);
+}
+#endif  // BGSPRITE_BLENDING_MODE == 0
+
+#if BGSPRITE_BLENDING_MODE == 1
+// Blends image stack together using a temporal mean.
+static void blend_mean(const int width, const int height, const int num_frames,
+                       const YuvPixel ***image_stack, YuvPixel **blended_img,
+                       int highbitdepth) {
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      // Sum Y, U and V over the frames in which this pixel exists.
+      uint32_t y_sum = 0;
+      uint32_t u_sum = 0;
+      uint32_t v_sum = 0;
+      uint32_t count = 0;
+      for (int i = 0; i < num_frames; ++i) {
+        if (image_stack[y][x][i].exists) {
+          y_sum += image_stack[y][x][i].y;
+          u_sum += image_stack[y][x][i].u;
+          v_sum += image_stack[y][x][i].v;
+          ++count;
         }
+      }
 
-        const uint32_t unsigned_count = (uint32_t)count[y][x];
+#if BGSPRITE_MEAN_REMOVE_OUTLIERS
+      if (count > 1) {
+        double stdev = 0;
+        double y_mean = (double)y_sum / count;
+        for (int i = 0; i < num_frames; ++i) {
+          if (image_stack[y][x][i].exists) {
+            stdev += pow(y_mean - image_stack[y][x][i].y, 2);
+          }
+        }
+        stdev = sqrt(stdev / count);
+
+        uint32_t inlier_y_sum = 0;
+        uint32_t inlier_u_sum = 0;
+        uint32_t inlier_v_sum = 0;
+        uint32_t inlier_count = 0;
+        for (int i = 0; i < num_frames; ++i) {
+          if (image_stack[y][x][i].exists &&
+              fabs(image_stack[y][x][i].y - y_mean) <= 1.5 * stdev) {
+            inlier_y_sum += image_stack[y][x][i].y;
+            inlier_u_sum += image_stack[y][x][i].u;
+            inlier_v_sum += image_stack[y][x][i].v;
+            ++inlier_count;
+          }
+        }
+        count = inlier_count;
+        y_sum = inlier_y_sum;
+        u_sum = inlier_u_sum;
+        v_sum = inlier_v_sum;
+      }
+#endif  // BGSPRITE_MEAN_REMOVE_OUTLIERS
 
+      if (count != 0) {
+        blended_img[y][x].exists = 1;
 #if CONFIG_HIGHBITDEPTH
-        if (panorama->flags & YV12_FLAG_HIGHBITDEPTH) {
-          temp_pano[y][x][0].y = (uint16_t)OD_DIVU(y_sum, unsigned_count);
-          temp_pano[y][x][0].u = (uint16_t)OD_DIVU(u_sum, unsigned_count);
-          temp_pano[y][x][0].v = (uint16_t)OD_DIVU(v_sum, unsigned_count);
+        if (highbitdepth) {
+          blended_img[y][x].y = (uint16_t)OD_DIVU(y_sum, count);
+          blended_img[y][x].u = (uint16_t)OD_DIVU(u_sum, count);
+          blended_img[y][x].v = (uint16_t)OD_DIVU(v_sum, count);
         } else {
 #endif  // CONFIG_HIGHBITDEPTH
-          temp_pano[y][x][0].y = (uint8_t)OD_DIVU(y_sum, unsigned_count);
-          temp_pano[y][x][0].u = (uint8_t)OD_DIVU(u_sum, unsigned_count);
-          temp_pano[y][x][0].v = (uint8_t)OD_DIVU(v_sum, unsigned_count);
+          (void)highbitdepth;
+          blended_img[y][x].y = (uint8_t)OD_DIVU(y_sum, count);
+          blended_img[y][x].u = (uint8_t)OD_DIVU(u_sum, count);
+          blended_img[y][x].v = (uint8_t)OD_DIVU(v_sum, count);
 #if CONFIG_HIGHBITDEPTH
         }
 #endif  // CONFIG_HIGHBITDEPTH
+      } else {
+        blended_img[y][x].exists = 0;
       }
     }
   }
-#else
-  // Apply median filtering using quickselect.
-  for (int y = 0; y < height; ++y) {
-    for (int x = 0; x < width; ++x) {
-      if (count[y][x] == 0) {
-        // Just make the pixel black.
-        // TODO(toddnguyen): Color the pixel with nearest neighbor
+}
+#endif  // BGSPRITE_BLENDING_MODE == 1
+
+#if BGSPRITE_ENABLE_SEGMENTATION
+// Builds dual-mode single gaussian model from image stack.
+static void build_gaussian(const YuvPixel ***image_stack, const int num_frames,
+                           const int width, const int height,
+                           const int x_block_width, const int y_block_height,
+                           const int block_size, YuvPixelGaussian **gauss) {
+  const double initial_variance = 10.0;
+  const double s_theta = 2.0;
+
+  // Add images to dual-mode single gaussian model
+  for (int y_block = 0; y_block < y_block_height; ++y_block) {
+    for (int x_block = 0; x_block < x_block_width; ++x_block) {
+      // Process all blocks.
+      YuvPixelGaussian *model = &gauss[y_block][x_block];
+
+      // Process all frames.
+      for (int i = 0; i < num_frames; ++i) {
+        // Add block to the Gaussian model.
+        double max_variance[2] = { 0.0, 0.0 };
+        double temp_y_mean = 0.0;
+        double temp_u_mean = 0.0;
+        double temp_v_mean = 0.0;
+
+        // Find mean/variance of a block of pixels.
+        int temp_count = 0;
+        for (int sub_y = 0; sub_y < block_size; ++sub_y) {
+          for (int sub_x = 0; sub_x < block_size; ++sub_x) {
+            const int y = y_block * block_size + sub_y;
+            const int x = x_block * block_size + sub_x;
+            if (y < height && x < width && image_stack[y][x][i].exists) {
+              ++temp_count;
+              temp_y_mean += (double)image_stack[y][x][i].y;
+              temp_u_mean += (double)image_stack[y][x][i].u;
+              temp_v_mean += (double)image_stack[y][x][i].v;
+
+              const double variance_0 =
+                  pow((double)image_stack[y][x][i].y - model->mean[0], 2);
+              const double variance_1 =
+                  pow((double)image_stack[y][x][i].y - model->mean[1], 2);
+
+              if (variance_0 > max_variance[0]) {
+                max_variance[0] = variance_0;
+              }
+              if (variance_1 > max_variance[1]) {
+                max_variance[1] = variance_1;
+              }
+            }
+          }
+        }
+
+        // If pixels exist in the block, add to the model.
+        if (temp_count > 0) {
+          assert(temp_count <= block_size * block_size);
+          temp_y_mean /= temp_count;
+          temp_u_mean /= temp_count;
+          temp_v_mean /= temp_count;
+
+          // Switch the background model to the oldest model.
+          if (model->age[0] > model->age[1]) {
+            model->curr_model = 0;
+          } else if (model->age[1] > model->age[0]) {
+            model->curr_model = 1;
+          }
+
+          // If model is empty, initialize model.
+          if (model->age[model->curr_model] == 0) {
+            model->mean[model->curr_model] = temp_y_mean;
+            model->u_mean[model->curr_model] = temp_u_mean;
+            model->v_mean[model->curr_model] = temp_v_mean;
+            model->var[model->curr_model] = initial_variance;
+            model->age[model->curr_model] = 1;
+          } else {
+            // Constants for current model and foreground model (0 or 1).
+            const int opposite = 1 - model->curr_model;
+            const int current = model->curr_model;
+            const double j = i;
+
+            // Put block into the appropriate model.
+            if (pow(temp_y_mean - model->mean[current], 2) <
+                s_theta * model->var[current]) {
+              // Add block to the current background model
+              model->age[current] += 1;
+              const double prev_weight = 1 / j;
+              const double curr_weight = (j - 1) / j;
+              model->mean[current] = prev_weight * model->mean[current] +
+                                     curr_weight * temp_y_mean;
+              model->u_mean[current] = prev_weight * model->u_mean[current] +
+                                       curr_weight * temp_u_mean;
+              model->v_mean[current] = prev_weight * model->v_mean[current] +
+                                       curr_weight * temp_v_mean;
+              model->var[current] = prev_weight * model->var[current] +
+                                    curr_weight * max_variance[current];
+            } else {
+              // Block does not fit into current background candidate. Add to
+              // foreground candidate and reinitialize if necessary.
+              const double var_fg = pow(temp_y_mean - model->mean[opposite], 2);
+
+              if (var_fg <= s_theta * model->var[opposite]) {
+                model->age[opposite] += 1;
+                const double prev_weight = 1 / j;
+                const double curr_weight = (j - 1) / j;
+                model->mean[opposite] = prev_weight * model->mean[opposite] +
+                                        curr_weight * temp_y_mean;
+                model->u_mean[opposite] =
+                    prev_weight * model->u_mean[opposite] +
+                    curr_weight * temp_u_mean;
+                model->v_mean[opposite] =
+                    prev_weight * model->v_mean[opposite] +
+                    curr_weight * temp_v_mean;
+                model->var[opposite] = prev_weight * model->var[opposite] +
+                                       curr_weight * max_variance[opposite];
+              } else if (model->age[opposite] == 0 ||
+                         var_fg > s_theta * model->var[opposite]) {
+                model->mean[opposite] = temp_y_mean;
+                model->u_mean[opposite] = temp_u_mean;
+                model->v_mean[opposite] = temp_v_mean;
+                model->var[opposite] = initial_variance;
+                model->age[opposite] = 1;
+              } else {
+                // This case should never happen.
+                assert(0);
+              }
+            }
+          }
+        }
+      }
+
+      // Select the oldest candidate as the background model.
+      if (model->age[0] == 0 && model->age[1] == 0) {
+        model->y = 0;
+        model->u = 0;
+        model->v = 0;
+        model->final_var = 0;
+      } else if (model->age[0] > model->age[1]) {
+        model->y = (uint8_t)model->mean[0];
+        model->u = (uint8_t)model->u_mean[0];
+        model->v = (uint8_t)model->v_mean[0];
+        model->final_var = model->var[0];
       } else {
-        // Find
-        const int median_idx = (int)floor(count[y][x] / 2);
-        YuvPixel median =
-            qselect(temp_pano[y][x], 0, count[y][x] - 1, median_idx);
+        model->y = (uint8_t)model->mean[1];
+        model->u = (uint8_t)model->u_mean[1];
+        model->v = (uint8_t)model->v_mean[1];
+        model->final_var = model->var[1];
+      }
+    }
+  }
+}
 
-        // Make the median value the 0th index for UV subsampling later
-        temp_pano[y][x][0] = median;
-        assert(median.y == temp_pano[y][x][0].y &&
-               median.u == temp_pano[y][x][0].u &&
-               median.v == temp_pano[y][x][0].v);
+// Builds foreground mask based on reference image and gaussian model.
+// In mask[][], 1 is foreground and 0 is background.
+static void build_mask(const int x_min, const int y_min, const int x_offset,
+                       const int y_offset, const int x_block_width,
+                       const int y_block_height, const int block_size,
+                       const YuvPixelGaussian **gauss,
+                       YV12_BUFFER_CONFIG *const reference,
+                       YV12_BUFFER_CONFIG *const panorama, uint8_t **mask) {
+  const int crop_x_offset = x_min + x_offset;
+  const int crop_y_offset = y_min + y_offset;
+  const double d_theta = 4.0;
+
+  for (int y_block = 0; y_block < y_block_height; ++y_block) {
+    for (int x_block = 0; x_block < x_block_width; ++x_block) {
+      // Create mask to determine if ARF is background or foreground.
+      const YuvPixelGaussian *model = &gauss[y_block][x_block];
+      double temp_y_mean = 0.0;
+      int temp_count = 0;
+
+      for (int sub_y = 0; sub_y < block_size; ++sub_y) {
+        for (int sub_x = 0; sub_x < block_size; ++sub_x) {
+          // x and y are panorama coordinates.
+          const int y = y_block * block_size + sub_y;
+          const int x = x_block * block_size + sub_x;
+
+          const int arf_y = y - crop_y_offset;
+          const int arf_x = x - crop_x_offset;
+
+          if (arf_y >= 0 && arf_y < panorama->y_height && arf_x >= 0 &&
+              arf_x < panorama->y_width) {
+            ++temp_count;
+            const int ychannel_idx = arf_y * panorama->y_stride + arf_x;
+            temp_y_mean += (double)reference->y_buffer[ychannel_idx];
+          }
+        }
+      }
+      if (temp_count > 0) {
+        assert(temp_count <= block_size * block_size);
+        temp_y_mean /= temp_count;
+
+        if (pow(temp_y_mean - model->y, 2) > model->final_var * d_theta) {
+          // Mark block as foreground.
+          mask[y_block][x_block] = 1;
+        }
       }
     }
   }
-#endif  // BGSPRITE_BLENDING_MODE == 1
+}
+#endif  // BGSPRITE_ENABLE_SEGMENTATION
 
-  // NOTE(toddnguyen): Right now the ARF in the cpi struct is fixed size at
-  // the same size as the frames. For now, we crop the generated panorama.
-  // assert(panorama->y_width < width && panorama->y_height < height);
+// Resamples blended_img into panorama, including UV subsampling.
+static void resample_panorama(YuvPixel **blended_img, const int center_idx,
+                              const int *const x_min, const int *const y_min,
+                              int pano_x_min, int pano_x_max, int pano_y_min,
+                              int pano_y_max, YV12_BUFFER_CONFIG *panorama) {
+  const int width = pano_x_max - pano_x_min + 1;
+  const int height = pano_y_max - pano_y_min + 1;
+  const int x_offset = -pano_x_min;
+  const int y_offset = -pano_y_min;
   const int crop_x_offset = x_min[center_idx] + x_offset;
   const int crop_y_offset = y_min[center_idx] + y_offset;
-
 #if CONFIG_HIGHBITDEPTH
   if (panorama->flags & YV12_FLAG_HIGHBITDEPTH) {
     // Use median Y value.
     uint16_t *pano_y_buffer16 = CONVERT_TO_SHORTPTR(panorama->y_buffer);
+    uint16_t *pano_u_buffer16 = CONVERT_TO_SHORTPTR(panorama->u_buffer);
+    uint16_t *pano_v_buffer16 = CONVERT_TO_SHORTPTR(panorama->v_buffer);
+
     for (int y = 0; y < panorama->y_height; ++y) {
       for (int x = 0; x < panorama->y_width; ++x) {
         const int ychannel_idx = y * panorama->y_stride + x;
-        if (count[y + crop_y_offset][x + crop_x_offset] > 0) {
+        if (blended_img[y + crop_y_offset][x + crop_x_offset].exists) {
           pano_y_buffer16[ychannel_idx] =
-              temp_pano[y + crop_y_offset][x + crop_x_offset][0].y;
+              blended_img[y + crop_y_offset][x + crop_x_offset].y;
         } else {
           pano_y_buffer16[ychannel_idx] = 0;
         }
@@ -511,9 +779,6 @@ static void stitch_images(YV12_BUFFER_CONFIG **const frames,
     }
 
     // UV subsampling with median UV values
-    uint16_t *pano_u_buffer16 = CONVERT_TO_SHORTPTR(panorama->u_buffer);
-    uint16_t *pano_v_buffer16 = CONVERT_TO_SHORTPTR(panorama->v_buffer);
-
     for (int y = 0; y < panorama->uv_height; ++y) {
       for (int x = 0; x < panorama->uv_width; ++x) {
         uint32_t avg_count = 0;
@@ -526,9 +791,9 @@ static void stitch_images(YV12_BUFFER_CONFIG **const frames,
             int y_sample = crop_y_offset + (y << panorama->subsampling_y) + s_y;
             int x_sample = crop_x_offset + (x << panorama->subsampling_x) + s_x;
             if (y_sample > 0 && y_sample < height && x_sample > 0 &&
-                x_sample < width && count[y_sample][x_sample] > 0) {
-              u_sum += temp_pano[y_sample][x_sample][0].u;
-              v_sum += temp_pano[y_sample][x_sample][0].v;
+                x_sample < width && blended_img[y_sample][x_sample].exists) {
+              u_sum += blended_img[y_sample][x_sample].u;
+              v_sum += blended_img[y_sample][x_sample].v;
               avg_count++;
             }
           }
@@ -546,35 +811,36 @@ static void stitch_images(YV12_BUFFER_CONFIG **const frames,
     }
   } else {
 #endif  // CONFIG_HIGHBITDEPTH
-    // Use median Y value.
+    // Use blended Y value.
     for (int y = 0; y < panorama->y_height; ++y) {
       for (int x = 0; x < panorama->y_width; ++x) {
         const int ychannel_idx = y * panorama->y_stride + x;
-        if (count[y + crop_y_offset][x + crop_x_offset] > 0) {
+        // Use filtered background.
+        if (blended_img[y + crop_y_offset][x + crop_x_offset].exists) {
           panorama->y_buffer[ychannel_idx] =
-              temp_pano[y + crop_y_offset][x + crop_x_offset][0].y;
+              blended_img[y + crop_y_offset][x + crop_x_offset].y;
         } else {
           panorama->y_buffer[ychannel_idx] = 0;
         }
       }
     }
 
-    // UV subsampling with median UV values
+    // UV subsampling with blended UV values.
     for (int y = 0; y < panorama->uv_height; ++y) {
       for (int x = 0; x < panorama->uv_width; ++x) {
         uint16_t avg_count = 0;
         uint16_t u_sum = 0;
         uint16_t v_sum = 0;
 
-        // Look at surrounding pixels for subsampling
+        // Look at surrounding pixels for subsampling.
         for (int s_x = 0; s_x < panorama->subsampling_x + 1; ++s_x) {
           for (int s_y = 0; s_y < panorama->subsampling_y + 1; ++s_y) {
             int y_sample = crop_y_offset + (y << panorama->subsampling_y) + s_y;
             int x_sample = crop_x_offset + (x << panorama->subsampling_x) + s_x;
             if (y_sample > 0 && y_sample < height && x_sample > 0 &&
-                x_sample < width && count[y_sample][x_sample] > 0) {
-              u_sum += temp_pano[y_sample][x_sample][0].u;
-              v_sum += temp_pano[y_sample][x_sample][0].v;
+                x_sample < width && blended_img[y_sample][x_sample].exists) {
+              u_sum += blended_img[y_sample][x_sample].u;
+              v_sum += blended_img[y_sample][x_sample].v;
               avg_count++;
             }
           }
@@ -595,19 +861,266 @@ static void stitch_images(YV12_BUFFER_CONFIG **const frames,
 #if CONFIG_HIGHBITDEPTH
   }
 #endif  // CONFIG_HIGHBITDEPTH
+}
 
+#if BGSPRITE_ENABLE_SEGMENTATION
+// Builds the final ARF in 'target': per block, temporal ARF or bgsprite pixels.
+static void combine_arf(YV12_BUFFER_CONFIG *const temporal_arf,
+                        YV12_BUFFER_CONFIG *const bgsprite,
+                        uint8_t **const mask, const int block_size,
+                        const int x_offset, const int y_offset,
+                        YV12_BUFFER_CONFIG *target) {
+  const int height = temporal_arf->y_height;
+  const int width = temporal_arf->y_width;
+
+  YuvPixel **blended_img = aom_malloc(height * sizeof(*blended_img));
   for (int i = 0; i < height; ++i) {
+    blended_img[i] = aom_malloc(width * sizeof(**blended_img));
+  }
+
+  const int block_2_height = (height / BGSPRITE_MASK_BLOCK_SIZE) +
+                             (height % BGSPRITE_MASK_BLOCK_SIZE != 0 ? 1 : 0);
+  const int block_2_width = (width / BGSPRITE_MASK_BLOCK_SIZE) +
+                            (width % BGSPRITE_MASK_BLOCK_SIZE != 0 ? 1 : 0);
+
+  for (int block_y = 0; block_y < block_2_height; ++block_y) {
+    for (int block_x = 0; block_x < block_2_width; ++block_x) {
+      int count = 0;
+      int total = 0;
+      for (int sub_y = 0; sub_y < BGSPRITE_MASK_BLOCK_SIZE; ++sub_y) {
+        for (int sub_x = 0; sub_x < BGSPRITE_MASK_BLOCK_SIZE; ++sub_x) {
+          const int img_y = block_y * BGSPRITE_MASK_BLOCK_SIZE + sub_y;
+          const int img_x = block_x * BGSPRITE_MASK_BLOCK_SIZE + sub_x;
+          const int mask_y = (y_offset + img_y) / block_size;
+          const int mask_x = (x_offset + img_x) / block_size;
+
+          if (img_y < height && img_x < width) {
+            if (mask[mask_y][mask_x]) {
+              ++count;
+            }
+            ++total;
+          }
+        }
+      }
+
+      const double threshold = 0.30;  // Masked fraction above => foreground.
+      const int amount = (int)(threshold * total);
+      for (int sub_y = 0; sub_y < BGSPRITE_MASK_BLOCK_SIZE; ++sub_y) {
+        for (int sub_x = 0; sub_x < BGSPRITE_MASK_BLOCK_SIZE; ++sub_x) {
+          const int y = block_y * BGSPRITE_MASK_BLOCK_SIZE + sub_y;
+          const int x = block_x * BGSPRITE_MASK_BLOCK_SIZE + sub_x;
+          if (y < height && x < width) {
+            blended_img[y][x].exists = 1;
+            const int ychannel_idx = y * temporal_arf->y_stride + x;
+            const int uvchannel_idx =
+                (y >> temporal_arf->subsampling_y) * temporal_arf->uv_stride +
+                (x >> temporal_arf->subsampling_x);
+
+            if (count > amount) {
+// Foreground (count > amount): copy pixel from temporal_arf.
+#if CONFIG_HIGHBITDEPTH
+              if (temporal_arf->flags & YV12_FLAG_HIGHBITDEPTH) {
+                uint16_t *pano_y_buffer16 =
+                    CONVERT_TO_SHORTPTR(temporal_arf->y_buffer);
+                uint16_t *pano_u_buffer16 =
+                    CONVERT_TO_SHORTPTR(temporal_arf->u_buffer);
+                uint16_t *pano_v_buffer16 =
+                    CONVERT_TO_SHORTPTR(temporal_arf->v_buffer);
+                blended_img[y][x].y = pano_y_buffer16[ychannel_idx];
+                blended_img[y][x].u = pano_u_buffer16[uvchannel_idx];
+                blended_img[y][x].v = pano_v_buffer16[uvchannel_idx];
+              } else {
+#endif  // CONFIG_HIGHBITDEPTH
+                blended_img[y][x].y = temporal_arf->y_buffer[ychannel_idx];
+                blended_img[y][x].u = temporal_arf->u_buffer[uvchannel_idx];
+                blended_img[y][x].v = temporal_arf->v_buffer[uvchannel_idx];
+#if CONFIG_HIGHBITDEPTH
+              }
+#endif  // CONFIG_HIGHBITDEPTH
+            } else {
+// Background (count <= amount): copy pixel from bgsprite.
+#if CONFIG_HIGHBITDEPTH
+              if (bgsprite->flags & YV12_FLAG_HIGHBITDEPTH) {
+                uint16_t *pano_y_buffer16 =
+                    CONVERT_TO_SHORTPTR(bgsprite->y_buffer);
+                uint16_t *pano_u_buffer16 =
+                    CONVERT_TO_SHORTPTR(bgsprite->u_buffer);
+                uint16_t *pano_v_buffer16 =
+                    CONVERT_TO_SHORTPTR(bgsprite->v_buffer);
+                blended_img[y][x].y = pano_y_buffer16[ychannel_idx];
+                blended_img[y][x].u = pano_u_buffer16[uvchannel_idx];
+                blended_img[y][x].v = pano_v_buffer16[uvchannel_idx];
+              } else {
+#endif  // CONFIG_HIGHBITDEPTH
+                blended_img[y][x].y = bgsprite->y_buffer[ychannel_idx];
+                blended_img[y][x].u = bgsprite->u_buffer[uvchannel_idx];
+                blended_img[y][x].v = bgsprite->v_buffer[uvchannel_idx];
+#if CONFIG_HIGHBITDEPTH
+              }
+#endif  // CONFIG_HIGHBITDEPTH
+            }
+          }
+        }
+      }
+    }
+  }
+
+  const int x_min = 0;
+  const int y_min = 0;
+  resample_panorama(blended_img, 0, &x_min, &y_min, 0, width - 1, 0, height - 1,
+                    target);
+
+  for (int i = 0; i < height; ++i) {
+    aom_free(blended_img[i]);
+  }
+  aom_free(blended_img);
+}
+#endif  // BGSPRITE_ENABLE_SEGMENTATION
+
+// Stitches images together to create ARF and stores it in 'panorama'.
+static void stitch_images(AV1_COMP *cpi, YV12_BUFFER_CONFIG **const frames,
+                          const int num_frames, const int distance,
+                          const int center_idx, const double **const params,
+                          const int *const x_min, const int *const x_max,
+                          const int *const y_min, const int *const y_max,
+                          int pano_x_min, int pano_x_max, int pano_y_min,
+                          int pano_y_max, YV12_BUFFER_CONFIG *panorama) {
+  const int width = pano_x_max - pano_x_min + 1;
+  const int height = pano_y_max - pano_y_min + 1;
+
+  // Create pano_stack[y][x][num_frames] stack of pixel values.
+  YuvPixel ***pano_stack = aom_malloc(height * sizeof(*pano_stack));
+  for (int i = 0; i < height; ++i) {
+    pano_stack[i] = aom_malloc(width * sizeof(**pano_stack));
     for (int j = 0; j < width; ++j) {
-      aom_free(temp_pano[i][j]);
+      pano_stack[i][j] = aom_calloc(num_frames, sizeof(***pano_stack));
     }
-    aom_free(temp_pano[i]);
-    aom_free(count[i]);
   }
-  aom_free(count);
-  aom_free(temp_pano);
+
+  build_image_stack(frames, num_frames, params, x_min, x_max, y_min, y_max,
+                    pano_x_min, pano_y_min, pano_stack);
+
+  // Create blended_img[y][x] of combined panorama pixel values.
+  YuvPixel **blended_img = aom_malloc(height * sizeof(*blended_img));
+  for (int i = 0; i < height; ++i) {
+    blended_img[i] = aom_malloc(width * sizeof(**blended_img));
+  }
+
+// Blending and saving result in blended_img.
+#if BGSPRITE_BLENDING_MODE == 1
+  blend_mean(width, height, num_frames, (const YuvPixel ***)pano_stack,
+             blended_img, panorama->flags & YV12_FLAG_HIGHBITDEPTH);
+#else   // BGSPRITE_BLENDING_MODE != 1
+  blend_median(width, height, num_frames, (const YuvPixel ***)pano_stack,
+               blended_img);
+#endif  // BGSPRITE_BLENDING_MODE == 1
+
+  // NOTE(toddnguyen): Right now the ARF in the cpi struct is fixed size at
+  // the same size as the frames. For now, we crop the generated panorama.
+  assert(panorama->y_width <= width && panorama->y_height <= height);
+
+  // Resamples the blended_img into a frame-sized 'bgsprite' buffer.
+  YV12_BUFFER_CONFIG bgsprite;
+  memset(&bgsprite, 0, sizeof(bgsprite));
+  aom_alloc_frame_buffer(&bgsprite, frames[0]->y_width, frames[0]->y_height,
+                         frames[0]->subsampling_x, frames[0]->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+                         frames[0]->flags & YV12_FLAG_HIGHBITDEPTH,
+#endif
+                         frames[0]->border, 0);
+  aom_yv12_copy_frame(frames[0], &bgsprite);
+  bgsprite.bit_depth = frames[0]->bit_depth;
+  resample_panorama(blended_img, center_idx, x_min, y_min, pano_x_min,
+                    pano_x_max, pano_y_min, pano_y_max, &bgsprite);
+
+#if BGSPRITE_ENABLE_SEGMENTATION
+  YV12_BUFFER_CONFIG temporal_bgsprite;
+  memset(&temporal_bgsprite, 0, sizeof(temporal_bgsprite));
+  aom_alloc_frame_buffer(&temporal_bgsprite, frames[0]->y_width,
+                         frames[0]->y_height, frames[0]->subsampling_x,
+                         frames[0]->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+                         frames[0]->flags & YV12_FLAG_HIGHBITDEPTH,
+#endif
+                         frames[0]->border, 0);
+  aom_yv12_copy_frame(frames[0], &temporal_bgsprite);
+  temporal_bgsprite.bit_depth = frames[0]->bit_depth;
+
+  av1_temporal_filter(cpi, &bgsprite, &temporal_bgsprite, distance);
+
+  // Block size constants for gaussian model.
+  const int N_1 = 2;
+  const int y_block_height = (height / N_1) + (height % N_1 != 0 ? 1 : 0);
+  const int x_block_width = (width / N_1) + (width % N_1 != 0 ? 1 : 0);
+  YuvPixelGaussian **gauss = aom_malloc(y_block_height * sizeof(*gauss));
+  for (int i = 0; i < y_block_height; ++i) {
+    gauss[i] = aom_calloc(x_block_width, sizeof(**gauss));
+  }
+
+  // Build Gaussian model.
+  build_gaussian((const YuvPixel ***)pano_stack, num_frames, width, height,
+                 x_block_width, y_block_height, N_1, gauss);
+
+  // Select background model and build foreground mask.
+  uint8_t **mask = aom_malloc(y_block_height * sizeof(*mask));
+  for (int i = 0; i < y_block_height; ++i) {
+    mask[i] = aom_calloc(x_block_width, sizeof(**mask));
+  }
+
+  const int x_offset = -pano_x_min;
+  const int y_offset = -pano_y_min;
+  build_mask(x_min[center_idx], y_min[center_idx], x_offset, y_offset,
+             x_block_width, y_block_height, N_1,
+             (const YuvPixelGaussian **)gauss,
+             (YV12_BUFFER_CONFIG * const) frames[center_idx], panorama, mask);
+
+  YV12_BUFFER_CONFIG temporal_arf;
+  memset(&temporal_arf, 0, sizeof(temporal_arf));
+  aom_alloc_frame_buffer(&temporal_arf, frames[0]->y_width, frames[0]->y_height,
+                         frames[0]->subsampling_x, frames[0]->subsampling_y,
+#if CONFIG_HIGHBITDEPTH
+                         frames[0]->flags & YV12_FLAG_HIGHBITDEPTH,
+#endif
+                         frames[0]->border, 0);
+  aom_yv12_copy_frame(frames[0], &temporal_arf);
+  temporal_arf.bit_depth = frames[0]->bit_depth;
+  av1_temporal_filter(cpi, NULL, &temporal_arf, distance);
+
+  combine_arf(&temporal_arf, &temporal_bgsprite, mask, N_1, x_offset, y_offset,
+              panorama);
+
+  aom_free_frame_buffer(&temporal_arf);
+  aom_free_frame_buffer(&temporal_bgsprite);
+  for (int i = 0; i < y_block_height; ++i) {
+    aom_free(gauss[i]);
+    aom_free(mask[i]);
+  }
+  aom_free(gauss);
+  aom_free(mask);
+#else   // !BGSPRITE_ENABLE_SEGMENTATION
+  av1_temporal_filter(cpi, &bgsprite, panorama, distance);
+#endif  // BGSPRITE_ENABLE_SEGMENTATION
+
+  aom_free_frame_buffer(&bgsprite);
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      aom_free(pano_stack[i][j]);
+    }
+    aom_free(pano_stack[i]);
+    aom_free(blended_img[i]);
+  }
+  aom_free(pano_stack);
+  aom_free(blended_img);
 }
 
 int av1_background_sprite(AV1_COMP *cpi, int distance) {
+#if BGSPRITE_ENABLE_METRICS
+  // Do temporal filter if firstpass stats disable bgsprite.
+  if (!cpi->bgsprite_allowed) {
+    return 1;
+  }
+#endif  // BGSPRITE_ENABLE_METRICS
+
   YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
   static const double identity_params[MAX_PARAMDIM - 1] = {
     0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0
@@ -626,7 +1139,6 @@ int av1_background_sprite(AV1_COMP *cpi, int distance) {
 #if CONFIG_EXT_REFS
   const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
   if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) {
-    cpi->alt_ref_buffer = av1_lookahead_peek(cpi->lookahead, distance)->img;
     cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 1;
     frames_fwd = 0;
     frames_bwd = 0;
@@ -646,17 +1158,6 @@ int av1_background_sprite(AV1_COMP *cpi, int distance) {
     frames[frames_to_stitch - 1 - frame] = &buf->img;
   }
 
-  YV12_BUFFER_CONFIG temp_bg;
-  memset(&temp_bg, 0, sizeof(temp_bg));
-  aom_alloc_frame_buffer(&temp_bg, frames[0]->y_width, frames[0]->y_height,
-                         frames[0]->subsampling_x, frames[0]->subsampling_y,
-#if CONFIG_HIGHBITDEPTH
-                         frames[0]->flags & YV12_FLAG_HIGHBITDEPTH,
-#endif
-                         frames[0]->border, 0);
-  aom_yv12_copy_frame(frames[0], &temp_bg);
-  temp_bg.bit_depth = frames[0]->bit_depth;
-
   // Allocate empty arrays for parameters between frames.
   double **params = aom_malloc(frames_to_stitch * sizeof(*params));
   for (int i = 0; i < frames_to_stitch; ++i) {
@@ -664,9 +1165,10 @@ int av1_background_sprite(AV1_COMP *cpi, int distance) {
     memcpy(params[i], identity_params, sizeof(identity_params));
   }
 
-  // Use global motion to find affine transformations between frames.
-  // params[i] will have the transform from frame[i] to frame[i-1].
-  // params[0] will have the identity matrix because it has no previous frame.
+// Use global motion to find affine transformations between frames.
+// params[i] will have the transform from frame[i] to frame[i-1].
+// params[0] will have the identity matrix (has no previous frame).
+#if BGSPRITE_ENABLE_GME
   TransformationType model = AFFINE;
   int inliers_by_motion[RANSAC_NUM_MOTIONS];
   for (int frame = 0; frame < frames_to_stitch - 1; ++frame) {
@@ -686,6 +1188,7 @@ int av1_background_sprite(AV1_COMP *cpi, int distance) {
       return 1;
     }
   }
+#endif  // BGSPRITE_ENABLE_GME
 
   // Compound the transformation parameters.
   for (int i = 1; i < frames_to_stitch; ++i) {
@@ -702,7 +1205,7 @@ int av1_background_sprite(AV1_COMP *cpi, int distance) {
   int *y_max = aom_malloc(frames_to_stitch * sizeof(*y_max));
   int *y_min = aom_malloc(frames_to_stitch * sizeof(*y_min));
 
-  find_limits(cpi->initial_width, cpi->initial_height,
+  find_limits(frames[0]->y_width, frames[0]->y_height,
               (const double **const)params, frames_to_stitch, x_min, x_max,
               y_min, y_max, &pano_x_min, &pano_x_max, &pano_y_min, &pano_y_max);
 
@@ -721,20 +1224,17 @@ int av1_background_sprite(AV1_COMP *cpi, int distance) {
   }
 
   // Recompute frame limits for new adjusted center.
-  find_limits(cpi->initial_width, cpi->initial_height,
+  find_limits(frames[0]->y_width, frames[0]->y_height,
               (const double **const)params, frames_to_stitch, x_min, x_max,
               y_min, y_max, &pano_x_min, &pano_x_max, &pano_y_min, &pano_y_max);
 
-  // Stitch Images.
-  stitch_images(frames, frames_to_stitch, center_idx,
+  // Stitch Images and apply bgsprite filter.
+  stitch_images(cpi, frames, frames_to_stitch, distance, center_idx,
                 (const double **const)params, x_min, x_max, y_min, y_max,
-                pano_x_min, pano_x_max, pano_y_min, pano_y_max, &temp_bg);
-
-  // Apply temporal filter.
-  av1_temporal_filter(cpi, &temp_bg, distance);
+                pano_x_min, pano_x_max, pano_y_min, pano_y_max,
+                &cpi->alt_ref_buffer);
 
   // Free memory.
-  aom_free_frame_buffer(&temp_bg);
   for (int i = 0; i < frames_to_stitch; ++i) {
     aom_free(params[i]);
   }
@@ -746,3 +1246,12 @@ int av1_background_sprite(AV1_COMP *cpi, int distance) {
 
   return 0;
 }
+
+#undef _POSIX_C_SOURCE
+#undef BGSPRITE_BLENDING_MODE
+#undef BGSPRITE_INTERPOLATION
+#undef BGSPRITE_ENABLE_METRICS
+#undef BGSPRITE_ENABLE_SEGMENTATION
+#undef BGSPRITE_ENABLE_GME
+#undef BGSPRITE_MASK_BLOCK_SIZE
+#undef TRANSFORM_MAT_DIM
diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c
index 2e0abc186..08f605f10 100644
--- a/third_party/aom/av1/encoder/bitstream.c
+++ b/third_party/aom/av1/encoder/bitstream.c
@@ -14,9 +14,9 @@
 #include <stdio.h>
 
 #include "aom/aom_encoder.h"
-#include "aom_dsp/bitwriter_buffer.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/bitwriter_buffer.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem_ops.h"
 #include "aom_ports/system_state.h"
@@ -40,9 +40,6 @@
 #include "av1/common/seg_common.h"
 #include "av1/common/tile_common.h"
 
-#if CONFIG_ANS
-#include "aom_dsp/buf_ans.h"
-#endif  // CONFIG_ANS
 #if CONFIG_LV_MAP
 #include "av1/encoder/encodetxb.h"
 #endif  // CONFIG_LV_MAP
@@ -50,9 +47,9 @@
 #include "av1/encoder/cost.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/mcomp.h"
-#if CONFIG_PALETTE && CONFIG_PALETTE_DELTA_ENCODING
+#if CONFIG_PALETTE_DELTA_ENCODING
 #include "av1/encoder/palette.h"
-#endif  // CONFIG_PALETTE && CONFIG_PALETTE_DELTA_ENCODING
+#endif  // CONFIG_PALETTE_DELTA_ENCODING
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/subexp.h"
 #include "av1/encoder/tokenize.h"
@@ -62,12 +59,13 @@
 
 #define ENC_MISMATCH_DEBUG 0
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
 static struct av1_token
     inter_singleref_comp_mode_encodings[INTER_SINGLEREF_COMP_MODES];
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
 
-#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
+// TODO(anybody) : remove this flag when PVQ supports palette coding tool
+#if !CONFIG_PVQ || CONFIG_EXT_INTRA
 static INLINE void write_uniform(aom_writer *w, int n, int v) {
   const int l = get_unsigned_bits(n);
   const int m = (1 << l) - n;
@@ -79,63 +77,47 @@ static INLINE void write_uniform(aom_writer *w, int n, int v) {
     aom_write_literal(w, (v - m) & 1, 1);
   }
 }
-#endif  // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
+#endif  // !CONFIG_PVQ || CONFIG_EXT_INTRA
 
-#if CONFIG_EXT_TX
-static struct av1_token ext_tx_inter_encodings[EXT_TX_SETS_INTER][TX_TYPES];
-static struct av1_token ext_tx_intra_encodings[EXT_TX_SETS_INTRA][TX_TYPES];
-#else
-static struct av1_token ext_tx_encodings[TX_TYPES];
-#endif  // CONFIG_EXT_TX
 #if CONFIG_EXT_INTRA
 #if CONFIG_INTRA_INTERP
 static struct av1_token intra_filter_encodings[INTRA_FILTERS];
 #endif  // CONFIG_INTRA_INTERP
 #endif  // CONFIG_EXT_INTRA
-#if CONFIG_EXT_INTER
 #if CONFIG_INTERINTRA
 static struct av1_token interintra_mode_encodings[INTERINTRA_MODES];
 #endif
 #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
 static struct av1_token compound_type_encodings[COMPOUND_TYPES];
 #endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
-#endif  // CONFIG_EXT_INTER
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-static struct av1_token ncobmc_mode_encodings[MAX_NCOBMC_MODES];
-#endif
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 #if CONFIG_LOOP_RESTORATION
 static struct av1_token switchable_restore_encodings[RESTORE_SWITCHABLE_TYPES];
+static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
+                                             MACROBLOCKD *xd,
+                                             aom_writer *const w, int plane,
+                                             int rtile_idx);
 #endif  // CONFIG_LOOP_RESTORATION
-static void write_uncompressed_header(AV1_COMP *cpi,
-                                      struct aom_write_bit_buffer *wb);
+#if CONFIG_OBU
+static void write_uncompressed_header_obu(AV1_COMP *cpi,
+                                          struct aom_write_bit_buffer *wb);
+#else
+static void write_uncompressed_header_frame(AV1_COMP *cpi,
+                                            struct aom_write_bit_buffer *wb);
+#endif
+
 static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data);
+
+#if !CONFIG_OBU || CONFIG_EXT_TILE
 static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
                        const uint32_t data_size, const uint32_t max_tile_size,
                        const uint32_t max_tile_col_size,
                        int *const tile_size_bytes,
                        int *const tile_col_size_bytes);
-
+#endif
 void av1_encode_token_init(void) {
-#if CONFIG_EXT_TX
-  int s;
-#endif  // CONFIG_EXT_TX
-#if CONFIG_EXT_TX
-  for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
-    av1_tokens_from_tree(ext_tx_inter_encodings[s], av1_ext_tx_inter_tree[s]);
-  }
-  for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
-    av1_tokens_from_tree(ext_tx_intra_encodings[s], av1_ext_tx_intra_tree[s]);
-  }
-#else
-  av1_tokens_from_tree(ext_tx_encodings, av1_ext_tx_tree);
-#endif  // CONFIG_EXT_TX
-
 #if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
   av1_tokens_from_tree(intra_filter_encodings, av1_intra_filter_tree);
 #endif  // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP
-#if CONFIG_EXT_INTER
 #if CONFIG_INTERINTRA
   av1_tokens_from_tree(interintra_mode_encodings, av1_interintra_mode_tree);
 #endif  // CONFIG_INTERINTRA
@@ -146,35 +128,10 @@ void av1_encode_token_init(void) {
 #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
   av1_tokens_from_tree(compound_type_encodings, av1_compound_type_tree);
 #endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
-#endif  // CONFIG_EXT_INTER
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  av1_tokens_from_tree(ncobmc_mode_encodings, av1_ncobmc_mode_tree);
-#endif
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 #if CONFIG_LOOP_RESTORATION
   av1_tokens_from_tree(switchable_restore_encodings,
                        av1_switchable_restore_tree);
 #endif  // CONFIG_LOOP_RESTORATION
-
-  /* This hack is necessary when CONFIG_DUAL_FILTER is enabled because the five
-      SWITCHABLE_FILTERS are not consecutive, e.g., 0, 1, 2, 3, 4, when doing
-      an in-order traversal of the av1_switchable_interp_tree structure. */
-  av1_indices_from_tree(av1_switchable_interp_ind, av1_switchable_interp_inv,
-                        av1_switchable_interp_tree);
-/* This hack is necessary because the four TX_TYPES are not consecutive,
-    e.g., 0, 1, 2, 3, when doing an in-order traversal of the av1_ext_tx_tree
-    structure. */
-#if CONFIG_EXT_TX
-  for (s = 1; s < EXT_TX_SETS_INTRA; ++s)
-    av1_indices_from_tree(av1_ext_tx_intra_ind[s], av1_ext_tx_intra_inv[s],
-                          av1_ext_tx_intra_tree[s]);
-  for (s = 1; s < EXT_TX_SETS_INTER; ++s)
-    av1_indices_from_tree(av1_ext_tx_inter_ind[s], av1_ext_tx_inter_inv[s],
-                          av1_ext_tx_inter_tree[s]);
-#else
-  av1_indices_from_tree(av1_ext_tx_ind, av1_ext_tx_inv, av1_ext_tx_tree);
-#endif
 }
 
 static void write_intra_mode_kf(const AV1_COMMON *cm, FRAME_CONTEXT *frame_ctx,
@@ -184,7 +141,7 @@ static void write_intra_mode_kf(const AV1_COMMON *cm, FRAME_CONTEXT *frame_ctx,
 #if CONFIG_INTRABC
   assert(!is_intrabc_block(&mi->mbmi));
 #endif  // CONFIG_INTRABC
-  aom_write_symbol(w, av1_intra_mode_ind[mode],
+  aom_write_symbol(w, mode,
                    get_y_mode_cdf(frame_ctx, mi, above_mi, left_mi, block),
                    INTRA_MODES);
   (void)cm;
@@ -234,16 +191,12 @@ static void write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi,
 
   assert(mbmi->ref_mv_idx < 3);
 
-#if CONFIG_EXT_INTER
 #if CONFIG_COMPOUND_SINGLEREF
   if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV ||
       mbmi->mode == SR_NEW_NEWMV) {
 #else   // !CONFIG_COMPOUND_SINGLEREF
   if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
 #endif  // CONFIG_COMPOUND_SINGLEREF
-#else   // !CONFIG_EXT_INTER
-  if (mbmi->mode == NEWMV) {
-#endif  // CONFIG_EXT_INTER
     int idx;
     for (idx = 0; idx < 2; ++idx) {
       if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
@@ -282,7 +235,6 @@ static void write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi,
   }
 }
 
-#if CONFIG_EXT_INTER
 static void write_inter_compound_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
                                       aom_writer *w, PREDICTION_MODE mode,
                                       const int16_t mode_ctx) {
@@ -305,30 +257,12 @@ static void write_inter_singleref_comp_mode(MACROBLOCKD *xd, aom_writer *w,
                    inter_singleref_comp_cdf, INTER_SINGLEREF_COMP_MODES);
 }
 #endif  // CONFIG_COMPOUND_SINGLEREF
-#endif  // CONFIG_EXT_INTER
 
 static void encode_unsigned_max(struct aom_write_bit_buffer *wb, int data,
                                 int max) {
   aom_wb_write_literal(wb, data, get_unsigned_bits(max));
 }
 
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-static void prob_diff_update(const aom_tree_index *tree,
-                             aom_prob probs[/*n - 1*/],
-                             const unsigned int counts[/* n */], int n,
-                             int probwt, aom_writer *w) {
-  int i;
-  unsigned int branch_ct[32][2];
-
-  // Assuming max number of probabilities <= 32
-  assert(n <= 32);
-
-  av1_tree_probs_from_distribution(tree, branch_ct, counts);
-  for (i = 0; i < n - 1; ++i)
-    av1_cond_prob_diff_update(w, &probs[i], branch_ct[i], probwt);
-}
-#endif
-
 #if CONFIG_VAR_TX
 static void write_tx_size_vartx(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                 const MB_MODE_INFO *mbmi, TX_SIZE tx_size,
@@ -381,7 +315,7 @@ static void write_tx_size_vartx(const AV1_COMMON *cm, MACROBLOCKD *xd,
     aom_write(w, 1, cm->fc->txfm_partition_prob[ctx]);
 #endif
 
-    if (tx_size == TX_8X8) {
+    if (sub_txs == TX_4X4) {
       txfm_partition_update(xd->above_txfm_context + blk_col,
                             xd->left_txfm_context + blk_row, sub_txs, tx_size);
       return;
@@ -406,7 +340,7 @@ static void update_txfm_partition_probs(AV1_COMMON *cm, aom_writer *w,
                               counts->txfm_partition[k], probwt);
 }
 #endif  // CONFIG_NEW_MULTISYMBOL
-#endif
+#endif  // CONFIG_VAR_TX
 
 static void write_selected_tx_size(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                                    aom_writer *w) {
@@ -414,17 +348,12 @@ static void write_selected_tx_size(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   const BLOCK_SIZE bsize = mbmi->sb_type;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   (void)cm;
-// For sub8x8 blocks the tx_size symbol does not need to be sent
-#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX
-  if (bsize > BLOCK_4X4) {
-#else
-  if (bsize >= BLOCK_8X8) {
-#endif
+  if (block_signals_txsize(bsize)) {
     const TX_SIZE tx_size = mbmi->tx_size;
     const int is_inter = is_inter_block(mbmi);
     const int tx_size_ctx = get_tx_size_context(xd);
-    const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
-                                     : intra_tx_size_cat_lookup[bsize];
+    const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
+                                         : intra_tx_size_cat_lookup[bsize];
     const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
     const int depth = tx_size_to_depth(coded_tx_size);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
@@ -435,9 +364,14 @@ static void write_selected_tx_size(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                      tx_size_cat + 2);
 #if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
     if (is_quarter_tx_allowed(xd, mbmi, is_inter) && tx_size != coded_tx_size)
+#if CONFIG_NEW_MULTISYMBOL
+      aom_write_symbol(w, tx_size == quarter_txsize_lookup[bsize],
+                       cm->fc->quarter_tx_size_cdf, 2);
+#else
       aom_write(w, tx_size == quarter_txsize_lookup[bsize],
                 cm->fc->quarter_tx_size_prob);
 #endif
+#endif
   }
 }
 
@@ -496,14 +430,12 @@ static void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd,
                               const MODE_INFO *mi, aom_writer *w) {
   const MB_MODE_INFO *mbmi = &mi->mbmi;
 
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  MOTION_MODE last_motion_mode_allowed =
-      motion_mode_allowed_wrapper(0,
-#if CONFIG_GLOBAL_MOTION
-                                  0, cm->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION
-                                  mi);
-#else
+#if !CONFIG_GLOBAL_MOTION
+  // The cm parameter is only used with global_motion or with
+  // motion_var and warped_motion. In other cases, explicitly ignore
+  // it to avoid a compiler warning.
+  (void)cm;
+#endif
   MOTION_MODE last_motion_mode_allowed = motion_mode_allowed(
 #if CONFIG_GLOBAL_MOTION
       0, cm->global_motion,
@@ -512,9 +444,18 @@ static void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd,
       xd,
 #endif
       mi);
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
   if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return;
 #if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+  if (last_motion_mode_allowed == NCOBMC_ADAPT_WEIGHT) {
+    aom_write_symbol(w, mbmi->motion_mode,
+                     xd->tile_ctx->ncobmc_cdf[mbmi->sb_type],
+                     OBMC_FAMILY_MODES);
+  } else if (last_motion_mode_allowed == OBMC_CAUSAL) {
+    aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL,
+                     xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2);
+  } else {
+#else
   if (last_motion_mode_allowed == OBMC_CAUSAL) {
 #if CONFIG_NEW_MULTISYMBOL
     aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL,
@@ -524,6 +465,7 @@ static void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd,
               cm->fc->obmc_prob[mbmi->sb_type]);
 #endif
   } else {
+#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
 #endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
     aom_write_symbol(w, mbmi->motion_mode,
                      xd->tile_ctx->motion_mode_cdf[mbmi->sb_type],
@@ -540,30 +482,16 @@ static void write_ncobmc_mode(MACROBLOCKD *xd, const MODE_INFO *mi,
   ADAPT_OVERLAP_BLOCK ao_block = adapt_overlap_block_lookup[mbmi->sb_type];
   if (mbmi->motion_mode != NCOBMC_ADAPT_WEIGHT) return;
 
-#ifndef TRAINING_WEIGHTS
   aom_write_symbol(w, mbmi->ncobmc_mode[0],
                    xd->tile_ctx->ncobmc_mode_cdf[ao_block], MAX_NCOBMC_MODES);
   if (mi_size_wide[mbmi->sb_type] != mi_size_high[mbmi->sb_type]) {
     aom_write_symbol(w, mbmi->ncobmc_mode[1],
                      xd->tile_ctx->ncobmc_mode_cdf[ao_block], MAX_NCOBMC_MODES);
   }
-#else
-  int block;
-  for (block = 0; block < 4; ++block)
-    aom_write_symbol(w, mbmi->ncobmc_mode[0][block],
-                     xd->tile_ctx->ncobmc_mode_cdf[ao_block], MAX_NCOBMC_MODES);
-  if (mi_size_wide[mbmi->sb_type] != mi_size_high[mbmi->sb_type]) {
-    for (block = 0; block < 4; ++block)
-      aom_write_symbol(w, mbmi->ncobmc_mode[1][block],
-                       xd->tile_ctx->ncobmc_mode_cdf[ao_block],
-                       MAX_NCOBMC_MODES);
-  }
-#endif
 }
 #endif
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 
-#if CONFIG_DELTA_Q
 static void write_delta_qindex(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                                int delta_qindex, aom_writer *w) {
   int sign = delta_qindex < 0;
@@ -579,7 +507,7 @@ static void write_delta_qindex(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   if (!smallval) {
     rem_bits = OD_ILOG_NZ(abs - 1) - 1;
     thr = (1 << rem_bits) + 1;
-    aom_write_literal(w, rem_bits, 3);
+    aom_write_literal(w, rem_bits - 1, 3);
     aom_write_literal(w, abs - thr, rem_bits);
   }
   if (abs > 0) {
@@ -589,6 +517,9 @@ static void write_delta_qindex(const AV1_COMMON *cm, const MACROBLOCKD *xd,
 
 #if CONFIG_EXT_DELTA_Q
 static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+#if CONFIG_LOOPFILTER_LEVEL
+                                int lf_id,
+#endif
                                 int delta_lflevel, aom_writer *w) {
   int sign = delta_lflevel < 0;
   int abs = sign ? -delta_lflevel : delta_lflevel;
@@ -597,13 +528,24 @@ static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   (void)cm;
 
+#if CONFIG_LOOPFILTER_LEVEL
+  if (cm->delta_lf_multi) {
+    assert(lf_id >= 0 && lf_id < FRAME_LF_COUNT);
+    aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL),
+                     ec_ctx->delta_lf_multi_cdf[lf_id], DELTA_LF_PROBS + 1);
+  } else {
+    aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf,
+                     DELTA_LF_PROBS + 1);
+  }
+#else
   aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf,
                    DELTA_LF_PROBS + 1);
+#endif  // CONFIG_LOOPFILTER_LEVEL
 
   if (!smallval) {
     rem_bits = OD_ILOG_NZ(abs - 1) - 1;
     thr = (1 << rem_bits) + 1;
-    aom_write_literal(w, rem_bits, 3);
+    aom_write_literal(w, rem_bits - 1, 3);
     aom_write_literal(w, abs - thr, rem_bits);
   }
   if (abs > 0) {
@@ -611,7 +553,6 @@ static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   }
 }
 #endif  // CONFIG_EXT_DELTA_Q
-#endif  // CONFIG_DELTA_Q
 
 #if !CONFIG_NEW_MULTISYMBOL
 static void update_skip_probs(AV1_COMMON *cm, aom_writer *w,
@@ -625,20 +566,21 @@ static void update_skip_probs(AV1_COMMON *cm, aom_writer *w,
 }
 #endif
 
-#if CONFIG_PALETTE
-static void pack_palette_tokens(aom_writer *w, const TOKENEXTRA **tp, int n,
-                                int num) {
+// TODO(anybody) : remove this flag when PVQ supports palette coding tool
+#if !CONFIG_PVQ
+static void pack_map_tokens(aom_writer *w, const TOKENEXTRA **tp, int n,
+                            int num) {
   const TOKENEXTRA *p = *tp;
   write_uniform(w, n, p->token);  // The first color index.
   ++p;
   --num;
   for (int i = 0; i < num; ++i) {
-    aom_write_symbol(w, p->token, p->palette_cdf, n);
+    aom_write_symbol(w, p->token, p->color_map_cdf, n);
     ++p;
   }
   *tp = p;
 }
-#endif  // CONFIG_PALETTE
+#endif  // !CONFIG_PVQ
 
 #if !CONFIG_PVQ
 #if CONFIG_SUPERTX
@@ -667,6 +609,7 @@ static void update_supertx_probs(AV1_COMMON *cm, int probwt, aom_writer *w) {
 }
 #endif  // CONFIG_SUPERTX
 
+#if !CONFIG_LV_MAP
 #if CONFIG_NEW_MULTISYMBOL
 static INLINE void write_coeff_extra(const aom_cdf_prob *const *cdf, int val,
                                      int n, aom_writer *w) {
@@ -693,12 +636,14 @@ static INLINE void write_coeff_extra(const aom_prob *pb, int value,
     aom_write_record(w, bb, pb[index], token_stats);
   }
 }
-#endif
+#endif  // CONFIG_NEW_MULTISYMBOL
 
-#if !CONFIG_LV_MAP
 static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp,
                            const TOKENEXTRA *const stop,
                            aom_bit_depth_t bit_depth, const TX_SIZE tx_size,
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                           TX_TYPE tx_type, int is_inter,
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
                            TOKEN_STATS *token_stats) {
   const TOKENEXTRA *p = *tp;
 #if CONFIG_VAR_TX
@@ -706,6 +651,17 @@ static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp,
   const int seg_eob = tx_size_2d[tx_size];
 #endif
 
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+  if (tx_type == MRC_DCT && ((is_inter && SIGNAL_MRC_MASK_INTER) ||
+                             (!is_inter && SIGNAL_MRC_MASK_INTRA))) {
+    int rows = tx_size_high[tx_size];
+    int cols = tx_size_wide[tx_size];
+    assert(tx_size == TX_32X32);
+    assert(p < stop);
+    pack_map_tokens(w, &p, 2, rows * cols);
+  }
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+
   while (p < stop && p->token != EOSB_TOKEN) {
     const int token = p->token;
     const int eob_val = p->eob_val;
@@ -949,6 +905,10 @@ static void pack_txb_tokens(aom_writer *w, const TOKENEXTRA **tp,
   TX_SIZE plane_tx_size;
   const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
   const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+  TX_TYPE tx_type = av1_get_tx_type(plane ? PLANE_TYPE_UV : PLANE_TYPE_Y, xd,
+                                    blk_row, blk_col, block, tx_size);
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
@@ -960,7 +920,11 @@ static void pack_txb_tokens(aom_writer *w, const TOKENEXTRA **tp,
     TOKEN_STATS tmp_token_stats;
     init_token_stats(&tmp_token_stats);
 #if !CONFIG_PVQ
-    pack_mb_tokens(w, tp, tok_end, bit_depth, tx_size, &tmp_token_stats);
+    pack_mb_tokens(w, tp, tok_end, bit_depth, tx_size,
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                   tx_type, is_inter_block(mbmi),
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                   &tmp_token_stats);
 #else
     pack_pvq_tokens(w, x, xd, plane, bsize, tx_size);
 #endif
@@ -1020,9 +984,13 @@ static void write_segment_id(aom_writer *w, const struct segmentation *seg,
 #if CONFIG_NEW_MULTISYMBOL
 #define WRITE_REF_BIT(bname, pname) \
   aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(cm, xd), 2)
+#define WRITE_REF_BIT2(bname, pname) \
+  aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2)
 #else
 #define WRITE_REF_BIT(bname, pname) \
   aom_write(w, bname, av1_get_pred_prob_##pname(cm, xd))
+#define WRITE_REF_BIT2(bname, pname) \
+  aom_write(w, bname, av1_get_pred_prob_##pname(cm, xd))
 #endif
 
 // This function encodes the reference frame
@@ -1042,14 +1010,12 @@ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd,
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
-#if !SUB8X8_COMP_REF
-      if (mbmi->sb_type != BLOCK_4X4)
-#endif
+      if (is_comp_ref_allowed(mbmi->sb_type))
 #if CONFIG_NEW_MULTISYMBOL
         aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(cm, xd), 2);
 #else
-      aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd));
-#endif
+        aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd));
+#endif  // CONFIG_NEW_MULTISYMBOL
     } else {
       assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE));
     }
@@ -1064,7 +1030,12 @@ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd,
       if ((L_OR_L2(cm) || L3_OR_G(cm)) && BWD_OR_ALT(cm))
         if (L_AND_L2(cm) || L_AND_L3(cm) || L_AND_G(cm) || BWD_AND_ALT(cm))
 #endif  // CONFIG_VAR_REFS
-          aom_write(w, comp_ref_type, av1_get_comp_reference_type_prob(cm, xd));
+#if CONFIG_NEW_MULTISYMBOL
+          aom_write_symbol(w, comp_ref_type,
+                           av1_get_comp_reference_type_cdf(xd), 2);
+#else
+      aom_write(w, comp_ref_type, av1_get_comp_reference_type_prob(cm, xd));
+#endif
 #if CONFIG_VAR_REFS
         else
           assert(comp_ref_type == BIDIR_COMP_REFERENCE);
@@ -1081,7 +1052,7 @@ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd,
 #if CONFIG_VAR_REFS
         if ((L_AND_L2(cm) || L_AND_L3(cm) || L_AND_G(cm)) && BWD_AND_ALT(cm))
 #endif  // CONFIG_VAR_REFS
-          aom_write(w, bit, av1_get_pred_prob_uni_comp_ref_p(cm, xd));
+          WRITE_REF_BIT2(bit, uni_comp_ref_p);
 
         if (!bit) {
           assert(mbmi->ref_frame[0] == LAST_FRAME);
@@ -1090,14 +1061,13 @@ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd,
 #endif  // CONFIG_VAR_REFS
             const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME ||
                              mbmi->ref_frame[1] == GOLDEN_FRAME;
-            aom_write(w, bit1, av1_get_pred_prob_uni_comp_ref_p1(cm, xd));
-
+            WRITE_REF_BIT2(bit1, uni_comp_ref_p1);
             if (bit1) {
 #if CONFIG_VAR_REFS
               if (L_AND_L3(cm) && L_AND_G(cm)) {
 #endif  // CONFIG_VAR_REFS
                 const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME;
-                aom_write(w, bit2, av1_get_pred_prob_uni_comp_ref_p2(cm, xd));
+                WRITE_REF_BIT2(bit2, uni_comp_ref_p2);
 #if CONFIG_VAR_REFS
               }
 #endif  // CONFIG_VAR_REFS
@@ -1147,11 +1117,20 @@ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd,
       }
 
 #if CONFIG_VAR_REFS
-      // Test need to explicitly code (BWD) vs (ALT) branch node in tree
-      if (BWD_AND_ALT(cm)) {
+      // Test need to explicitly code (BWD,ALT2) vs (ALT) branch node in tree
+      if (BWD_OR_ALT2(cm) && ALTREF_IS_VALID(cm)) {
 #endif  // CONFIG_VAR_REFS
         const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME;
         WRITE_REF_BIT(bit_bwd, comp_bwdref_p);
+
+        if (!bit_bwd) {
+#if CONFIG_VAR_REFS
+          // Test need to explicitly code (BWD) vs (ALT2) branch node in
+          // tree
+          if (BWD_AND_ALT2(cm))
+#endif  // CONFIG_VAR_REFS
+            WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1);
+        }
 #if CONFIG_VAR_REFS
       }
 #endif  // CONFIG_VAR_REFS
@@ -1162,22 +1141,31 @@ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd,
 #endif  // CONFIG_EXT_REFS
     } else {
 #if CONFIG_EXT_REFS
-      const int bit0 = (mbmi->ref_frame[0] == ALTREF_FRAME ||
-                        mbmi->ref_frame[0] == BWDREF_FRAME);
+      const int bit0 = (mbmi->ref_frame[0] <= ALTREF_FRAME &&
+                        mbmi->ref_frame[0] >= BWDREF_FRAME);
 #if CONFIG_VAR_REFS
-      // Test need to explicitly code (L,L2,L3,G) vs (BWD,ALT) branch node in
-      // tree
-      if ((L_OR_L2(cm) || L3_OR_G(cm)) && BWD_OR_ALT(cm))
+      // Test need to explicitly code (L,L2,L3,G) vs (BWD,ALT2,ALT) branch node
+      // in tree
+      if ((L_OR_L2(cm) || L3_OR_G(cm)) &&
+          (BWD_OR_ALT2(cm) || ALTREF_IS_VALID(cm)))
 #endif  // CONFIG_VAR_REFS
         WRITE_REF_BIT(bit0, single_ref_p1);
 
       if (bit0) {
 #if CONFIG_VAR_REFS
-        // Test need to explicitly code (BWD) vs (ALT) branch node in tree
-        if (BWD_AND_ALT(cm)) {
+        // Test need to explicitly code (BWD,ALT2) vs (ALT) branch node in tree
+        if (BWD_OR_ALT2(cm) && ALTREF_IS_VALID(cm)) {
 #endif  // CONFIG_VAR_REFS
           const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME;
           WRITE_REF_BIT(bit1, single_ref_p2);
+
+          if (!bit1) {
+#if CONFIG_VAR_REFS
+            // Test need to explicitly code (BWD) vs (ALT2) branch node in tree
+            if (BWD_AND_ALT2(cm))
+#endif  // CONFIG_VAR_REFS
+              WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6);
+          }
 #if CONFIG_VAR_REFS
         }
 #endif  // CONFIG_VAR_REFS
@@ -1231,11 +1219,7 @@ static void write_filter_intra_mode_info(const AV1_COMMON *const cm,
                                          const MB_MODE_INFO *const mbmi,
                                          int mi_row, int mi_col,
                                          aom_writer *w) {
-  if (mbmi->mode == DC_PRED
-#if CONFIG_PALETTE
-      && mbmi->palette_mode_info.palette_size[0] == 0
-#endif  // CONFIG_PALETTE
-      ) {
+  if (mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0) {
     aom_write(w, mbmi->filter_intra_mode_info.use_filter_intra_mode[0],
               cm->fc->filter_intra_probs[0]);
     if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) {
@@ -1256,11 +1240,8 @@ static void write_filter_intra_mode_info(const AV1_COMMON *const cm,
   (void)mi_col;
 #endif  // CONFIG_CB4X4
 
-  if (mbmi->uv_mode == UV_DC_PRED
-#if CONFIG_PALETTE
-      && mbmi->palette_mode_info.palette_size[1] == 0
-#endif  // CONFIG_PALETTE
-      ) {
+  if (mbmi->uv_mode == UV_DC_PRED &&
+      mbmi->palette_mode_info.palette_size[1] == 0) {
     aom_write(w, mbmi->filter_intra_mode_info.use_filter_intra_mode[1],
               cm->fc->filter_intra_probs[1]);
     if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1]) {
@@ -1312,16 +1293,9 @@ static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd,
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
   if (!av1_is_interp_needed(xd)) {
-#if CONFIG_DUAL_FILTER
-    for (int i = 0; i < 4; ++i)
-      assert(mbmi->interp_filter[i] == (cm->interp_filter == SWITCHABLE
-                                            ? EIGHTTAP_REGULAR
-                                            : cm->interp_filter));
-#else
-    assert(mbmi->interp_filter == (cm->interp_filter == SWITCHABLE
-                                       ? EIGHTTAP_REGULAR
-                                       : cm->interp_filter));
-#endif  // CONFIG_DUAL_FILTER
+    assert(mbmi->interp_filters ==
+           av1_broadcast_interp_filter(
+               av1_unswitchable_filter(cm->interp_filter)));
     return;
   }
   if (cm->interp_filter == SWITCHABLE) {
@@ -1332,26 +1306,28 @@ static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd,
           (mbmi->ref_frame[1] > INTRA_FRAME &&
            has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
         const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
-        aom_write_symbol(w, av1_switchable_interp_ind[mbmi->interp_filter[dir]],
-                         ec_ctx->switchable_interp_cdf[ctx],
+        InterpFilter filter =
+            av1_extract_interp_filter(mbmi->interp_filters, dir);
+        aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx],
                          SWITCHABLE_FILTERS);
-        ++cpi->interp_filter_selected[0][mbmi->interp_filter[dir]];
+        ++cpi->interp_filter_selected[0][filter];
       } else {
-        assert(mbmi->interp_filter[dir] == EIGHTTAP_REGULAR);
+        assert(av1_extract_interp_filter(mbmi->interp_filters, dir) ==
+               EIGHTTAP_REGULAR);
       }
     }
 #else
     {
       const int ctx = av1_get_pred_context_switchable_interp(xd);
-      aom_write_symbol(w, av1_switchable_interp_ind[mbmi->interp_filter],
-                       ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS);
-      ++cpi->interp_filter_selected[0][mbmi->interp_filter];
+      InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, 0);
+      aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx],
+                       SWITCHABLE_FILTERS);
+      ++cpi->interp_filter_selected[0][filter];
     }
 #endif  // CONFIG_DUAL_FILTER
   }
 }
 
-#if CONFIG_PALETTE
 #if CONFIG_PALETTE_DELTA_ENCODING
 // Transmit color values with delta encoding. Write the first value as
 // literal, and the deltas between each value and the previous one. "min_val" is
@@ -1392,10 +1368,8 @@ static void write_palette_colors_y(const MACROBLOCKD *const xd,
                                    const PALETTE_MODE_INFO *const pmi,
                                    int bit_depth, aom_writer *w) {
   const int n = pmi->palette_size[0];
-  const MODE_INFO *const above_mi = xd->above_mi;
-  const MODE_INFO *const left_mi = xd->left_mi;
   uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-  const int n_cache = av1_get_palette_cache(above_mi, left_mi, 0, color_cache);
+  const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
   int out_cache_colors[PALETTE_MAX_SIZE];
   uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
   const int n_out_cache =
@@ -1421,10 +1395,8 @@ static void write_palette_colors_uv(const MACROBLOCKD *const xd,
   const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE;
   const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE;
   // U channel colors.
-  const MODE_INFO *const above_mi = xd->above_mi;
-  const MODE_INFO *const left_mi = xd->left_mi;
   uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-  const int n_cache = av1_get_palette_cache(above_mi, left_mi, 1, color_cache);
+  const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
   int out_cache_colors[PALETTE_MAX_SIZE];
   uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
   const int n_out_cache = av1_index_color_cache(
@@ -1484,6 +1456,9 @@ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
 
+  assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_LARGEST);
+  const int block_palette_idx = bsize - BLOCK_8X8;
+
   if (mbmi->mode == DC_PRED) {
     const int n = pmi->palette_size[0];
     int palette_y_mode_ctx = 0;
@@ -1495,12 +1470,19 @@ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd,
       palette_y_mode_ctx +=
           (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
     }
+#if CONFIG_NEW_MULTISYMBOL
+    aom_write_symbol(
+        w, n > 0,
+        xd->tile_ctx->palette_y_mode_cdf[block_palette_idx][palette_y_mode_ctx],
+        2);
+#else
     aom_write(
         w, n > 0,
-        av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_y_mode_ctx]);
+        av1_default_palette_y_mode_prob[block_palette_idx][palette_y_mode_ctx]);
+#endif
     if (n > 0) {
       aom_write_symbol(w, n - PALETTE_MIN_SIZE,
-                       xd->tile_ctx->palette_y_size_cdf[bsize - BLOCK_8X8],
+                       xd->tile_ctx->palette_y_size_cdf[block_palette_idx],
                        PALETTE_SIZES);
 #if CONFIG_PALETTE_DELTA_ENCODING
       write_palette_colors_y(xd, pmi, cm->bit_depth, w);
@@ -1516,10 +1498,15 @@ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd,
   if (mbmi->uv_mode == UV_DC_PRED) {
     const int n = pmi->palette_size[1];
     const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
+#if CONFIG_NEW_MULTISYMBOL
+    aom_write_symbol(w, n > 0,
+                     xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2);
+#else
     aom_write(w, n > 0, av1_default_palette_uv_mode_prob[palette_uv_mode_ctx]);
+#endif
     if (n > 0) {
       aom_write_symbol(w, n - PALETTE_MIN_SIZE,
-                       xd->tile_ctx->palette_uv_size_cdf[bsize - BLOCK_8X8],
+                       xd->tile_ctx->palette_uv_size_cdf[block_palette_idx],
                        PALETTE_SIZES);
 #if CONFIG_PALETTE_DELTA_ENCODING
       write_palette_colors_uv(xd, pmi, cm->bit_depth, w);
@@ -1538,7 +1525,6 @@ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd,
     }
   }
 }
-#endif  // CONFIG_PALETTE
 
 void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
 #if CONFIG_SUPERTX
@@ -1583,25 +1569,64 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
         !supertx_enabled &&
 #endif  // CONFIG_SUPERTX
         !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+#if CONFIG_MRC_TX
+      if (tx_type == MRC_DCT)
+        assert(mbmi->valid_mrc_mask && "Invalid MRC mask");
+#endif  // CONFIG_MRC_TX
+      const TxSetType tx_set_type = get_ext_tx_set_type(
+          tx_size, bsize, is_inter, cm->reduced_tx_set_used);
       const int eset =
           get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
       // eset == 0 should correspond to a set with only DCT_DCT and there
       // is no need to send the tx_type
       assert(eset > 0);
+      assert(av1_ext_tx_used[tx_set_type][tx_type]);
+#if !CONFIG_LGT_FROM_PRED
       if (is_inter) {
-        assert(ext_tx_used_inter[eset][tx_type]);
-        aom_write_symbol(w, av1_ext_tx_inter_ind[eset][tx_type],
+        aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type],
                          ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
-                         ext_tx_cnt_inter[eset]);
+                         av1_num_ext_tx_set[tx_set_type]);
       } else if (ALLOW_INTRA_EXT_TX) {
-        assert(ext_tx_used_intra[eset][tx_type]);
         aom_write_symbol(
-            w, av1_ext_tx_intra_ind[eset][tx_type],
+            w, av1_ext_tx_ind[tx_set_type][tx_type],
             ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode],
-            ext_tx_cnt_intra[eset]);
+            av1_num_ext_tx_set[tx_set_type]);
       }
-    }
 #else
+      // only signal tx_type when lgt is not allowed or not selected
+      if (is_inter) {
+        if (LGT_FROM_PRED_INTER) {
+          if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used)
+            aom_write(w, mbmi->use_lgt, ec_ctx->inter_lgt_prob[square_tx_size]);
+          if (!mbmi->use_lgt)
+            aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type],
+                             ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
+                             av1_num_ext_tx_set[tx_set_type]);
+        } else {
+          aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type],
+                           ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
+                           av1_num_ext_tx_set[tx_set_type]);
+        }
+      } else if (ALLOW_INTRA_EXT_TX) {
+        if (LGT_FROM_PRED_INTRA) {
+          if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used)
+            aom_write(w, mbmi->use_lgt,
+                      ec_ctx->intra_lgt_prob[square_tx_size][mbmi->mode]);
+          if (!mbmi->use_lgt)
+            aom_write_symbol(
+                w, av1_ext_tx_ind[tx_set_type][tx_type],
+                ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode],
+                av1_num_ext_tx_set[tx_set_type]);
+        } else {
+          aom_write_symbol(
+              w, av1_ext_tx_ind[tx_set_type][tx_type],
+              ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode],
+              av1_num_ext_tx_set[tx_set_type]);
+        }
+      }
+#endif  // CONFIG_LGT_FROM_PRED
+    }
+#else  // CONFIG_EXT_TX
     if (tx_size < TX_32X32 &&
         ((!cm->seg.enabled && cm->base_qindex > 0) ||
          (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
@@ -1627,36 +1652,32 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
 
 static void write_intra_mode(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize,
                              PREDICTION_MODE mode, aom_writer *w) {
-  aom_write_symbol(w, av1_intra_mode_ind[mode],
-                   frame_ctx->y_mode_cdf[size_group_lookup[bsize]],
+  aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]],
                    INTRA_MODES);
 }
 
 static void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx,
                                 UV_PREDICTION_MODE uv_mode,
                                 PREDICTION_MODE y_mode, aom_writer *w) {
-  aom_write_symbol(w, av1_intra_mode_ind[get_uv_mode(uv_mode)],
-                   frame_ctx->uv_mode_cdf[y_mode], UV_INTRA_MODES);
+#if !CONFIG_CFL
+  uv_mode = get_uv_mode(uv_mode);
+#endif
+  aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[y_mode], UV_INTRA_MODES);
 }
 
 #if CONFIG_CFL
-static void write_cfl_alphas(FRAME_CONTEXT *const frame_ctx, int ind,
-                             const CFL_SIGN_TYPE signs[CFL_SIGNS],
-                             aom_writer *w) {
-  // Check for uninitialized signs
-  if (cfl_alpha_codes[ind][CFL_PRED_U] == 0)
-    assert(signs[CFL_PRED_U] == CFL_SIGN_POS);
-  if (cfl_alpha_codes[ind][CFL_PRED_V] == 0)
-    assert(signs[CFL_PRED_V] == CFL_SIGN_POS);
-
-  // Write a symbol representing a combination of alpha Cb and alpha Cr.
-  aom_write_symbol(w, ind, frame_ctx->cfl_alpha_cdf, CFL_ALPHABET_SIZE);
-
-  // Signs are only signaled for nonzero codes.
-  if (cfl_alpha_codes[ind][CFL_PRED_U] != 0)
-    aom_write_bit(w, signs[CFL_PRED_U]);
-  if (cfl_alpha_codes[ind][CFL_PRED_V] != 0)
-    aom_write_bit(w, signs[CFL_PRED_V]);
+static void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, int idx,
+                             int joint_sign, aom_writer *w) {
+  aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS);
+  // Magnitudes are only signaled for nonzero codes.
+  if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
+    aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+    aom_write_symbol(w, CFL_IDX_U(idx), cdf_u, CFL_ALPHABET_SIZE);
+  }
+  if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
+    aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+    aom_write_symbol(w, CFL_IDX_V(idx), cdf_v, CFL_ALPHABET_SIZE);
+  }
 }
 #endif
 
@@ -1715,7 +1736,6 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
 #else
   skip = write_skip(cm, xd, segment_id, mi, w);
 #endif  // CONFIG_SUPERTX
-#if CONFIG_DELTA_Q
   if (cm->delta_q_present_flag) {
     int super_block_upper_left =
         ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0);
@@ -1726,6 +1746,25 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
       write_delta_qindex(cm, xd, reduced_delta_qindex, w);
       xd->prev_qindex = mbmi->current_q_index;
 #if CONFIG_EXT_DELTA_Q
+#if CONFIG_LOOPFILTER_LEVEL
+      if (cm->delta_lf_present_flag) {
+        if (cm->delta_lf_multi) {
+          for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) {
+            int reduced_delta_lflevel =
+                (mbmi->curr_delta_lf[lf_id] - xd->prev_delta_lf[lf_id]) /
+                cm->delta_lf_res;
+            write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w);
+            xd->prev_delta_lf[lf_id] = mbmi->curr_delta_lf[lf_id];
+          }
+        } else {
+          int reduced_delta_lflevel =
+              (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
+              cm->delta_lf_res;
+          write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w);
+          xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
+        }
+      }
+#else
       if (cm->delta_lf_present_flag) {
         int reduced_delta_lflevel =
             (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
@@ -1733,10 +1772,10 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
         write_delta_lflevel(cm, xd, reduced_delta_lflevel, w);
         xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
       }
+#endif  // CONFIG_LOOPFILTER_LEVEL
 #endif  // CONFIG_EXT_DELTA_Q
     }
   }
-#endif
 
 #if CONFIG_SUPERTX
   if (!supertx_enabled)
@@ -1744,14 +1783,10 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
     write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
 
   if (cm->tx_mode == TX_MODE_SELECT &&
-#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_RECT_TX)
-#if CONFIG_RECT_TX
-      bsize > BLOCK_4X4 &&
-#else
+#if CONFIG_CB4X4 && CONFIG_VAR_TX && !CONFIG_RECT_TX
       (bsize >= BLOCK_8X8 || (bsize > BLOCK_4X4 && is_inter)) &&
-#endif  // CONFIG_RECT_TX
 #else
-      bsize >= BLOCK_8X8 &&
+      block_signals_txsize(bsize) &&
 #endif
 #if CONFIG_SUPERTX
       !supertx_enabled &&
@@ -1759,23 +1794,30 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
       !(is_inter && skip) && !xd->lossless[segment_id]) {
 #if CONFIG_VAR_TX
     if (is_inter) {  // This implies skip flag is 0.
-      const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, bsize);
+      const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, bsize, 0);
       const int bh = tx_size_high_unit[max_tx_size];
       const int bw = tx_size_wide_unit[max_tx_size];
       const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
       const int height = block_size_high[bsize] >> tx_size_wide_log2[0];
+      int init_depth =
+          (height != width) ? RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT;
       int idx, idy;
       for (idy = 0; idy < height; idy += bh)
         for (idx = 0; idx < width; idx += bw)
-          write_tx_size_vartx(cm, xd, mbmi, max_tx_size, height != width, idy,
-                              idx, w);
+          write_tx_size_vartx(cm, xd, mbmi, max_tx_size, init_depth, idy, idx,
+                              w);
 #if CONFIG_RECT_TX_EXT
       if (is_quarter_tx_allowed(xd, mbmi, is_inter_block(mbmi)) &&
           quarter_txsize_lookup[bsize] != max_tx_size &&
           (mbmi->tx_size == quarter_txsize_lookup[bsize] ||
            mbmi->tx_size == max_tx_size)) {
+#if CONFIG_NEW_MULTISYMBOL
+        aom_write_symbol(w, mbmi->tx_size != max_tx_size,
+                         cm->fc->quarter_tx_size_cdf, 2);
+#else
         aom_write(w, mbmi->tx_size != max_tx_size,
                   cm->fc->quarter_tx_size_prob);
+#endif
       }
 #endif
     } else {
@@ -1812,7 +1854,7 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
 #endif  // CONFIG_CB4X4
 
 #if CONFIG_CFL
-      if (mbmi->uv_mode == UV_DC_PRED) {
+      if (mbmi->uv_mode == UV_CFL_PRED) {
         write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
       }
 #endif
@@ -1824,10 +1866,8 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
 #if CONFIG_EXT_INTRA
     write_intra_angle_info(xd, ec_ctx, w);
 #endif  // CONFIG_EXT_INTRA
-#if CONFIG_PALETTE
-    if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
+    if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
       write_palette_mode_info(cm, xd, mi, w);
-#endif  // CONFIG_PALETTE
 #if CONFIG_FILTER_INTRA
     if (bsize >= BLOCK_8X8 || unify_bsize)
       write_filter_intra_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
@@ -1836,16 +1876,15 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
     int16_t mode_ctx;
     write_ref_frames(cm, xd, w);
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
     if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
       // NOTE: Handle single ref comp mode
       if (!is_compound)
         aom_write(w, is_inter_singleref_comp_mode(mode),
                   av1_get_inter_mode_prob(cm, xd));
     }
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
 
-#if CONFIG_EXT_INTER
 #if CONFIG_COMPOUND_SINGLEREF
     if (is_compound || is_inter_singleref_comp_mode(mode))
 #else   // !CONFIG_COMPOUND_SINGLEREF
@@ -1853,7 +1892,6 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
 #endif  // CONFIG_COMPOUND_SINGLEREF
       mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
     else
-#endif  // CONFIG_EXT_INTER
 
       mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
                                            mbmi->ref_frame, bsize, -1);
@@ -1861,7 +1899,6 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
     // If segment skip is not enabled code the mode.
     if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
       if (bsize >= BLOCK_8X8 || unify_bsize) {
-#if CONFIG_EXT_INTER
         if (is_inter_compound_mode(mode))
           write_inter_compound_mode(cm, xd, w, mode, mode_ctx);
 #if CONFIG_COMPOUND_SINGLEREF
@@ -1869,18 +1906,13 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
           write_inter_singleref_comp_mode(xd, w, mode, mode_ctx);
 #endif  // CONFIG_COMPOUND_SINGLEREF
         else if (is_inter_singleref_mode(mode))
-#endif  // CONFIG_EXT_INTER
           write_inter_mode(w, mode, ec_ctx, mode_ctx);
 
-#if CONFIG_EXT_INTER
         if (mode == NEWMV || mode == NEW_NEWMV ||
 #if CONFIG_COMPOUND_SINGLEREF
             mbmi->mode == SR_NEW_NEWMV ||
 #endif  // CONFIG_COMPOUND_SINGLEREF
             have_nearmv_in_inter_mode(mode))
-#else   // !CONFIG_EXT_INTER
-        if (mode == NEARMV || mode == NEWMV)
-#endif  // CONFIG_EXT_INTER
           write_drl_idx(ec_ctx, mbmi, mbmi_ext, w);
         else
           assert(mbmi->ref_mv_idx == 0);
@@ -1903,23 +1935,15 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
         for (idx = 0; idx < 2; idx += num_4x4_w) {
           const int j = idy * 2 + idx;
           const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
-#if CONFIG_EXT_INTER
           if (!is_compound)
-#endif  // CONFIG_EXT_INTER
             mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
                                                  mbmi->ref_frame, bsize, j);
-#if CONFIG_EXT_INTER
           if (is_inter_compound_mode(b_mode))
             write_inter_compound_mode(cm, xd, w, b_mode, mode_ctx);
           else if (is_inter_singleref_mode(b_mode))
-#endif  // CONFIG_EXT_INTER
             write_inter_mode(w, b_mode, ec_ctx, mode_ctx);
 
-#if CONFIG_EXT_INTER
           if (b_mode == NEWMV || b_mode == NEW_NEWMV) {
-#else
-          if (b_mode == NEWMV) {
-#endif  // CONFIG_EXT_INTER
             for (ref = 0; ref < 1 + is_compound; ++ref) {
               int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
               int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
@@ -1927,16 +1951,9 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
                                         mbmi->ref_mv_idx);
               nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx];
               av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
-#if CONFIG_EXT_INTER
-                            &mi->bmi[j].ref_mv[ref].as_mv,
-#else
-                            &mi->bmi[j].pred_mv[ref].as_mv,
-#endif  // CONFIG_EXT_INTER
-                            nmvc, allow_hp);
+                            &mi->bmi[j].ref_mv[ref].as_mv, nmvc, allow_hp);
             }
-          }
-#if CONFIG_EXT_INTER
-          else if (b_mode == NEAREST_NEWMV || b_mode == NEAR_NEWMV) {
+          } else if (b_mode == NEAREST_NEWMV || b_mode == NEAR_NEWMV) {
             int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
             int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
                                       mbmi_ext->ref_mv_stack[rf_type], 1,
@@ -1953,15 +1970,10 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
             av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[0].as_mv,
                           &mi->bmi[j].ref_mv[0].as_mv, nmvc, allow_hp);
           }
-#endif  // CONFIG_EXT_INTER
         }
       }
     } else {
-#if CONFIG_EXT_INTER
       if (mode == NEWMV || mode == NEW_NEWMV) {
-#else
-      if (mode == NEWMV) {
-#endif  // CONFIG_EXT_INTER
         int_mv ref_mv;
         for (ref = 0; ref < 1 + is_compound; ++ref) {
           int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
@@ -1973,7 +1985,6 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
           av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
                         allow_hp);
         }
-#if CONFIG_EXT_INTER
       } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
         int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
         int nmv_ctx =
@@ -2008,11 +2019,10 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
         av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc,
                       allow_hp);
 #endif  // CONFIG_COMPOUND_SINGLEREF
-#endif  // CONFIG_EXT_INTER
       }
     }
 
-#if CONFIG_EXT_INTER && CONFIG_INTERINTRA
+#if CONFIG_INTERINTRA
     if (cpi->common.reference_mode != COMPOUND_REFERENCE &&
 #if CONFIG_SUPERTX
         !supertx_enabled &&
@@ -2045,22 +2055,18 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
         }
       }
     }
-#endif  // CONFIG_EXT_INTER && CONFIG_INTERINTRA
+#endif  // CONFIG_INTERINTRA
 
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 #if CONFIG_SUPERTX
     if (!supertx_enabled)
 #endif  // CONFIG_SUPERTX
-#if CONFIG_EXT_INTER
-      if (mbmi->ref_frame[1] != INTRA_FRAME)
-#endif  // CONFIG_EXT_INTER
-        write_motion_mode(cm, xd, mi, w);
+      if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mi, w);
 #if CONFIG_NCOBMC_ADAPT_WEIGHT
     write_ncobmc_mode(xd, mi, w);
 #endif
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 
-#if CONFIG_EXT_INTER
     if (
 #if CONFIG_COMPOUND_SINGLEREF
         is_inter_anyref_comp_mode(mbmi->mode) &&
@@ -2074,10 +2080,16 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
         is_any_masked_compound_used(bsize)) {
 #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
       if (cm->allow_masked_compound) {
-        aom_write_symbol(w, mbmi->interinter_compound_type,
-                         ec_ctx->compound_type_cdf[bsize], COMPOUND_TYPES);
+#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
+        if (!is_interinter_compound_used(COMPOUND_WEDGE, bsize))
+          aom_write_bit(w, mbmi->interinter_compound_type == COMPOUND_AVERAGE);
+        else
+#endif  // CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
+          aom_write_symbol(w, mbmi->interinter_compound_type,
+                           ec_ctx->compound_type_cdf[bsize], COMPOUND_TYPES);
 #if CONFIG_WEDGE
-        if (mbmi->interinter_compound_type == COMPOUND_WEDGE) {
+        if (is_interinter_compound_used(COMPOUND_WEDGE, bsize) &&
+            mbmi->interinter_compound_type == COMPOUND_WEDGE) {
           aom_write_literal(w, mbmi->wedge_index, get_wedge_bits_lookup(bsize));
           aom_write_bit(w, mbmi->wedge_sign);
         }
@@ -2090,7 +2102,6 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
       }
 #endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
     }
-#endif  // CONFIG_EXT_INTER
 
 #if CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION || CONFIG_GLOBAL_MOTION
     write_mb_interp_filter(cpi, xd, w);
@@ -2106,12 +2117,7 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
 #endif  // !CONFIG_TXK_SEL
 }
 
-static void write_mb_modes_kf(AV1_COMMON *cm,
-#if CONFIG_DELTA_Q
-                              MACROBLOCKD *xd,
-#else
-                              const MACROBLOCKD *xd,
-#endif  // CONFIG_DELTA_Q
+static void write_mb_modes_kf(AV1_COMMON *cm, MACROBLOCKD *xd,
 #if CONFIG_INTRABC
                               const MB_MODE_INFO_EXT *mbmi_ext,
 #endif  // CONFIG_INTRABC
@@ -2135,7 +2141,6 @@ static void write_mb_modes_kf(AV1_COMMON *cm,
 
   if (seg->update_map) write_segment_id(w, seg, segp, mbmi->segment_id);
 
-#if CONFIG_DELTA_Q
   const int skip = write_skip(cm, xd, mbmi->segment_id, mi, w);
   if (cm->delta_q_present_flag) {
     int super_block_upper_left =
@@ -2147,6 +2152,25 @@ static void write_mb_modes_kf(AV1_COMMON *cm,
       write_delta_qindex(cm, xd, reduced_delta_qindex, w);
       xd->prev_qindex = mbmi->current_q_index;
 #if CONFIG_EXT_DELTA_Q
+#if CONFIG_LOOPFILTER_LEVEL
+      if (cm->delta_lf_present_flag) {
+        if (cm->delta_lf_multi) {
+          for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) {
+            int reduced_delta_lflevel =
+                (mbmi->curr_delta_lf[lf_id] - xd->prev_delta_lf[lf_id]) /
+                cm->delta_lf_res;
+            write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w);
+            xd->prev_delta_lf[lf_id] = mbmi->curr_delta_lf[lf_id];
+          }
+        } else {
+          int reduced_delta_lflevel =
+              (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
+              cm->delta_lf_res;
+          write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w);
+          xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
+        }
+      }
+#else
       if (cm->delta_lf_present_flag) {
         int reduced_delta_lflevel =
             (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
@@ -2154,29 +2178,19 @@ static void write_mb_modes_kf(AV1_COMMON *cm,
         write_delta_lflevel(cm, xd, reduced_delta_lflevel, w);
         xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
       }
+#endif  // CONFIG_LOOPFILTER_LEVEL
 #endif  // CONFIG_EXT_DELTA_Q
     }
   }
-#else
-  write_skip(cm, xd, mbmi->segment_id, mi, w);
-#endif
 
   int enable_tx_size = cm->tx_mode == TX_MODE_SELECT &&
-#if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_RECT_TX)
-#if CONFIG_RECT_TX
-                       bsize > BLOCK_4X4 &&
-#else
-                       bsize >= BLOCK_8X8 &&
-#endif  // CONFIG_RECT_TX
-#else
-                       bsize >= BLOCK_8X8 &&
-#endif
+                       block_signals_txsize(bsize) &&
                        !xd->lossless[mbmi->segment_id];
 
 #if CONFIG_INTRABC
-  if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools) {
+  if (av1_allow_intrabc(bsize, cm)) {
     int use_intrabc = is_intrabc_block(mbmi);
-    aom_write(w, use_intrabc, ec_ctx->intrabc_prob);
+    aom_write_symbol(w, use_intrabc, ec_ctx->intrabc_cdf, 2);
     if (use_intrabc) {
       assert(mbmi->mode == DC_PRED);
       assert(mbmi->uv_mode == UV_DC_PRED);
@@ -2221,7 +2235,7 @@ static void write_mb_modes_kf(AV1_COMMON *cm,
 #endif  // CONFIG_CB4X4
 
 #if CONFIG_CFL
-    if (mbmi->uv_mode == UV_DC_PRED) {
+    if (mbmi->uv_mode == UV_CFL_PRED) {
       write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
     }
 #endif
@@ -2232,10 +2246,8 @@ static void write_mb_modes_kf(AV1_COMMON *cm,
 #if CONFIG_EXT_INTRA
   write_intra_angle_info(xd, ec_ctx, w);
 #endif  // CONFIG_EXT_INTRA
-#if CONFIG_PALETTE
-  if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
+  if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
     write_palette_mode_info(cm, xd, mi, w);
-#endif  // CONFIG_PALETTE
 #if CONFIG_FILTER_INTRA
   if (bsize >= BLOCK_8X8 || unify_bsize)
     write_filter_intra_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
@@ -2312,7 +2324,7 @@ static void enc_dump_logs(AV1_COMP *cpi, int mi_row, int mi_col) {
   m = xd->mi[0];
   if (is_inter_block(&m->mbmi)) {
 #define FRAME_TO_CHECK 1
-    if (cm->current_video_frame == FRAME_TO_CHECK /* && cm->show_frame == 1*/) {
+    if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) {
       const MB_MODE_INFO *const mbmi = &m->mbmi;
       const BLOCK_SIZE bsize = mbmi->sb_type;
 
@@ -2331,21 +2343,6 @@ static void enc_dump_logs(AV1_COMP *cpi, int mi_row, int mi_col) {
 #endif  // CONFIG_COMPOUND_SINGLEREF
           mv[1].as_int = 0;
       }
-      int interp_ctx[2] = { -1 };
-      int interp_filter[2] = { cm->interp_filter };
-      if (cm->interp_filter == SWITCHABLE) {
-        int dir;
-        for (dir = 0; dir < 2; ++dir) {
-          if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
-              (mbmi->ref_frame[1] > INTRA_FRAME &&
-               has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
-            interp_ctx[dir] = av1_get_pred_context_switchable_interp(xd, dir);
-            interp_filter[dir] = mbmi->interp_filter[dir];
-          } else {
-            interp_filter[dir] = EIGHTTAP_REGULAR;
-          }
-        }
-      }
 
       MACROBLOCK *const x = &cpi->td.mb;
       const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
@@ -2373,13 +2370,11 @@ static void enc_dump_logs(AV1_COMP *cpi, int mi_row, int mi_col) {
           "Frame=%d, (mi_row,mi_col)=(%d,%d), mode=%d, bsize=%d, "
           "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, "
           "ref[1]=%d, motion_mode=%d, inter_mode_ctx=%d, mode_ctx=%d, "
-          "interp_ctx=(%d,%d), interp_filter=(%d,%d), newmv_ctx=%d, "
-          "zeromv_ctx=%d, refmv_ctx=%d\n",
+          "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d\n",
           cm->current_video_frame, mi_row, mi_col, mbmi->mode, bsize,
           cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, mv[1].as_mv.row,
           mv[1].as_mv.col, mbmi->ref_frame[0], mbmi->ref_frame[1],
           mbmi->motion_mode, mbmi_ext->mode_context[ref_frame_type], mode_ctx,
-          interp_ctx[0], interp_ctx[1], interp_filter[0], interp_filter[1],
           newmv_ctx, zeromv_ctx, refmv_ctx);
     }
   }
@@ -2400,7 +2395,7 @@ static void write_mbmi_b(AV1_COMP *cpi, const TileInfo *const tile,
   m = xd->mi[0];
 
   assert(m->mbmi.sb_type <= cm->sb_size ||
-         (m->mbmi.sb_type >= BLOCK_4X16 && m->mbmi.sb_type <= BLOCK_32X8));
+         (m->mbmi.sb_type >= BLOCK_SIZES && m->mbmi.sb_type < BLOCK_SIZES_ALL));
 
   bh = mi_size_high[m->mbmi.sb_type];
   bw = mi_size_wide[m->mbmi.sb_type];
@@ -2431,14 +2426,13 @@ static void write_mbmi_b(AV1_COMP *cpi, const TileInfo *const tile,
     // up if they are scaled. has_subpel_mv_component is in turn needed by
     // write_switchable_interp_filter, which is called by pack_inter_mode_mvs.
     set_ref_ptrs(cm, xd, m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]);
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
     if (!has_second_ref(&m->mbmi) && is_inter_singleref_comp_mode(m->mbmi.mode))
       xd->block_refs[1] = xd->block_refs[0];
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
 #endif  // CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION
 
 #if ENC_MISMATCH_DEBUG
-    // NOTE(zoeliu): For debug
     enc_dump_logs(cpi, mi_row, mi_col);
 #endif  // ENC_MISMATCH_DEBUG
 
@@ -2469,7 +2463,7 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
   xd->mi = cm->mi_grid_visible + mi_offset;
 
   assert(mbmi->sb_type <= cm->sb_size ||
-         (mbmi->sb_type >= BLOCK_4X16 && mbmi->sb_type <= BLOCK_32X8));
+         (mbmi->sb_type >= BLOCK_SIZES && mbmi->sb_type < BLOCK_SIZES_ALL));
 
   bh = mi_size_high[mbmi->sb_type];
   bw = mi_size_wide[mbmi->sb_type];
@@ -2481,7 +2475,8 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
 #endif  // CONFIG_DEPENDENT_HORZTILES
                  cm->mi_rows, cm->mi_cols);
 
-#if CONFIG_PALETTE
+// TODO(anybody) : remove this flag when PVQ supports palette coding tool
+#if !CONFIG_PVQ
   for (plane = 0; plane <= 1; ++plane) {
     const uint8_t palette_size_plane =
         mbmi->palette_mode_info.palette_size[plane];
@@ -2494,11 +2489,13 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
       av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows,
                                &cols);
       assert(*tok < tok_end);
-      pack_palette_tokens(w, tok, palette_size_plane, rows * cols);
+      pack_map_tokens(w, tok, palette_size_plane, rows * cols);
+#if !CONFIG_LV_MAP
       assert(*tok < tok_end + mbmi->skip);
+#endif  // !CONFIG_LV_MAP
     }
   }
-#endif  // CONFIG_PALETTE
+#endif  // !CONFIG_PVQ
 
 #if CONFIG_COEF_INTERLEAVE
   if (!mbmi->skip) {
@@ -2585,7 +2582,9 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
       if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type,
                                xd->plane[plane].subsampling_x,
                                xd->plane[plane].subsampling_y)) {
+#if !CONFIG_LV_MAP
         (*tok)++;
+#endif  // !CONFIG_LV_MAP
         continue;
       }
 #endif
@@ -2620,12 +2619,15 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
       mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high);
 
       if (is_inter_block(mbmi)) {
-        const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize);
+        const TX_SIZE max_tx_size = get_vartx_max_txsize(
+            mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y);
         int block = 0;
         const int step =
             tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
         const int bkw = tx_size_wide_unit[max_tx_size];
         const int bkh = tx_size_high_unit[max_tx_size];
+        assert(bkw <= mu_blocks_wide);
+        assert(bkh <= mu_blocks_high);
         for (row = 0; row < num_4x4_h; row += mu_blocks_high) {
           const int unit_height = AOMMIN(mu_blocks_high + row, num_4x4_h);
           for (col = 0; col < num_4x4_w; col += mu_blocks_wide) {
@@ -2673,7 +2675,15 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
             for (blk_row = row; blk_row < unit_height; blk_row += bkh) {
               for (blk_col = col; blk_col < unit_width; blk_col += bkw) {
 #if !CONFIG_PVQ
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                TX_TYPE tx_type =
+                    av1_get_tx_type(plane ? PLANE_TYPE_UV : PLANE_TYPE_Y, xd,
+                                    blk_row, blk_col, 0, tx);
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
                 pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx,
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                               tx_type, is_inter_block(mbmi),
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
                                &token_stats);
 #else
                 pack_pvq_tokens(w, x, xd, plane, bsize, tx);
@@ -2692,8 +2702,16 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
 #if CONFIG_LV_MAP
       (void)tx;
       av1_write_coeffs_mb(cm, x, w, plane);
-#else   // CONFIG_LV_MAP
-      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
+#else  // CONFIG_LV_MAP
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+      TX_TYPE tx_type = av1_get_tx_type(plane ? PLANE_TYPE_UV : PLANE_TYPE_Y,
+                                        xd, blk_row, blk_col, 0, tx);
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx,
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                     tx_type, is_inter_block(mbmi),
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                     &token_stats);
 #endif  // CONFIG_LV_MAP
 
 #else
@@ -2718,7 +2736,7 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
 #endif  // CONFIG_COEF_INTERLEAVE
 }
 
-#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT)
+#if CONFIG_MOTION_VAR && NC_MODE_INFO
 static void write_tokens_sb(AV1_COMP *cpi, const TileInfo *const tile,
                             aom_writer *w, const TOKENEXTRA **tok,
                             const TOKENEXTRA *const tok_end, int mi_row,
@@ -2765,6 +2783,9 @@ static void write_tokens_sb(AV1_COMP *cpi, const TileInfo *const tile,
                         subsize);
         break;
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION_TYPES_AB
+#error NC_MODE_INFO+MOTION_VAR not yet supported for new HORZ/VERT_AB partitions
+#endif
       case PARTITION_HORZ_A:
         write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
         write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
@@ -2804,7 +2825,8 @@ static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
                supertx_enabled,
 #endif
                mi_row, mi_col);
-#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT)
+
+#if CONFIG_MOTION_VAR && NC_MODE_INFO
   (void)tok;
   (void)tok_end;
 #else
@@ -2829,13 +2851,6 @@ static void write_partition(const AV1_COMMON *const cm,
 #endif
                                                 bsize)
                       : 0;
-#if CONFIG_UNPOISON_PARTITION_CTX
-  const aom_prob *const probs =
-      ctx < PARTITION_CONTEXTS ? cm->fc->partition_prob[ctx] : NULL;
-#else
-  const aom_prob *const probs = cm->fc->partition_prob[ctx];
-#endif
-
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   (void)cm;
 
@@ -2843,19 +2858,26 @@ static void write_partition(const AV1_COMMON *const cm,
 
   if (has_rows && has_cols) {
 #if CONFIG_EXT_PARTITION_TYPES
-    if (bsize <= BLOCK_8X8)
-      aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], PARTITION_TYPES);
-    else
-      aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], EXT_PARTITION_TYPES);
+    const int num_partition_types =
+        (mi_width_log2_lookup[bsize] > mi_width_log2_lookup[BLOCK_8X8])
+            ? EXT_PARTITION_TYPES
+            : PARTITION_TYPES;
 #else
-    aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], PARTITION_TYPES);
-#endif  // CONFIG_EXT_PARTITION_TYPES
+    const int num_partition_types = PARTITION_TYPES;
+#endif
+    aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], num_partition_types);
   } else if (!has_rows && has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
-    aom_write(w, p == PARTITION_SPLIT, probs[1]);
+    assert(bsize > BLOCK_8X8);
+    aom_cdf_prob cdf[2];
+    partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx]);
+    aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2);
   } else if (has_rows && !has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
-    aom_write(w, p == PARTITION_SPLIT, probs[2]);
+    assert(bsize > BLOCK_8X8);
+    aom_cdf_prob cdf[2];
+    partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx]);
+    aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2);
   } else {
     assert(p == PARTITION_SPLIT);
   }
@@ -2885,7 +2907,10 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
 #if CONFIG_EXT_PARTITION_TYPES
   const int quarter_step = mi_size_wide[bsize] / 4;
   int i;
-#endif
+#if CONFIG_EXT_PARTITION_TYPES_AB
+  const int qbs = mi_size_wide[bsize] / 4;
+#endif  // CONFIG_EXT_PARTITION_TYPES_AB
+#endif  // CONFIG_EXT_PARTITION_TYPES
   const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
   const BLOCK_SIZE subsize = get_subsize(bsize, partition);
 #if CONFIG_CB4X4
@@ -2899,7 +2924,6 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
   MB_MODE_INFO *mbmi;
   const int pack_token = !supertx_enabled;
   TX_SIZE supertx_size;
-  int plane;
 #endif
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
@@ -2959,6 +2983,42 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
                                mi_row + hbs, mi_col + hbs, subsize);
         break;
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION_TYPES_AB
+      case PARTITION_HORZ_A:
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row + qbs, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row + hbs, mi_col);
+        break;
+      case PARTITION_HORZ_B:
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row + hbs, mi_col);
+        if (mi_row + 3 * qbs < cm->mi_rows)
+          write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                                mi_row + 3 * qbs, mi_col);
+        break;
+      case PARTITION_VERT_A:
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col + qbs);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col + hbs);
+        break;
+      case PARTITION_VERT_B:
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col + hbs);
+        if (mi_col + 3 * qbs < cm->mi_cols)
+          write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                                mi_row, mi_col + 3 * qbs);
+        break;
+#else
       case PARTITION_HORZ_A:
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                               mi_row, mi_col);
@@ -2991,6 +3051,7 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                               mi_row + hbs, mi_col + hbs);
         break;
+#endif
       case PARTITION_HORZ_4:
         for (i = 0; i < 4; ++i) {
           int this_mi_row = mi_row + i * quarter_step;
@@ -3039,10 +3100,12 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
         !skip) {
       const int eset =
           get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used);
+      const int tx_set_type =
+          get_ext_tx_set_type(supertx_size, bsize, 1, cm->reduced_tx_set_used);
       if (eset > 0) {
-        aom_write_symbol(w, av1_ext_tx_inter_ind[eset][mbmi->tx_type],
+        aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][mbmi->tx_type],
                          ec_ctx->inter_ext_tx_cdf[eset][supertx_size],
-                         ext_tx_cnt_inter[eset]);
+                         av1_num_ext_tx_set[tx_set_type]);
       }
     }
 #else
@@ -3054,7 +3117,11 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
 
     if (!skip) {
       assert(*tok < tok_end);
-      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+        TX_TYPE tx_type = av1_get_tx_type(plane ? PLANE_TYPE_UV : PLANE_TYPE_Y,
+                                          xd, blk_row, blk_col, block, tx_size);
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
         const struct macroblockd_plane *const pd = &xd->plane[plane];
         const int mbmi_txb_size = txsize_to_bsize[mbmi->tx_size];
         const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi_txb_size, pd);
@@ -3073,7 +3140,11 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
         token_stats.cost = 0;
         for (row = 0; row < max_blocks_high; row += stepr)
           for (col = 0; col < max_blocks_wide; col += stepc)
-            pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
+            pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx,
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                           tx_type, is_inter_block(mbmi),
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                           &token_stats);
         assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
         (*tok)++;
       }
@@ -3096,6 +3167,61 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 #endif  // CONFIG_EXT_PARTITION_TYPES
 
+#if CONFIG_LPF_SB
+  // send filter level for each superblock (64x64)
+  if (bsize == cm->sb_size) {
+    if (mi_row == 0 && mi_col == 0) {
+      aom_write_literal(w, cm->mi_grid_visible[0]->mbmi.filt_lvl, 6);
+      cm->mi_grid_visible[0]->mbmi.reuse_sb_lvl = 0;
+      cm->mi_grid_visible[0]->mbmi.delta = 0;
+      cm->mi_grid_visible[0]->mbmi.sign = 0;
+    } else {
+      int prev_mi_row, prev_mi_col;
+      if (mi_col - MAX_MIB_SIZE < 0) {
+        prev_mi_row = mi_row - MAX_MIB_SIZE;
+        prev_mi_col = mi_col;
+      } else {
+        prev_mi_row = mi_row;
+        prev_mi_col = mi_col - MAX_MIB_SIZE;
+      }
+      MB_MODE_INFO *curr_mbmi =
+          &cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi;
+      MB_MODE_INFO *prev_mbmi =
+          &cm->mi_grid_visible[prev_mi_row * cm->mi_stride + prev_mi_col]->mbmi;
+
+      const uint8_t curr_lvl = curr_mbmi->filt_lvl;
+      const uint8_t prev_lvl = prev_mbmi->filt_lvl;
+
+      const int reuse_prev_lvl = curr_lvl == prev_lvl;
+      const int reuse_ctx = prev_mbmi->reuse_sb_lvl;
+      curr_mbmi->reuse_sb_lvl = reuse_prev_lvl;
+      aom_write_symbol(w, reuse_prev_lvl,
+                       xd->tile_ctx->lpf_reuse_cdf[reuse_ctx], 2);
+
+      if (reuse_prev_lvl) {
+        curr_mbmi->delta = 0;
+        curr_mbmi->sign = 0;
+      } else {
+        const unsigned int delta = abs(curr_lvl - prev_lvl) / LPF_STEP;
+        const int delta_ctx = prev_mbmi->delta;
+        curr_mbmi->delta = delta;
+        aom_write_symbol(w, delta, xd->tile_ctx->lpf_delta_cdf[delta_ctx],
+                         DELTA_RANGE);
+
+        if (delta) {
+          const int sign = curr_lvl > prev_lvl;
+          const int sign_ctx = prev_mbmi->sign;
+          curr_mbmi->sign = sign;
+          aom_write_symbol(w, sign,
+                           xd->tile_ctx->lpf_sign_cdf[reuse_ctx][sign_ctx], 2);
+        } else {
+          curr_mbmi->sign = 0;
+        }
+      }
+    }
+  }
+#endif
+
 #if CONFIG_CDEF
   if (bsize == cm->sb_size && cm->cdef_bits != 0 && !cm->all_lossless) {
     int width_step = mi_size_wide[BLOCK_64X64];
@@ -3109,14 +3235,30 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
            width += width_step) {
         if (!sb_all_skip(cm, mi_row + height, mi_col + width))
           aom_write_literal(
-              w, cm->mi_grid_visible[(mi_row + height) * cm->mi_stride +
-                                     (mi_col + width)]
-                     ->mbmi.cdef_strength,
+              w,
+              cm->mi_grid_visible[(mi_row + height) * cm->mi_stride +
+                                  (mi_col + width)]
+                  ->mbmi.cdef_strength,
               cm->cdef_bits);
       }
     }
   }
 #endif
+#if CONFIG_LOOP_RESTORATION
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    int rcol0, rcol1, rrow0, rrow1, nhtiles;
+    if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
+                                           &rcol0, &rcol1, &rrow0, &rrow1,
+                                           &nhtiles)) {
+      for (int rrow = rrow0; rrow < rrow1; ++rrow) {
+        for (int rcol = rcol0; rcol < rcol1; ++rcol) {
+          int rtile_idx = rcol + rrow * nhtiles;
+          loop_restoration_write_sb_coeffs(cm, xd, w, plane, rtile_idx);
+        }
+      }
+    }
+  }
+#endif
 }
 
 static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
@@ -3141,16 +3283,18 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
 #if CONFIG_PVQ
   assert(cpi->td.mb.pvq_q->curr_pos == 0);
 #endif
-#if CONFIG_DELTA_Q
   if (cpi->common.delta_q_present_flag) {
     xd->prev_qindex = cpi->common.base_qindex;
 #if CONFIG_EXT_DELTA_Q
     if (cpi->common.delta_lf_present_flag) {
+#if CONFIG_LOOPFILTER_LEVEL
+      for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id)
+        xd->prev_delta_lf[lf_id] = 0;
+#endif  // CONFIG_LOOPFILTER_LEVEL
       xd->prev_delta_lf_from_base = 0;
     }
 #endif  // CONFIG_EXT_DELTA_Q
   }
-#endif
 
   for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += cm->mib_size) {
     av1_zero_left_context(xd);
@@ -3158,7 +3302,7 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
     for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += cm->mib_size) {
       write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, 0, mi_row, mi_col,
                              cm->sb_size);
-#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT)
+#if CONFIG_MOTION_VAR && NC_MODE_INFO
       write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, cm->sb_size);
 #endif
     }
@@ -3224,8 +3368,9 @@ static void encode_restoration_mode(AV1_COMMON *cm,
   int s = AOMMIN(cm->subsampling_x, cm->subsampling_y);
   if (s && (cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
             cm->rst_info[2].frame_restoration_type != RESTORE_NONE)) {
-    aom_wb_write_bit(wb, cm->rst_info[1].restoration_tilesize !=
-                             cm->rst_info[0].restoration_tilesize);
+    aom_wb_write_bit(wb,
+                     cm->rst_info[1].restoration_tilesize !=
+                         cm->rst_info[0].restoration_tilesize);
     assert(cm->rst_info[1].restoration_tilesize ==
                cm->rst_info[0].restoration_tilesize ||
            cm->rst_info[1].restoration_tilesize ==
@@ -3240,13 +3385,17 @@ static void encode_restoration_mode(AV1_COMMON *cm,
   }
 }
 
-static void write_wiener_filter(WienerInfo *wiener_info,
+static void write_wiener_filter(int wiener_win, WienerInfo *wiener_info,
                                 WienerInfo *ref_wiener_info, aom_writer *wb) {
-  aom_write_primitive_refsubexpfin(
-      wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
-      WIENER_FILT_TAP0_SUBEXP_K,
-      ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
-      wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+  if (wiener_win == WIENER_WIN)
+    aom_write_primitive_refsubexpfin(
+        wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+        WIENER_FILT_TAP0_SUBEXP_K,
+        ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+        wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+  else
+    assert(wiener_info->vfilter[0] == 0 &&
+           wiener_info->vfilter[WIENER_WIN - 1] == 0);
   aom_write_primitive_refsubexpfin(
       wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
       WIENER_FILT_TAP1_SUBEXP_K,
@@ -3257,11 +3406,15 @@ static void write_wiener_filter(WienerInfo *wiener_info,
       WIENER_FILT_TAP2_SUBEXP_K,
       ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
       wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
-  aom_write_primitive_refsubexpfin(
-      wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
-      WIENER_FILT_TAP0_SUBEXP_K,
-      ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
-      wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+  if (wiener_win == WIENER_WIN)
+    aom_write_primitive_refsubexpfin(
+        wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+        WIENER_FILT_TAP0_SUBEXP_K,
+        ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+        wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+  else
+    assert(wiener_info->hfilter[0] == 0 &&
+           wiener_info->hfilter[WIENER_WIN - 1] == 0);
   aom_write_primitive_refsubexpfin(
       wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
       WIENER_FILT_TAP1_SUBEXP_K,
@@ -3290,99 +3443,63 @@ static void write_sgrproj_filter(SgrprojInfo *sgrproj_info,
   memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
 }
 
-static void encode_restoration(AV1_COMMON *cm, aom_writer *wb) {
-  int i, p;
-#if CONFIG_FRAME_SUPERRES
-  const int width = cm->superres_upscaled_width;
-  const int height = cm->superres_upscaled_height;
-#else
-  const int width = cm->width;
-  const int height = cm->height;
-#endif  // CONFIG_FRAME_SUPERRES
-  const int ntiles =
-      av1_get_rest_ntiles(width, height, cm->rst_info[0].restoration_tilesize,
-                          NULL, NULL, NULL, NULL);
-  WienerInfo ref_wiener_info;
-  SgrprojInfo ref_sgrproj_info;
-  set_default_wiener(&ref_wiener_info);
-  set_default_sgrproj(&ref_sgrproj_info);
-  const int ntiles_uv = av1_get_rest_ntiles(
-      ROUND_POWER_OF_TWO(width, cm->subsampling_x),
-      ROUND_POWER_OF_TWO(height, cm->subsampling_y),
-      cm->rst_info[1].restoration_tilesize, NULL, NULL, NULL, NULL);
-  RestorationInfo *rsi = &cm->rst_info[0];
-  if (rsi->frame_restoration_type != RESTORE_NONE) {
-    if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) {
-      // RESTORE_SWITCHABLE
-      for (i = 0; i < ntiles; ++i) {
-        av1_write_token(
-            wb, av1_switchable_restore_tree, cm->fc->switchable_restore_prob,
-            &switchable_restore_encodings[rsi->restoration_type[i]]);
-        if (rsi->restoration_type[i] == RESTORE_WIENER) {
-          write_wiener_filter(&rsi->wiener_info[i], &ref_wiener_info, wb);
-        } else if (rsi->restoration_type[i] == RESTORE_SGRPROJ) {
-          write_sgrproj_filter(&rsi->sgrproj_info[i], &ref_sgrproj_info, wb);
-        }
-      }
-    } else if (rsi->frame_restoration_type == RESTORE_WIENER) {
-      for (i = 0; i < ntiles; ++i) {
-        aom_write(wb, rsi->restoration_type[i] != RESTORE_NONE,
-                  RESTORE_NONE_WIENER_PROB);
-        if (rsi->restoration_type[i] != RESTORE_NONE) {
-          write_wiener_filter(&rsi->wiener_info[i], &ref_wiener_info, wb);
-        }
-      }
-    } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) {
-      for (i = 0; i < ntiles; ++i) {
-        aom_write(wb, rsi->restoration_type[i] != RESTORE_NONE,
-                  RESTORE_NONE_SGRPROJ_PROB);
-        if (rsi->restoration_type[i] != RESTORE_NONE) {
-          write_sgrproj_filter(&rsi->sgrproj_info[i], &ref_sgrproj_info, wb);
-        }
-      }
+static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
+                                             MACROBLOCKD *xd,
+                                             aom_writer *const w, int plane,
+                                             int rtile_idx) {
+  const RestorationInfo *rsi = cm->rst_info + plane;
+  if (rsi->frame_restoration_type == RESTORE_NONE) return;  // nothing coded
+
+  const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
+  WienerInfo *wiener_info = xd->wiener_info + plane;  // coding reference
+  SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane;  // coding reference
+
+  if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) {
+    assert(plane == 0);  // switchable mode is only expected for plane 0
+    av1_write_token(
+        w, av1_switchable_restore_tree, cm->fc->switchable_restore_prob,
+        &switchable_restore_encodings[rsi->restoration_type[rtile_idx]]);
+    if (rsi->restoration_type[rtile_idx] == RESTORE_WIENER) {
+      write_wiener_filter(wiener_win, &rsi->wiener_info[rtile_idx], wiener_info,
+                          w);
+    } else if (rsi->restoration_type[rtile_idx] == RESTORE_SGRPROJ) {
+      write_sgrproj_filter(&rsi->sgrproj_info[rtile_idx], sgrproj_info, w);
     }
-  }
-  for (p = 1; p < MAX_MB_PLANE; ++p) {
-    set_default_wiener(&ref_wiener_info);
-    set_default_sgrproj(&ref_sgrproj_info);
-    rsi = &cm->rst_info[p];
-    if (rsi->frame_restoration_type == RESTORE_WIENER) {
-      for (i = 0; i < ntiles_uv; ++i) {
-        if (ntiles_uv > 1)
-          aom_write(wb, rsi->restoration_type[i] != RESTORE_NONE,
-                    RESTORE_NONE_WIENER_PROB);
-        if (rsi->restoration_type[i] != RESTORE_NONE) {
-          write_wiener_filter(&rsi->wiener_info[i], &ref_wiener_info, wb);
-        }
-      }
-    } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) {
-      for (i = 0; i < ntiles_uv; ++i) {
-        if (ntiles_uv > 1)
-          aom_write(wb, rsi->restoration_type[i] != RESTORE_NONE,
-                    RESTORE_NONE_SGRPROJ_PROB);
-        if (rsi->restoration_type[i] != RESTORE_NONE) {
-          write_sgrproj_filter(&rsi->sgrproj_info[i], &ref_sgrproj_info, wb);
-        }
-      }
-    } else if (rsi->frame_restoration_type != RESTORE_NONE) {
-      assert(0);
+  } else if (rsi->frame_restoration_type == RESTORE_WIENER) {
+    aom_write(w, rsi->restoration_type[rtile_idx] != RESTORE_NONE,
+              RESTORE_NONE_WIENER_PROB);
+    if (rsi->restoration_type[rtile_idx] != RESTORE_NONE) {
+      write_wiener_filter(wiener_win, &rsi->wiener_info[rtile_idx], wiener_info,
+                          w);
+    }
+  } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) {
+    aom_write(w, rsi->restoration_type[rtile_idx] != RESTORE_NONE,
+              RESTORE_NONE_SGRPROJ_PROB);
+    if (rsi->restoration_type[rtile_idx] != RESTORE_NONE) {
+      write_sgrproj_filter(&rsi->sgrproj_info[rtile_idx], sgrproj_info, w);
     }
   }
 }
+
 #endif  // CONFIG_LOOP_RESTORATION
 
 static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
   int i;
   struct loopfilter *lf = &cm->lf;
 
-  // Encode the loop filter level and type
-  aom_wb_write_literal(wb, lf->filter_level, 6);
-#if CONFIG_UV_LVL
-  if (lf->filter_level > 0) {
+// Encode the loop filter level and type
+#if !CONFIG_LPF_SB
+#if CONFIG_LOOPFILTER_LEVEL
+  aom_wb_write_literal(wb, lf->filter_level[0], 6);
+  aom_wb_write_literal(wb, lf->filter_level[1], 6);
+  if (lf->filter_level[0] || lf->filter_level[1]) {
     aom_wb_write_literal(wb, lf->filter_level_u, 6);
     aom_wb_write_literal(wb, lf->filter_level_v, 6);
   }
-#endif
+#else
+  aom_wb_write_literal(wb, lf->filter_level, 6);
+#endif  // CONFIG_LOOPFILTER_LEVEL
+#endif  // CONFIG_LPF_SB
   aom_wb_write_literal(wb, lf->sharpness_level, 3);
 
   // Write out loop filter deltas applied at the MB level based on mode or
@@ -3418,12 +3535,18 @@ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
 #if CONFIG_CDEF
 static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
   int i;
-  aom_wb_write_literal(wb, cm->cdef_dering_damping - 5, 1);
-  aom_wb_write_literal(wb, cm->cdef_clpf_damping - 3, 2);
+#if CONFIG_CDEF_SINGLEPASS
+  aom_wb_write_literal(wb, cm->cdef_pri_damping - 3, 2);
+  assert(cm->cdef_pri_damping == cm->cdef_sec_damping);  // one shared value
+#else
+  aom_wb_write_literal(wb, cm->cdef_pri_damping - 5, 1);
+  aom_wb_write_literal(wb, cm->cdef_sec_damping - 3, 2);
+#endif  // CONFIG_CDEF_SINGLEPASS
   aom_wb_write_literal(wb, cm->cdef_bits, 2);
   for (i = 0; i < cm->nb_cdef_strengths; i++) {
     aom_wb_write_literal(wb, cm->cdef_strengths[i], CDEF_STRENGTH_BITS);
-    aom_wb_write_literal(wb, cm->cdef_uv_strengths[i], CDEF_STRENGTH_BITS);
+    if (cm->subsampling_x == cm->subsampling_y)  // else UV is not written
+      aom_wb_write_literal(wb, cm->cdef_uv_strengths[i], CDEF_STRENGTH_BITS);
   }
 }
 #endif
@@ -3564,6 +3687,72 @@ static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) {
   }
 }
 
+#if CONFIG_MAX_TILE
+
+// Same coding as write_uniform, but emits into the uncompressed header wb.
+static void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, int v) {
+  const int l = get_unsigned_bits(n);
+  const int m = (1 << l) - n;
+  if (l == 0) return;  // Zero-width code: nothing is written.
+  if (v < m) {
+    aom_wb_write_literal(wb, v, l - 1);  // Short form: l - 1 bits.
+  } else {
+    aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1);  // l - 1 bit prefix,
+    aom_wb_write_literal(wb, (v - m) & 1, 1);             // then 1 extra bit.
+  }
+}
+
+static void write_tile_info_max_tile(const AV1_COMMON *const cm,
+                                     struct aom_write_bit_buffer *wb) {
+  int width_mi = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+  int height_mi = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+  int width_sb = width_mi >> MAX_MIB_SIZE_LOG2;
+  int height_sb = height_mi >> MAX_MIB_SIZE_LOG2;
+  int size_sb, i;
+
+  aom_wb_write_bit(wb, cm->uniform_tile_spacing_flag);
+
+  if (cm->uniform_tile_spacing_flag) {
+    // Uniformly spaced tiles with a power-of-two number of rows and columns.
+    // columns: one 1-bit per log2 step above the minimum, 0-bit if below max
+    int ones = cm->log2_tile_cols - cm->min_log2_tile_cols;
+    while (ones--) {
+      aom_wb_write_bit(wb, 1);
+    }
+    if (cm->log2_tile_cols < cm->max_log2_tile_cols) {
+      aom_wb_write_bit(wb, 0);
+    }
+
+    // rows: coded the same way as the columns above
+    ones = cm->log2_tile_rows - cm->min_log2_tile_rows;
+    while (ones--) {
+      aom_wb_write_bit(wb, 1);
+    }
+    if (cm->log2_tile_rows < cm->max_log2_tile_rows) {
+      aom_wb_write_bit(wb, 0);
+    }
+  } else {
+    // Explicit tiles with configurable tile widths and heights
+    // columns: each tile's (size_sb - 1); the range shrinks as width_sb drops
+    for (i = 0; i < cm->tile_cols; i++) {
+      size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i];
+      wb_write_uniform(wb, AOMMIN(width_sb, MAX_TILE_WIDTH_SB), size_sb - 1);
+      width_sb -= size_sb;
+    }
+    assert(width_sb == 0);  // the tile widths must sum to the full width
+
+    // rows: same scheme, with the range capped by cm->max_tile_height_sb
+    for (i = 0; i < cm->tile_rows; i++) {
+      size_sb = cm->tile_row_start_sb[i + 1] - cm->tile_row_start_sb[i];
+      wb_write_uniform(wb, AOMMIN(height_sb, cm->max_tile_height_sb),
+                       size_sb - 1);
+      height_sb -= size_sb;
+    }
+    assert(height_sb == 0);  // the tile heights must sum to the full height
+  }
+}
+#endif
+
 static void write_tile_info(const AV1_COMMON *const cm,
                             struct aom_write_bit_buffer *wb) {
 #if CONFIG_EXT_TILE
@@ -3596,20 +3785,25 @@ static void write_tile_info(const AV1_COMMON *const cm,
 #endif  // CONFIG_EXT_PARTITION
   } else {
 #endif  // CONFIG_EXT_TILE
-    int min_log2_tile_cols, max_log2_tile_cols, ones;
-    av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
 
-    // columns
-    ones = cm->log2_tile_cols - min_log2_tile_cols;
-    while (ones--) aom_wb_write_bit(wb, 1);
+#if CONFIG_MAX_TILE
+    write_tile_info_max_tile(cm, wb);
+#else
+  int min_log2_tile_cols, max_log2_tile_cols, ones;
+  av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
 
-    if (cm->log2_tile_cols < max_log2_tile_cols) aom_wb_write_bit(wb, 0);
+  // columns
+  ones = cm->log2_tile_cols - min_log2_tile_cols;
+  while (ones--) aom_wb_write_bit(wb, 1);
 
-    // rows
-    aom_wb_write_bit(wb, cm->log2_tile_rows != 0);
-    if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->log2_tile_rows != 1);
+  if (cm->log2_tile_cols < max_log2_tile_cols) aom_wb_write_bit(wb, 0);
+
+  // rows
+  aom_wb_write_bit(wb, cm->log2_tile_rows != 0);
+  if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->log2_tile_rows != 1);
+#endif
 #if CONFIG_DEPENDENT_HORZTILES
-    if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->dependent_horz_tiles);
+    if (cm->tile_rows > 1) aom_wb_write_bit(wb, cm->dependent_horz_tiles);
 #endif
 #if CONFIG_EXT_TILE
   }
@@ -3620,10 +3814,30 @@ static void write_tile_info(const AV1_COMMON *const cm,
 #endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
 }
 
-static int get_refresh_mask(AV1_COMP *cpi) {
+#if CONFIG_EXT_REFS
+#if USE_GF16_MULTI_LAYER
+static int get_refresh_mask_gf16(AV1_COMP *cpi) {
   int refresh_mask = 0;
 
+  if (cpi->refresh_last_frame || cpi->refresh_golden_frame ||
+      cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame ||
+      cpi->refresh_alt_ref_frame) {
+    assert(cpi->refresh_fb_idx >= 0 && cpi->refresh_fb_idx < REF_FRAMES);
+    refresh_mask |= (1 << cpi->refresh_fb_idx);  // only this one slot is set
+  }
+
+  return refresh_mask;
+}
+#endif  // USE_GF16_MULTI_LAYER
+#endif  // CONFIG_EXT_REFS
+
+static int get_refresh_mask(AV1_COMP *cpi) {
+  int refresh_mask = 0;
 #if CONFIG_EXT_REFS
+#if USE_GF16_MULTI_LAYER
+  if (cpi->rc.baseline_gf_interval == 16) return get_refresh_mask_gf16(cpi);
+#endif  // USE_GF16_MULTI_LAYER
+
   // NOTE(zoeliu): When LAST_FRAME is to get refreshed, the decoder will be
   // notified to get LAST3_FRAME refreshed and then the virtual indexes for all
   // the 3 LAST reference frames will be updated accordingly, i.e.:
@@ -3634,13 +3848,10 @@ static int get_refresh_mask(AV1_COMP *cpi) {
   //     LAST3_FRAME.
   refresh_mask |=
       (cpi->refresh_last_frame << cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]);
-  if (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs) {
-    // We have swapped the virtual indices
-    refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->arf_map[0]);
-  } else {
-    refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->bwd_fb_idx);
-  }
-#else
+
+  refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->bwd_fb_idx);
+  refresh_mask |= (cpi->refresh_alt2_ref_frame << cpi->alt2_fb_idx);
+#else   // !CONFIG_EXT_REFS
   refresh_mask |= (cpi->refresh_last_frame << cpi->lst_fb_idx);
 #endif  // CONFIG_EXT_REFS
 
@@ -3658,9 +3869,8 @@ static int get_refresh_mask(AV1_COMP *cpi) {
     return refresh_mask | (cpi->refresh_golden_frame << cpi->alt_fb_idx);
   } else {
 #if CONFIG_EXT_REFS
-    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-    int arf_idx = cpi->arf_map[gf_group->arf_update_idx[gf_group->index]];
-#else
+    const int arf_idx = cpi->alt_fb_idx;
+#else   // !CONFIG_EXT_REFS
     int arf_idx = cpi->alt_fb_idx;
     if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
       const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
@@ -3725,15 +3935,12 @@ static INLINE int find_identical_tile(
 }
 #endif  // CONFIG_EXT_TILE
 
+#if !CONFIG_OBU || CONFIG_EXT_TILE
 static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
                             unsigned int *max_tile_size,
                             unsigned int *max_tile_col_size) {
   const AV1_COMMON *const cm = &cpi->common;
-#if CONFIG_ANS
-  struct BufAnsCoder *buf_ans = &cpi->buf_ans;
-#else
   aom_writer mode_bc;
-#endif  // CONFIG_ANS
   int tile_row, tile_col;
   TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok;
   TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers;
@@ -3744,7 +3951,7 @@ static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
   const int have_tiles = tile_cols * tile_rows > 1;
   struct aom_write_bit_buffer wb = { dst, 0 };
   const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
-  uint32_t comp_hdr_size;
+  uint32_t compressed_hdr_size;
   // Fixed size tile groups for the moment
   const int num_tg_hdrs = cm->num_tg;
   const int tg_size =
@@ -3759,7 +3966,6 @@ static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
   int tile_size_bytes = 4;
   int tile_col_size_bytes;
   uint32_t uncompressed_hdr_size = 0;
-  struct aom_write_bit_buffer comp_hdr_len_wb;
   struct aom_write_bit_buffer tg_params_wb;
   struct aom_write_bit_buffer tile_size_bytes_wb;
   uint32_t saved_offset;
@@ -3806,19 +4012,14 @@ static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
         cpi->td.mb.pvq_q = &this_tile->pvq_q;
         cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
 #endif  // CONFIG_PVQ
-#if !CONFIG_ANS
+#if CONFIG_ANS
+        mode_bc.size = 1 << cpi->common.ans_window_size_log2;
+#endif
         aom_start_encode(&mode_bc, buf->data + data_offset);
         write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
         assert(tok == tok_end);
         aom_stop_encode(&mode_bc);
         tile_size = mode_bc.pos;
-#else
-        buf_ans_write_init(buf_ans, buf->data + data_offset);
-        write_modes(cpi, &tile_info, buf_ans, &tok, tok_end);
-        assert(tok == tok_end);
-        aom_buf_ans_flush(buf_ans);
-        tile_size = buf_ans_write_end(buf_ans);
-#endif  // !CONFIG_ANS
 #if CONFIG_PVQ
         cpi->td.mb.pvq_q = NULL;
 #endif
@@ -3866,7 +4067,7 @@ static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
     }
   } else {
 #endif  // CONFIG_EXT_TILE
-    write_uncompressed_header(cpi, &wb);
+    write_uncompressed_header_frame(cpi, &wb);
 
 #if CONFIG_EXT_REFS
     if (cm->show_existing_frame) {
@@ -3887,14 +4088,22 @@ static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
       aom_wb_overwrite_literal(&wb, (1 << n_log2_tiles) - 1, n_log2_tiles);
     }
 
-    /* Write a placeholder for the compressed header length */
-    comp_hdr_len_wb = wb;
-    aom_wb_write_literal(&wb, 0, 16);
+    if (!use_compressed_header(cm)) {
+      uncompressed_hdr_size = aom_wb_bytes_written(&wb);
+      compressed_hdr_size = 0;
+    } else {
+      /* Write a placeholder for the compressed header length */
+      struct aom_write_bit_buffer comp_hdr_len_wb = wb;
+      aom_wb_write_literal(&wb, 0, 16);
+
+      uncompressed_hdr_size = aom_wb_bytes_written(&wb);
+      compressed_hdr_size =
+          write_compressed_header(cpi, dst + uncompressed_hdr_size);
+      aom_wb_overwrite_literal(&comp_hdr_len_wb, (int)(compressed_hdr_size),
+                               16);
+    }
 
-    uncompressed_hdr_size = aom_wb_bytes_written(&wb);
-    comp_hdr_size = write_compressed_header(cpi, dst + uncompressed_hdr_size);
-    aom_wb_overwrite_literal(&comp_hdr_len_wb, (int)(comp_hdr_size), 16);
-    hdr_size = uncompressed_hdr_size + comp_hdr_size;
+    hdr_size = uncompressed_hdr_size + compressed_hdr_size;
     total_size += hdr_size;
 
     for (tile_row = 0; tile_row < tile_rows; tile_row++) {
@@ -3938,7 +4147,7 @@ static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
             // Copy compressed header
             memmove(dst + old_total_size + uncompressed_hdr_size,
                     dst + uncompressed_hdr_size,
-                    comp_hdr_size * sizeof(uint8_t));
+                    compressed_hdr_size * sizeof(uint8_t));
             total_size += hdr_size;
             tile_count = 1;
             curr_tg_data_size = hdr_size + tile_size + 4;
@@ -3957,7 +4166,7 @@ static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
             // Copy compressed header
             memmove(dst + total_size + uncompressed_hdr_size,
                     dst + uncompressed_hdr_size,
-                    comp_hdr_size * sizeof(uint8_t));
+                    compressed_hdr_size * sizeof(uint8_t));
             total_size += hdr_size;
             tile_count = 0;
             curr_tg_data_size = hdr_size;
@@ -3982,22 +4191,24 @@ static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
         cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
 #endif  // CONFIG_PVQ
 #if CONFIG_ANS
-        buf_ans_write_init(buf_ans, dst + total_size);
-        write_modes(cpi, &tile_info, buf_ans, &tok, tok_end);
-        assert(tok == tok_end);
-        aom_buf_ans_flush(buf_ans);
-        tile_size = buf_ans_write_end(buf_ans);
-#else
-      aom_start_encode(&mode_bc, dst + total_size);
-      write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
+        mode_bc.size = 1 << cpi->common.ans_window_size_log2;
+#endif  // CONFIG_ANS
+#if CONFIG_LOOP_RESTORATION
+        for (int p = 0; p < MAX_MB_PLANE; ++p) {
+          set_default_wiener(cpi->td.mb.e_mbd.wiener_info + p);
+          set_default_sgrproj(cpi->td.mb.e_mbd.sgrproj_info + p);
+        }
+#endif  // CONFIG_LOOP_RESTORATION
+
+        aom_start_encode(&mode_bc, dst + total_size);
+        write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
 #if !CONFIG_LV_MAP
 #if !CONFIG_PVQ
-      assert(tok == tok_end);
+        assert(tok == tok_end);
 #endif  // !CONFIG_PVQ
 #endif  // !CONFIG_LV_MAP
-      aom_stop_encode(&mode_bc);
-      tile_size = mode_bc.pos;
-#endif  // CONFIG_ANS
+        aom_stop_encode(&mode_bc);
+        tile_size = mode_bc.pos;
 #if CONFIG_PVQ
         cpi->td.mb.pvq_q = NULL;
 #endif
@@ -4018,18 +4229,20 @@ static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
     }
     // Write the final tile group size
     if (n_log2_tiles) {
-      aom_wb_overwrite_literal(&tg_params_wb, (1 << n_log2_tiles) - tile_count,
-                               n_log2_tiles);
+      aom_wb_overwrite_literal(
+          &tg_params_wb, (tile_cols * tile_rows) - tile_count, n_log2_tiles);
       aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, n_log2_tiles);
     }
     // Remux if possible. TODO (Thomas Davies): do this for more than one tile
     // group
     if (have_tiles && tg_count == 1) {
-      int data_size = total_size - (uncompressed_hdr_size + comp_hdr_size);
-      data_size = remux_tiles(cm, dst + uncompressed_hdr_size + comp_hdr_size,
-                              data_size, *max_tile_size, *max_tile_col_size,
-                              &tile_size_bytes, &tile_col_size_bytes);
-      total_size = data_size + uncompressed_hdr_size + comp_hdr_size;
+      int data_size =
+          total_size - (uncompressed_hdr_size + compressed_hdr_size);
+      data_size =
+          remux_tiles(cm, dst + uncompressed_hdr_size + compressed_hdr_size,
+                      data_size, *max_tile_size, *max_tile_col_size,
+                      &tile_size_bytes, &tile_col_size_bytes);
+      total_size = data_size + uncompressed_hdr_size + compressed_hdr_size;
       aom_wb_overwrite_literal(&tile_size_bytes_wb, tile_size_bytes - 1, 2);
     }
 
@@ -4038,6 +4251,7 @@ static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
 #endif  // CONFIG_EXT_TILE
   return (uint32_t)total_size;
 }
+#endif
 
 static void write_render_size(const AV1_COMMON *cm,
                               struct aom_write_bit_buffer *wb) {
@@ -4053,12 +4267,12 @@ static void write_render_size(const AV1_COMMON *cm,
 static void write_superres_scale(const AV1_COMMON *const cm,
                                  struct aom_write_bit_buffer *wb) {
   // First bit is whether to to scale or not
-  if (cm->superres_scale_numerator == SCALE_DENOMINATOR) {
+  if (cm->superres_scale_denominator == SCALE_NUMERATOR) {
     aom_wb_write_bit(wb, 0);  // no scaling
   } else {
     aom_wb_write_bit(wb, 1);  // scaling, write scale factor
     aom_wb_write_literal(
-        wb, cm->superres_scale_numerator - SUPERRES_SCALE_NUMERATOR_MIN,
+        wb, cm->superres_scale_denominator - SUPERRES_SCALE_DENOMINATOR_MIN,
         SUPERRES_SCALE_BITS);
   }
 }
@@ -4109,12 +4323,6 @@ static void write_frame_size_with_refs(AV1_COMP *cpi,
   if (!found) write_frame_size(cm, wb);
 }
 
-static void write_sync_code(struct aom_write_bit_buffer *wb) {
-  aom_wb_write_literal(wb, AV1_SYNC_CODE_0, 8);
-  aom_wb_write_literal(wb, AV1_SYNC_CODE_1, 8);
-  aom_wb_write_literal(wb, AV1_SYNC_CODE_2, 8);
-}
-
 static void write_profile(BITSTREAM_PROFILE profile,
                           struct aom_write_bit_buffer *wb) {
   switch (profile) {
@@ -4161,11 +4369,9 @@ static void write_bitdepth_colorspace_sampling(
 }
 
 #if CONFIG_REFERENCE_BUFFER
-void write_sequence_header(
-#if CONFIG_EXT_TILE
-    AV1_COMMON *const cm,
-#endif  // CONFIG_EXT_TILE
-    SequenceHeader *seq_params) {
+void write_sequence_header(AV1_COMMON *const cm,
+                           struct aom_write_bit_buffer *wb) {
+  SequenceHeader *seq_params = &cm->seq_params;
   /* Placeholder for actually writing to the bitstream */
   seq_params->frame_id_numbers_present_flag =
 #if CONFIG_EXT_TILE
@@ -4174,10 +4380,29 @@ void write_sequence_header(
                            FRAME_ID_NUMBERS_PRESENT_FLAG;
   seq_params->frame_id_length_minus7 = FRAME_ID_LENGTH_MINUS7;
   seq_params->delta_frame_id_length_minus2 = DELTA_FRAME_ID_LENGTH_MINUS2;
+
+  aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag);
+  if (seq_params->frame_id_numbers_present_flag) {
+    aom_wb_write_literal(wb, seq_params->frame_id_length_minus7, 4);
+    aom_wb_write_literal(wb, seq_params->delta_frame_id_length_minus2, 4);
+  }
+}
+#endif  // CONFIG_REFERENCE_BUFFER
+
+static void write_sb_size(const AV1_COMMON *cm,
+                          struct aom_write_bit_buffer *wb) {
+  (void)cm;  // only read by asserts when !CONFIG_EXT_PARTITION
+  (void)wb;  // not written to when !CONFIG_EXT_PARTITION
+  assert(cm->mib_size == mi_size_wide[cm->sb_size]);
+  assert(cm->mib_size == 1 << cm->mib_size_log2);
+#if CONFIG_EXT_PARTITION
+  assert(cm->sb_size == BLOCK_128X128 || cm->sb_size == BLOCK_64X64);
+  aom_wb_write_bit(wb, cm->sb_size == BLOCK_128X128 ? 1 : 0);
+#else   // only 64x64 is accepted, so no bit is written
+  assert(cm->sb_size == BLOCK_64X64);
+#endif  // CONFIG_EXT_PARTITION
 }
-#endif
 
-#if CONFIG_EXT_INTER
 static void write_compound_tools(const AV1_COMMON *cm,
                                  struct aom_write_bit_buffer *wb) {
   (void)cm;
@@ -4201,22 +4426,129 @@ static void write_compound_tools(const AV1_COMMON *cm,
   }
 #endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
 }
-#endif  // CONFIG_EXT_INTER
 
-static void write_uncompressed_header(AV1_COMP *cpi,
-                                      struct aom_write_bit_buffer *wb) {
-  AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+#if CONFIG_GLOBAL_MOTION
+static void write_global_motion_params(const WarpedMotionParams *params,
+                                       const WarpedMotionParams *ref_params,
+                                       struct aom_write_bit_buffer *wb,
+                                       int allow_hp) {
+  TransformationType type = params->wmtype;
+  int trans_bits;
+  int trans_prec_diff;
 
-#if CONFIG_REFERENCE_BUFFER
-  /* TODO: Move outside frame loop or inside key-frame branch */
-  write_sequence_header(
-#if CONFIG_EXT_TILE
-      cm,
-#endif  // CONFIG_EXT_TILE
-      &cpi->seq_params);
+  aom_wb_write_bit(wb, type != IDENTITY);
+  if (type != IDENTITY) {
+#if GLOBAL_TRANS_TYPES > 4
+    aom_wb_write_literal(wb, type - 1, GLOBAL_TYPE_BITS);
+#else
+    aom_wb_write_bit(wb, type == ROTZOOM);
+    if (type != ROTZOOM) aom_wb_write_bit(wb, type == TRANSLATION);
+#endif  // GLOBAL_TRANS_TYPES > 4
+  }
+
+  switch (type) {
+    case HOMOGRAPHY:
+    case HORTRAPEZOID:
+    case VERTRAPEZOID:
+      if (type != HORTRAPEZOID)
+        aom_wb_write_signed_primitive_refsubexpfin(
+            wb, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
+            (ref_params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF),
+            (params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF));
+      if (type != VERTRAPEZOID)
+        aom_wb_write_signed_primitive_refsubexpfin(
+            wb, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
+            (ref_params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF),
+            (params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF));
+    // fallthrough intended: these models also code the affine terms below
+    case AFFINE:
+    case ROTZOOM:
+      aom_wb_write_signed_primitive_refsubexpfin(
+          wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+          (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
+              (1 << GM_ALPHA_PREC_BITS),
+          (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+      if (type != VERTRAPEZOID)
+        aom_wb_write_signed_primitive_refsubexpfin(
+            wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+            (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+            (params->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+      if (type >= AFFINE) {
+        if (type != HORTRAPEZOID)
+          aom_wb_write_signed_primitive_refsubexpfin(
+              wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+              (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+              (params->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+        aom_wb_write_signed_primitive_refsubexpfin(
+            wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+            (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+                (1 << GM_ALPHA_PREC_BITS),
+            (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+                (1 << GM_ALPHA_PREC_BITS));
+      }
+    // fallthrough intended: every non-identity model codes the translation
+    case TRANSLATION:
+      trans_bits = (type == TRANSLATION) ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+                                         : GM_ABS_TRANS_BITS;
+      trans_prec_diff = (type == TRANSLATION)
+                            ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+                            : GM_TRANS_PREC_DIFF;
+      aom_wb_write_signed_primitive_refsubexpfin(
+          wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+          (ref_params->wmmat[0] >> trans_prec_diff),
+          (params->wmmat[0] >> trans_prec_diff));
+      aom_wb_write_signed_primitive_refsubexpfin(
+          wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+          (ref_params->wmmat[1] >> trans_prec_diff),
+          (params->wmmat[1] >> trans_prec_diff));
+      break;
+    case IDENTITY: break;
+    default: assert(0);
+  }
+}
+
+static void write_global_motion(AV1_COMP *cpi,
+                                struct aom_write_bit_buffer *wb) {
+  AV1_COMMON *const cm = &cpi->common;
+  int frame;
+  for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+    const WarpedMotionParams *ref_params =  // reference the params code against
+        cm->error_resilient_mode ? &default_warp_params
+                                 : &cm->prev_frame->global_motion[frame];
+    write_global_motion_params(&cm->global_motion[frame], ref_params, wb,
+                               cm->allow_high_precision_mv);
+    // TODO(sarahparker, debargha): The logic in the commented-out code below
+    // currently does not work and causes mismatches when resize is on.
+    // Fix it before turning the optimization back on.
+    /* Disabled: write params only when source and reference sizes match.
+    YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_buffer(cpi, frame);
+    if (cpi->source->y_crop_width == ref_buf->y_crop_width &&
+        cpi->source->y_crop_height == ref_buf->y_crop_height) {
+      write_global_motion_params(&cm->global_motion[frame],
+                                 &cm->prev_frame->global_motion[frame], wb,
+                                 cm->allow_high_precision_mv);
+    } else {
+      assert(cm->global_motion[frame].wmtype == IDENTITY &&
+             "Invalid warp type for frames of different resolutions");
+    }
+    */
+    /* Disabled debug trace of wmmat[0..3] for each reference frame.
+    printf("Frame %d/%d: Enc Ref %d: %d %d %d %d\n",
+           cm->current_video_frame, cm->show_frame, frame,
+           cm->global_motion[frame].wmmat[0],
+           cm->global_motion[frame].wmmat[1], cm->global_motion[frame].wmmat[2],
+           cm->global_motion[frame].wmmat[3]);
+           */
+  }
+}
 #endif
 
+#if !CONFIG_OBU
+static void write_uncompressed_header_frame(AV1_COMP *cpi,
+                                            struct aom_write_bit_buffer *wb) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
   aom_wb_write_literal(wb, AOM_FRAME_MARKER, 2);
 
   write_profile(cm->profile, wb);
@@ -4244,8 +4576,8 @@ static void write_uncompressed_header(AV1_COMP *cpi,
     aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
 
 #if CONFIG_REFERENCE_BUFFER
-    if (cpi->seq_params.frame_id_numbers_present_flag) {
-      int frame_id_len = cpi->seq_params.frame_id_length_minus7 + 7;
+    if (cm->seq_params.frame_id_numbers_present_flag) {
+      int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7;
       int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
       aom_wb_write_literal(wb, display_frame_id, frame_id_len);
       /* Add a zero byte to prevent emulation of superframe marker */
@@ -4253,7 +4585,7 @@ static void write_uncompressed_header(AV1_COMP *cpi,
       /* Consider to have this logic only one place */
       aom_wb_write_literal(wb, 0, 8);
     }
-#endif
+#endif  // CONFIG_REFERENCE_BUFFER
 
     return;
   } else {
@@ -4265,33 +4597,46 @@ static void write_uncompressed_header(AV1_COMP *cpi,
 
   aom_wb_write_bit(wb, cm->frame_type);
   aom_wb_write_bit(wb, cm->show_frame);
+  if (cm->frame_type != KEY_FRAME)
+    if (!cm->show_frame) aom_wb_write_bit(wb, cm->intra_only);
   aom_wb_write_bit(wb, cm->error_resilient_mode);
 
+  if (frame_is_intra_only(cm)) {
+#if CONFIG_REFERENCE_BUFFER
+    write_sequence_header(cm, wb);
+#endif  // CONFIG_REFERENCE_BUFFER
+  }
 #if CONFIG_REFERENCE_BUFFER
   cm->invalid_delta_frame_id_minus1 = 0;
-  if (cpi->seq_params.frame_id_numbers_present_flag) {
-    int frame_id_len = cpi->seq_params.frame_id_length_minus7 + 7;
+  if (cm->seq_params.frame_id_numbers_present_flag) {
+    int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7;
     aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
   }
-#endif
-
+#endif  // CONFIG_REFERENCE_BUFFER
   if (cm->frame_type == KEY_FRAME) {
-    write_sync_code(wb);
     write_bitdepth_colorspace_sampling(cm, wb);
     write_frame_size(cm, wb);
+    write_sb_size(cm, wb);
+
 #if CONFIG_ANS && ANS_MAX_SYMBOLS
     assert(cpi->common.ans_window_size_log2 >= 8);
     assert(cpi->common.ans_window_size_log2 < 24);
     aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4);
 #endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
-#if CONFIG_PALETTE || CONFIG_INTRABC
     aom_wb_write_bit(wb, cm->allow_screen_content_tools);
-#endif  // CONFIG_PALETTE || CONFIG_INTRABC
+#if CONFIG_AMVR
+    if (cm->allow_screen_content_tools) {
+      if (cm->seq_mv_precision_level == 2) {
+        aom_wb_write_bit(wb, 1);
+      } else {
+        aom_wb_write_bit(wb, 0);
+        aom_wb_write_bit(wb, cm->seq_mv_precision_level == 0);
+      }
+    }
+#endif
   } else {
-    if (!cm->show_frame) aom_wb_write_bit(wb, cm->intra_only);
-#if CONFIG_PALETTE || CONFIG_INTRABC
     if (cm->intra_only) aom_wb_write_bit(wb, cm->allow_screen_content_tools);
-#endif  // CONFIG_PALETTE || CONFIG_INTRABC
+#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
     if (!cm->error_resilient_mode) {
       if (cm->intra_only) {
         aom_wb_write_bit(wb,
@@ -4304,13 +4649,12 @@ static void write_uncompressed_header(AV1_COMP *cpi,
                            cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
       }
     }
-
+#endif
 #if CONFIG_EXT_REFS
     cpi->refresh_frame_mask = get_refresh_mask(cpi);
 #endif  // CONFIG_EXT_REFS
 
     if (cm->intra_only) {
-      write_sync_code(wb);
       write_bitdepth_colorspace_sampling(cm, wb);
 
 #if CONFIG_EXT_REFS
@@ -4346,12 +4690,14 @@ static void write_uncompressed_header(AV1_COMP *cpi,
         assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
         aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
                              REF_FRAMES_LOG2);
+#if !CONFIG_FRAME_SIGN_BIAS
         aom_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
+#endif  // !CONFIG_FRAME_SIGN_BIAS
 #if CONFIG_REFERENCE_BUFFER
-        if (cpi->seq_params.frame_id_numbers_present_flag) {
+        if (cm->seq_params.frame_id_numbers_present_flag) {
           int i = get_ref_frame_map_idx(cpi, ref_frame);
-          int frame_id_len = cpi->seq_params.frame_id_length_minus7 + 7;
-          int diff_len = cpi->seq_params.delta_frame_id_length_minus2 + 2;
+          int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7;
+          int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2;
           int delta_frame_id_minus1 =
               ((cm->current_frame_id - cm->ref_frame_id[i] +
                 (1 << frame_id_len)) %
@@ -4362,8 +4708,24 @@ static void write_uncompressed_header(AV1_COMP *cpi,
             cm->invalid_delta_frame_id_minus1 = 1;
           aom_wb_write_literal(wb, delta_frame_id_minus1, diff_len);
         }
-#endif
+#endif  // CONFIG_REFERENCE_BUFFER
+      }
+
+#if CONFIG_FRAME_SIGN_BIAS
+#define FRAME_SIGN_BIAS_DEBUG 0
+#if FRAME_SIGN_BIAS_DEBUG
+      {
+        printf("\n\nENCODER: Frame=%d, show_frame=%d:", cm->current_video_frame,
+               cm->show_frame);
+        for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+          printf(" sign_bias[%d]=%d", ref_frame,
+                 cm->ref_frame_sign_bias[ref_frame]);
+        }
+        printf("\n");
       }
+#endif  // FRAME_SIGN_BIAS_DEBUG
+#undef FRAME_SIGN_BIAS_DEBUG
+#endif  // CONFIG_FRAME_SIGN_BIAS
 
 #if CONFIG_FRAME_SIZE
       if (cm->error_resilient_mode == 0) {
@@ -4375,42 +4737,55 @@ static void write_uncompressed_header(AV1_COMP *cpi,
       write_frame_size_with_refs(cpi, wb);
 #endif
 
+#if CONFIG_AMVR
+      if (cm->seq_mv_precision_level == 2) {
+        aom_wb_write_bit(wb, cm->cur_frame_mv_precision_level == 0);
+      }
+#endif
       aom_wb_write_bit(wb, cm->allow_high_precision_mv);
 
       fix_interp_filter(cm, cpi->td.counts);
       write_frame_interp_filter(cm->interp_filter, wb);
 #if CONFIG_TEMPMV_SIGNALING
-      if (!cm->error_resilient_mode) {
+      if (frame_might_use_prev_frame_mvs(cm)) {
         aom_wb_write_bit(wb, cm->use_prev_frame_mvs);
       }
 #endif
     }
   }
 
-#if CONFIG_REFERENCE_BUFFER
-  cm->refresh_mask = cm->frame_type == KEY_FRAME ? 0xFF : get_refresh_mask(cpi);
+#if CONFIG_FRAME_MARKER
+  if (cm->show_frame == 0) {
+    int arf_offset = AOMMIN(
+        (MAX_GF_INTERVAL - 1),
+        cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]);
+#if CONFIG_EXT_REFS
+    int brf_offset =
+        cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index];
+
+    arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset);
+#endif
+    aom_wb_write_literal(wb, arf_offset, 4);
+  }
 #endif
 
+#if CONFIG_REFERENCE_BUFFER
+  if (cm->seq_params.frame_id_numbers_present_flag) {
+    cm->refresh_mask =
+        cm->frame_type == KEY_FRAME ? 0xFF : get_refresh_mask(cpi);
+  }
+#endif  // CONFIG_REFERENCE_BUFFER
+
   if (!cm->error_resilient_mode) {
     aom_wb_write_bit(
         wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD);
   }
-
+#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
   aom_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
-
-  assert(cm->mib_size == mi_size_wide[cm->sb_size]);
-  assert(cm->mib_size == 1 << cm->mib_size_log2);
-#if CONFIG_EXT_PARTITION
-  assert(cm->sb_size == BLOCK_128X128 || cm->sb_size == BLOCK_64X64);
-  aom_wb_write_bit(wb, cm->sb_size == BLOCK_128X128 ? 1 : 0);
-#else
-  assert(cm->sb_size == BLOCK_64X64);
-#endif  // CONFIG_EXT_PARTITION
-
+#endif
   encode_loopfilter(cm, wb);
   encode_quantization(cm, wb);
   encode_segmentation(cm, xd, wb);
-#if CONFIG_DELTA_Q
   {
     int i;
     struct segmentation *const seg = &cm->seg;
@@ -4434,12 +4809,16 @@ static void write_uncompressed_header(AV1_COMP *cpi,
         if (cm->delta_lf_present_flag) {
           aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2);
           xd->prev_delta_lf_from_base = 0;
+#if CONFIG_LOOPFILTER_LEVEL
+          aom_wb_write_bit(wb, cm->delta_lf_multi);
+          for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id)
+            xd->prev_delta_lf[lf_id] = 0;
+#endif  // CONFIG_LOOPFILTER_LEVEL
         }
 #endif  // CONFIG_EXT_DELTA_Q
       }
     }
   }
-#endif
 #if CONFIG_CDEF
   if (!cm->all_lossless) {
     encode_cdef(cm, wb);
@@ -4461,113 +4840,372 @@ static void write_uncompressed_header(AV1_COMP *cpi,
     if (!use_hybrid_pred) aom_wb_write_bit(wb, use_compound_pred);
 #endif  // !CONFIG_REF_ADAPT
   }
-#if CONFIG_EXT_INTER
   write_compound_tools(cm, wb);
-#endif  // CONFIG_EXT_INTER
 
 #if CONFIG_EXT_TX
   aom_wb_write_bit(wb, cm->reduced_tx_set_used);
 #endif  // CONFIG_EXT_TX
 
-  write_tile_info(cm, wb);
-}
+#if CONFIG_ADAPT_SCAN
+  aom_wb_write_bit(wb, cm->use_adapt_scan);
+#endif
 
 #if CONFIG_GLOBAL_MOTION
-static void write_global_motion_params(WarpedMotionParams *params,
-                                       WarpedMotionParams *ref_params,
-                                       aom_writer *w, int allow_hp) {
-  TransformationType type = params->wmtype;
-  int trans_bits;
-  int trans_prec_diff;
-  aom_write_bit(w, type != IDENTITY);
-  if (type != IDENTITY) aom_write_literal(w, type - 1, GLOBAL_TYPE_BITS);
+  if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb);
+#endif  // CONFIG_GLOBAL_MOTION
 
-  switch (type) {
-    case HOMOGRAPHY:
-    case HORTRAPEZOID:
-    case VERTRAPEZOID:
-      if (type != HORTRAPEZOID)
-        aom_write_signed_primitive_refsubexpfin(
-            w, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
-            (ref_params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF),
-            (params->wmmat[6] >> GM_ROW3HOMO_PREC_DIFF));
-      if (type != VERTRAPEZOID)
-        aom_write_signed_primitive_refsubexpfin(
-            w, GM_ROW3HOMO_MAX + 1, SUBEXPFIN_K,
-            (ref_params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF),
-            (params->wmmat[7] >> GM_ROW3HOMO_PREC_DIFF));
-    // fallthrough intended
-    case AFFINE:
-    case ROTZOOM:
-      aom_write_signed_primitive_refsubexpfin(
-          w, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-          (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
-              (1 << GM_ALPHA_PREC_BITS),
-          (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
-      if (type != VERTRAPEZOID)
-        aom_write_signed_primitive_refsubexpfin(
-            w, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-            (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF),
-            (params->wmmat[3] >> GM_ALPHA_PREC_DIFF));
-      if (type >= AFFINE) {
-        if (type != HORTRAPEZOID)
-          aom_write_signed_primitive_refsubexpfin(
-              w, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-              (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF),
-              (params->wmmat[4] >> GM_ALPHA_PREC_DIFF));
-        aom_write_signed_primitive_refsubexpfin(
-            w, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
-            (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
-                (1 << GM_ALPHA_PREC_BITS),
-            (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
-                (1 << GM_ALPHA_PREC_BITS));
-      }
-    // fallthrough intended
-    case TRANSLATION:
-      trans_bits = (type == TRANSLATION) ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
-                                         : GM_ABS_TRANS_BITS;
-      trans_prec_diff = (type == TRANSLATION)
-                            ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
-                            : GM_TRANS_PREC_DIFF;
-      aom_write_signed_primitive_refsubexpfin(
-          w, (1 << trans_bits) + 1, SUBEXPFIN_K,
-          (ref_params->wmmat[0] >> trans_prec_diff),
-          (params->wmmat[0] >> trans_prec_diff));
-      aom_write_signed_primitive_refsubexpfin(
-          w, (1 << trans_bits) + 1, SUBEXPFIN_K,
-          (ref_params->wmmat[1] >> trans_prec_diff),
-          (params->wmmat[1] >> trans_prec_diff));
-      break;
-    case IDENTITY: break;
-    default: assert(0);
-  }
+  write_tile_info(cm, wb);
 }
 
-static void write_global_motion(AV1_COMP *cpi, aom_writer *w) {
+#else
+// New function based on HLS R18
+static void write_uncompressed_header_obu(AV1_COMP *cpi,
+                                          struct aom_write_bit_buffer *wb) {
   AV1_COMMON *const cm = &cpi->common;
-  int frame;
-  YV12_BUFFER_CONFIG *ref_buf;
-  for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
-    ref_buf = get_ref_frame_buffer(cpi, frame);
-    if (cpi->source->y_crop_width == ref_buf->y_crop_width &&
-        cpi->source->y_crop_height == ref_buf->y_crop_height) {
-      write_global_motion_params(&cm->global_motion[frame],
-                                 &cm->prev_frame->global_motion[frame], w,
-                                 cm->allow_high_precision_mv);
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+#if CONFIG_EXT_TILE
+  aom_wb_write_literal(wb, cm->large_scale_tile, 1);
+#endif  // CONFIG_EXT_TILE
+
+#if CONFIG_EXT_REFS
+  // NOTE: By default all coded frames are to be used as a reference
+  cm->is_reference_frame = 1;
+
+  if (cm->show_existing_frame) {
+    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+    const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+
+    if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Buffer %d does not contain a reconstructed frame",
+                         frame_to_show);
+    }
+    ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+
+    aom_wb_write_bit(wb, 1);  // show_existing_frame
+    aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
+
+#if CONFIG_REFERENCE_BUFFER
+    if (cm->seq_params.frame_id_numbers_present_flag) {
+      int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7;
+      int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
+      aom_wb_write_literal(wb, display_frame_id, frame_id_len);
+      /* Add a zero byte to prevent emulation of superframe marker */
+      /* Same logic as when terminating the entropy coder */
+      /* Consider having this logic in only one place */
+      aom_wb_write_literal(wb, 0, 8);
+    }
+#endif  // CONFIG_REFERENCE_BUFFER
+
+    return;
+  } else {
+#endif  // CONFIG_EXT_REFS
+    aom_wb_write_bit(wb, 0);  // show_existing_frame
+#if CONFIG_EXT_REFS
+  }
+#endif  // CONFIG_EXT_REFS
+
+  cm->frame_type = cm->intra_only ? INTRA_ONLY_FRAME : cm->frame_type;
+  aom_wb_write_literal(wb, cm->frame_type, 2);
+
+  if (cm->intra_only) cm->frame_type = INTRA_ONLY_FRAME;
+
+  aom_wb_write_bit(wb, cm->show_frame);
+  aom_wb_write_bit(wb, cm->error_resilient_mode);
+
+#if CONFIG_REFERENCE_BUFFER
+  cm->invalid_delta_frame_id_minus1 = 0;
+  if (cm->seq_params.frame_id_numbers_present_flag) {
+    int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7;
+    aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
+  }
+#endif  // CONFIG_REFERENCE_BUFFER
+  if (cm->frame_type == KEY_FRAME) {
+    write_frame_size(cm, wb);
+    write_sb_size(cm, wb);
+
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+    assert(cpi->common.ans_window_size_log2 >= 8);
+    assert(cpi->common.ans_window_size_log2 < 24);
+    aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4);
+#endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
+    aom_wb_write_bit(wb, cm->allow_screen_content_tools);
+#if CONFIG_AMVR
+    if (cm->allow_screen_content_tools) {
+      if (cm->seq_mv_precision_level == 2) {
+        aom_wb_write_bit(wb, 1);
+      } else {
+        aom_wb_write_bit(wb, 0);
+        aom_wb_write_bit(wb, cm->seq_mv_precision_level == 0);
+      }
+    }
+#endif
+  } else if (cm->frame_type == INTRA_ONLY_FRAME) {
+    if (cm->intra_only) aom_wb_write_bit(wb, cm->allow_screen_content_tools);
+#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
+    if (!cm->error_resilient_mode) {
+      if (cm->intra_only) {
+        aom_wb_write_bit(wb,
+                         cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
+      }
+    }
+#endif
+#if CONFIG_EXT_REFS
+    cpi->refresh_frame_mask = get_refresh_mask(cpi);
+#endif  // CONFIG_EXT_REFS
+
+    if (cm->intra_only) {
+#if CONFIG_EXT_REFS
+      aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+#else
+      aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+#endif  // CONFIG_EXT_REFS
+      write_frame_size(cm, wb);
+
+#if CONFIG_ANS && ANS_MAX_SYMBOLS
+      assert(cpi->common.ans_window_size_log2 >= 8);
+      assert(cpi->common.ans_window_size_log2 < 24);
+      aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4);
+#endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
+    }
+  } else if (cm->frame_type == INTER_FRAME) {
+    MV_REFERENCE_FRAME ref_frame;
+#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
+    if (!cm->error_resilient_mode) {
+      aom_wb_write_bit(wb, cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE);
+      if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE)
+        aom_wb_write_bit(wb,
+                         cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
+    }
+#endif
+
+#if CONFIG_EXT_REFS
+    cpi->refresh_frame_mask = get_refresh_mask(cpi);
+    aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+#else
+    aom_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+#endif  // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+    if (!cpi->refresh_frame_mask) {
+      // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame
+      //       will not be used as a reference
+      cm->is_reference_frame = 0;
+    }
+#endif  // CONFIG_EXT_REFS
+
+    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+      assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
+      aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
+                           REF_FRAMES_LOG2);
+#if !CONFIG_FRAME_SIGN_BIAS
+      aom_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
+#endif  // !CONFIG_FRAME_SIGN_BIAS
+#if CONFIG_REFERENCE_BUFFER
+      if (cm->seq_params.frame_id_numbers_present_flag) {
+        int i = get_ref_frame_map_idx(cpi, ref_frame);
+        int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7;
+        int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2;
+        int delta_frame_id_minus1 =
+            ((cm->current_frame_id - cm->ref_frame_id[i] +
+              (1 << frame_id_len)) %
+             (1 << frame_id_len)) -
+            1;
+        if (delta_frame_id_minus1 < 0 ||
+            delta_frame_id_minus1 >= (1 << diff_len))
+          cm->invalid_delta_frame_id_minus1 = 1;
+        aom_wb_write_literal(wb, delta_frame_id_minus1, diff_len);
+      }
+#endif  // CONFIG_REFERENCE_BUFFER
+    }
+
+#if CONFIG_FRAME_SIZE
+    if (cm->error_resilient_mode == 0) {
+      write_frame_size_with_refs(cpi, wb);
     } else {
-      assert(cm->global_motion[frame].wmtype == IDENTITY &&
-             "Invalid warp type for frames of different resolutions");
+      write_frame_size(cm, wb);
     }
-    /*
-    printf("Frame %d/%d: Enc Ref %d (used %d): %d %d %d %d\n",
-           cm->current_video_frame, cm->show_frame, frame,
-           cpi->global_motion_used[frame], cm->global_motion[frame].wmmat[0],
-           cm->global_motion[frame].wmmat[1], cm->global_motion[frame].wmmat[2],
-           cm->global_motion[frame].wmmat[3]);
-           */
+#else
+    write_frame_size_with_refs(cpi, wb);
+#endif
+
+#if CONFIG_AMVR
+    if (cm->seq_mv_precision_level == 2) {
+      aom_wb_write_bit(wb, cm->cur_frame_mv_precision_level == 0);
+    }
+#endif
+    aom_wb_write_bit(wb, cm->allow_high_precision_mv);
+
+    fix_interp_filter(cm, cpi->td.counts);
+    write_frame_interp_filter(cm->interp_filter, wb);
+#if CONFIG_TEMPMV_SIGNALING
+    if (frame_might_use_prev_frame_mvs(cm)) {
+      aom_wb_write_bit(wb, cm->use_prev_frame_mvs);
+    }
+#endif
+  } else if (cm->frame_type == S_FRAME) {
+    MV_REFERENCE_FRAME ref_frame;
+
+#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
+    if (!cm->error_resilient_mode) {
+      aom_wb_write_bit(wb, cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE);
+      if (cm->reset_frame_context != RESET_FRAME_CONTEXT_NONE)
+        aom_wb_write_bit(wb,
+                         cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
+    }
+#endif
+
+#if CONFIG_EXT_REFS
+    if (!cpi->refresh_frame_mask) {
+      // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame
+      //       will not be used as a reference
+      cm->is_reference_frame = 0;
+    }
+#endif  // CONFIG_EXT_REFS
+
+    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+      assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
+      aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
+                           REF_FRAMES_LOG2);
+      assert(cm->ref_frame_sign_bias[ref_frame] == 0);
+#if CONFIG_REFERENCE_BUFFER
+      if (cm->seq_params.frame_id_numbers_present_flag) {
+        int i = get_ref_frame_map_idx(cpi, ref_frame);
+        int frame_id_len = cm->seq_params.frame_id_length_minus7 + 7;
+        int diff_len = cm->seq_params.delta_frame_id_length_minus2 + 2;
+        int delta_frame_id_minus1 =
+            ((cm->current_frame_id - cm->ref_frame_id[i] +
+              (1 << frame_id_len)) %
+             (1 << frame_id_len)) -
+            1;
+        if (delta_frame_id_minus1 < 0 ||
+            delta_frame_id_minus1 >= (1 << diff_len))
+          cm->invalid_delta_frame_id_minus1 = 1;
+        aom_wb_write_literal(wb, delta_frame_id_minus1, diff_len);
+      }
+#endif  // CONFIG_REFERENCE_BUFFER
+    }
+
+#if CONFIG_FRAME_SIZE
+    if (cm->error_resilient_mode == 0) {
+      write_frame_size_with_refs(cpi, wb);
+    } else {
+      write_frame_size(cm, wb);
+    }
+#else
+    write_frame_size_with_refs(cpi, wb);
+#endif
+
+    aom_wb_write_bit(wb, cm->allow_high_precision_mv);
+
+    fix_interp_filter(cm, cpi->td.counts);
+    write_frame_interp_filter(cm->interp_filter, wb);
+#if CONFIG_TEMPMV_SIGNALING
+    if (frame_might_use_prev_frame_mvs(cm)) {
+      aom_wb_write_bit(wb, cm->use_prev_frame_mvs);
+    }
+#endif
+  }
+
+#if CONFIG_MFMV
+  if (cm->show_frame == 0) {
+    int arf_offset = AOMMIN(
+        (MAX_GF_INTERVAL - 1),
+        cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]);
+#if CONFIG_EXT_REFS
+    int brf_offset =
+        cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index];
+
+    arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset);
+#endif
+    aom_wb_write_literal(wb, arf_offset, 4);
+  }
+#endif
+
+#if CONFIG_REFERENCE_BUFFER
+  if (cm->seq_params.frame_id_numbers_present_flag) {
+    cm->refresh_mask =
+        cm->frame_type == KEY_FRAME ? 0xFF : get_refresh_mask(cpi);
+  }
+#endif  // CONFIG_REFERENCE_BUFFER
+
+  if (!cm->error_resilient_mode) {
+    aom_wb_write_bit(
+        wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD);
+  }
+#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
+  aom_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
+#endif
+  encode_loopfilter(cm, wb);
+  encode_quantization(cm, wb);
+  encode_segmentation(cm, xd, wb);
+  {
+    int i;
+    struct segmentation *const seg = &cm->seg;
+    int segment_quantizer_active = 0;
+    for (i = 0; i < MAX_SEGMENTS; i++) {
+      if (segfeature_active(seg, i, SEG_LVL_ALT_Q)) {
+        segment_quantizer_active = 1;
+      }
+    }
+
+    if (cm->delta_q_present_flag)
+      assert(segment_quantizer_active == 0 && cm->base_qindex > 0);
+    if (segment_quantizer_active == 0 && cm->base_qindex > 0) {
+      aom_wb_write_bit(wb, cm->delta_q_present_flag);
+      if (cm->delta_q_present_flag) {
+        aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_q_res) - 1, 2);
+        xd->prev_qindex = cm->base_qindex;
+#if CONFIG_EXT_DELTA_Q
+        assert(seg->abs_delta == SEGMENT_DELTADATA);
+        aom_wb_write_bit(wb, cm->delta_lf_present_flag);
+        if (cm->delta_lf_present_flag) {
+          aom_wb_write_literal(wb, OD_ILOG_NZ(cm->delta_lf_res) - 1, 2);
+#if CONFIG_LOOPFILTER_LEVEL
+          for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id)
+            xd->prev_delta_lf[lf_id] = 0;
+#endif  // CONFIG_LOOPFILTER_LEVEL
+          xd->prev_delta_lf_from_base = 0;
+        }
+#endif  // CONFIG_EXT_DELTA_Q
+      }
+    }
+  }
+#if CONFIG_CDEF
+  if (!cm->all_lossless) {
+    encode_cdef(cm, wb);
   }
-}
 #endif
+#if CONFIG_LOOP_RESTORATION
+  encode_restoration_mode(cm, wb);
+#endif  // CONFIG_LOOP_RESTORATION
+  write_tx_mode(cm, &cm->tx_mode, wb);
+
+  if (cpi->allow_comp_inter_inter) {
+    const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
+#if !CONFIG_REF_ADAPT
+    const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
+#endif  // !CONFIG_REF_ADAPT
+
+    aom_wb_write_bit(wb, use_hybrid_pred);
+#if !CONFIG_REF_ADAPT
+    if (!use_hybrid_pred) aom_wb_write_bit(wb, use_compound_pred);
+#endif  // !CONFIG_REF_ADAPT
+  }
+  write_compound_tools(cm, wb);
+
+#if CONFIG_EXT_TX
+  aom_wb_write_bit(wb, cm->reduced_tx_set_used);
+#endif  // CONFIG_EXT_TX
+
+#if CONFIG_GLOBAL_MOTION
+  if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb);
+#endif  // CONFIG_GLOBAL_MOTION
+
+  write_tile_info(cm, wb);
+}
+#endif  // CONFIG_OBU
 
 static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) {
   AV1_COMMON *const cm = &cpi->common;
@@ -4587,19 +5225,13 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) {
   (void)i;
   (void)fc;
 
-#if CONFIG_ANS
-  int header_size;
-  header_bc = &cpi->buf_ans;
-  buf_ans_write_init(header_bc, data);
-#else
   aom_writer real_header_bc;
   header_bc = &real_header_bc;
-  aom_start_encode(header_bc, data);
+#if CONFIG_ANS
+  header_bc->size = 1 << cpi->common.ans_window_size_log2;
 #endif
+  aom_start_encode(header_bc, data);
 
-#if CONFIG_LOOP_RESTORATION
-  encode_restoration(cm, header_bc);
-#endif  // CONFIG_LOOP_RESTORATION
 #if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
   if (cm->tx_mode == TX_MODE_SELECT)
     av1_cond_prob_diff_update(header_bc, &cm->fc->quarter_tx_size_prob,
@@ -4610,27 +5242,18 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) {
 #endif  // CONFIG_LV_MAP
 
 #if CONFIG_VAR_TX && !CONFIG_NEW_MULTISYMBOL
-  update_txfm_partition_probs(cm, header_bc, counts, probwt);
+  if (cm->tx_mode == TX_MODE_SELECT)
+    update_txfm_partition_probs(cm, header_bc, counts, probwt);
 #endif
 
 #if !CONFIG_NEW_MULTISYMBOL
   update_skip_probs(cm, header_bc, counts);
 #endif
 
-  if (frame_is_intra_only(cm)) {
-    av1_copy(cm->fc->kf_y_cdf, av1_kf_y_mode_cdf);
-
-#if CONFIG_INTRABC
-    if (cm->allow_screen_content_tools) {
-      av1_cond_prob_diff_update(header_bc, &fc->intrabc_prob,
-                                cm->counts.intrabc, probwt);
-    }
-#endif
-  } else {
+  if (!frame_is_intra_only(cm)) {
 #if !CONFIG_NEW_MULTISYMBOL
     update_inter_mode_probs(cm, header_bc, counts);
 #endif
-#if CONFIG_EXT_INTER
 #if CONFIG_INTERINTRA
     if (cm->reference_mode != COMPOUND_REFERENCE &&
         cm->allow_interintra_compound) {
@@ -4656,17 +5279,6 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) {
 #endif  // CONFIG_WEDGE && CONFIG_NEW_MULTISYMBOL
     }
 #endif  // CONFIG_INTERINTRA
-#endif  // CONFIG_EXT_INTER
-
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-    for (i = ADAPT_OVERLAP_BLOCK_8X8; i < ADAPT_OVERLAP_BLOCKS; ++i) {
-      prob_diff_update(av1_ncobmc_mode_tree, fc->ncobmc_mode_prob[i],
-                       counts->ncobmc_mode[i], MAX_NCOBMC_MODES, probwt,
-                       header_bc);
-    }
-#endif
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 
 #if !CONFIG_NEW_MULTISYMBOL
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
@@ -4724,11 +5336,11 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) {
     }
 #endif  // CONFIG_NEW_MULTISYMBOL
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
     for (i = 0; i < COMP_INTER_MODE_CONTEXTS; i++)
       av1_cond_prob_diff_update(header_bc, &fc->comp_inter_mode_prob[i],
                                 counts->comp_inter_mode[i], probwt);
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
 
 #if !CONFIG_NEW_MULTISYMBOL
     av1_write_nmv_probs(cm, cm->allow_high_precision_mv, header_bc, counts->mv);
@@ -4736,22 +5348,13 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) {
 #if CONFIG_SUPERTX
     if (!xd->lossless[0]) update_supertx_probs(cm, probwt, header_bc);
 #endif  // CONFIG_SUPERTX
-#if CONFIG_GLOBAL_MOTION
-    write_global_motion(cpi, header_bc);
-#endif  // CONFIG_GLOBAL_MOTION
   }
-#if CONFIG_ANS
-  aom_buf_ans_flush(header_bc);
-  header_size = buf_ans_write_end(header_bc);
-  assert(header_size <= 0xffff);
-  return header_size;
-#else
   aom_stop_encode(header_bc);
   assert(header_bc->pos <= 0xffff);
   return header_bc->pos;
-#endif  // CONFIG_ANS
 }
 
+#if !CONFIG_OBU || CONFIG_EXT_TILE
 static int choose_size_bytes(uint32_t size, int spare_msbs) {
   // Choose the number of bytes required to represent size, without
   // using the 'spare_msbs' number of most significant bits.
@@ -4781,6 +5384,7 @@ static void mem_put_varsize(uint8_t *const dst, const int sz, const int val) {
     default: assert(0 && "Invalid size"); break;
   }
 }
+
 static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
                        const uint32_t data_size, const uint32_t max_tile_size,
                        const uint32_t max_tile_col_size,
@@ -4889,14 +5493,334 @@ static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
     return wpos;
   }
 }
+#endif
+
+#if CONFIG_OBU
+static uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
+                                 uint8_t *const dst) {
+  struct aom_write_bit_buffer wb = { dst, 0 };
+  uint32_t size = 0;
+
+  aom_wb_write_literal(&wb, (int)obu_type, 5);
+  aom_wb_write_literal(&wb, 0, 2);
+  aom_wb_write_literal(&wb, obu_extension ? 1 : 0, 1);
+  if (obu_extension) {
+    aom_wb_write_literal(&wb, obu_extension & 0xFF, 8);
+  }
+
+  size = aom_wb_bytes_written(&wb);
+  return size;
+}
+
+static uint32_t write_temporal_delimiter_obu() { return 0; }
+
+static uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
+  AV1_COMMON *const cm = &cpi->common;
+  SequenceHeader *const seq_params = &cm->seq_params;
+  struct aom_write_bit_buffer wb = { dst, 0 };
+  uint32_t size = 0;
+
+  write_profile(cm->profile, &wb);
+
+  aom_wb_write_literal(&wb, 0, 4);
+
+  seq_params->frame_id_numbers_present_flag = FRAME_ID_NUMBERS_PRESENT_FLAG;
+  aom_wb_write_literal(&wb, seq_params->frame_id_numbers_present_flag, 1);
+  if (seq_params->frame_id_numbers_present_flag) {
+    seq_params->frame_id_length_minus7 = FRAME_ID_LENGTH_MINUS7;
+    seq_params->delta_frame_id_length_minus2 = DELTA_FRAME_ID_LENGTH_MINUS2;
+    aom_wb_write_literal(&wb, seq_params->frame_id_length_minus7, 4);
+    aom_wb_write_literal(&wb, seq_params->delta_frame_id_length_minus2, 4);
+  }
+
+  // color_config
+  write_bitdepth_colorspace_sampling(cm, &wb);
+
+  size = aom_wb_bytes_written(&wb);
+  return size;
+}
+
+static uint32_t write_frame_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
+  AV1_COMMON *const cm = &cpi->common;
+  struct aom_write_bit_buffer wb = { dst, 0 };
+  uint32_t total_size = 0;
+  uint32_t compressed_hdr_size, uncompressed_hdr_size;
+
+  write_uncompressed_header_obu(cpi, &wb);
+
+  if (cm->show_existing_frame) {
+    total_size = aom_wb_bytes_written(&wb);
+    return total_size;
+  }
+
+  // write the tile length code  (Always 4 bytes for now)
+  aom_wb_write_literal(&wb, 3, 2);
+
+  if (!use_compressed_header(cm)) {
+    uncompressed_hdr_size = aom_wb_bytes_written(&wb);
+    compressed_hdr_size = 0;
+  } else {
+    // placeholder for the compressed header length
+    struct aom_write_bit_buffer compr_hdr_len_wb = wb;
+    aom_wb_write_literal(&wb, 0, 16);
+
+    uncompressed_hdr_size = aom_wb_bytes_written(&wb);
+    compressed_hdr_size =
+        write_compressed_header(cpi, dst + uncompressed_hdr_size);
+    aom_wb_overwrite_literal(&compr_hdr_len_wb, (int)(compressed_hdr_size), 16);
+  }
+
+  total_size = uncompressed_hdr_size + compressed_hdr_size;
+  return total_size;
+}
+
+static uint32_t write_tile_group_header(uint8_t *const dst, int startTile,
+                                        int endTile, int tiles_log2) {
+  struct aom_write_bit_buffer wb = { dst, 0 };
+  uint32_t size = 0;
+
+  aom_wb_write_literal(&wb, startTile, tiles_log2);
+  aom_wb_write_literal(&wb, endTile, tiles_log2);
+
+  size = aom_wb_bytes_written(&wb);
+  return size;
+}
+
+static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
+                                       unsigned int *max_tile_size,
+                                       unsigned int *max_tile_col_size,
+                                       uint8_t *const frame_header_obu_location,
+                                       uint32_t frame_header_obu_size,
+                                       int insert_frame_header_obu_flag) {
+  const AV1_COMMON *const cm = &cpi->common;
+  aom_writer mode_bc;
+  int tile_row, tile_col;
+  TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok;
+  TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers;
+  uint32_t total_size = 0;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
+  unsigned int tile_size = 0;
+  const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
+  // Fixed size tile groups for the moment
+  const int num_tg_hdrs = cm->num_tg;
+  const int tg_size =
+#if CONFIG_EXT_TILE
+      (cm->large_scale_tile)
+          ? 1
+          :
+#endif  // CONFIG_EXT_TILE
+          (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;
+  int tile_count = 0;
+  int curr_tg_data_size = 0;
+  uint8_t *data = dst;
+  int new_tg = 1;
+#if CONFIG_EXT_TILE
+  const int have_tiles = tile_cols * tile_rows > 1;
+#endif
+
+  *max_tile_size = 0;
+  *max_tile_col_size = 0;
+
+#if CONFIG_EXT_TILE
+  if (cm->large_scale_tile) {
+    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+      TileInfo tile_info;
+      const int is_last_col = (tile_col == tile_cols - 1);
+      const uint32_t col_offset = total_size;
+
+      av1_tile_set_col(&tile_info, cm, tile_col);
+
+      // The last column does not have a column header
+      if (!is_last_col) total_size += 4;
+
+      for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+        TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+        const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
+        const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
+        const int data_offset = have_tiles ? 4 : 0;
+        const int tile_idx = tile_row * tile_cols + tile_col;
+        TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+        av1_tile_set_row(&tile_info, cm, tile_row);
+
+        buf->data = dst + total_size;
+
+        // If CONFIG_EXT_TILE = 1, every tile in the row has a header,
+        // even for the last one, unless no tiling is used at all.
+        total_size += data_offset;
+        // Initialise tile context from the frame context
+        this_tile->tctx = *cm->fc;
+        cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+#if CONFIG_PVQ
+        cpi->td.mb.pvq_q = &this_tile->pvq_q;
+        cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
+#endif  // CONFIG_PVQ
+#if CONFIG_ANS
+        mode_bc.size = 1 << cpi->common.ans_window_size_log2;
+#endif
+        aom_start_encode(&mode_bc, buf->data + data_offset);
+        write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
+        assert(tok == tok_end);
+        aom_stop_encode(&mode_bc);
+        tile_size = mode_bc.pos;
+#if CONFIG_PVQ
+        cpi->td.mb.pvq_q = NULL;
+#endif
+        buf->size = tile_size;
+
+        // Record the maximum tile size we see, so we can compact headers later.
+        *max_tile_size = AOMMAX(*max_tile_size, tile_size);
+
+        if (have_tiles) {
+          // tile header: size of this tile, or copy offset
+          uint32_t tile_header = tile_size;
+          const int tile_copy_mode =
+              ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256)
+                  ? 1
+                  : 0;
+
+          // If tile_copy_mode = 1, check if this tile is a copy tile.
+          // Very low chances to have copy tiles on the key frames, so don't
+          // search on key frames to reduce unnecessary search.
+          if (cm->frame_type != KEY_FRAME && tile_copy_mode) {
+            const int idendical_tile_offset =
+                find_identical_tile(tile_row, tile_col, tile_buffers);
+
+            if (idendical_tile_offset > 0) {
+              tile_size = 0;
+              tile_header = idendical_tile_offset | 0x80;
+              tile_header <<= 24;
+            }
+          }
+
+          mem_put_le32(buf->data, tile_header);
+        }
+
+        total_size += tile_size;
+      }
+
+      if (!is_last_col) {
+        uint32_t col_size = total_size - col_offset - 4;
+        mem_put_le32(dst + col_offset, col_size);
+
+        // Record the maximum tile column size seen so far. No range check is
+        // done here: the value is only returned through max_tile_col_size.
+        *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size);
+      }
+    }
+  } else {
+#endif  // CONFIG_EXT_TILE
+
+    for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+      TileInfo tile_info;
+      const int is_last_row = (tile_row == tile_rows - 1);
+      av1_tile_set_row(&tile_info, cm, tile_row);
+
+      for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+        const int tile_idx = tile_row * tile_cols + tile_col;
+        TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+        TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+        const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
+        const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
+        const int is_last_col = (tile_col == tile_cols - 1);
+        const int is_last_tile = is_last_col && is_last_row;
+        int is_last_tile_in_tg = 0;
+
+        if (new_tg) {
+          if (insert_frame_header_obu_flag && tile_idx) {
+            // insert a copy of frame header OBU (including 4-byte size),
+            // except before the first tile group
+            data = dst + total_size;
+            memmove(data, frame_header_obu_location, frame_header_obu_size);
+            total_size += frame_header_obu_size;
+          }
+          data = dst + total_size;
+          // A new tile group begins at this tile.  Write the obu header and
+          // tile group header
+          curr_tg_data_size = write_obu_header(OBU_TILE_GROUP, 0, data + 4);
+          if (n_log2_tiles)
+            curr_tg_data_size += write_tile_group_header(
+                data + curr_tg_data_size + 4, tile_idx,
+                AOMMIN(tile_idx + tg_size - 1, tile_cols * tile_rows - 1),
+                n_log2_tiles);
+          total_size += curr_tg_data_size + 4;
+          new_tg = 0;
+          tile_count = 0;
+        }
+        tile_count++;
+        av1_tile_set_col(&tile_info, cm, tile_col);
+
+        if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) {
+          is_last_tile_in_tg = 1;
+          new_tg = 1;
+        } else {
+          is_last_tile_in_tg = 0;
+        }
+
+#if CONFIG_DEPENDENT_HORZTILES
+        av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col);
+#endif
+        buf->data = dst + total_size;
+
+        // The last tile of the tile group does not have a header.
+        if (!is_last_tile_in_tg) total_size += 4;
+
+        // Initialise tile context from the frame context
+        this_tile->tctx = *cm->fc;
+        cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+#if CONFIG_PVQ
+        cpi->td.mb.pvq_q = &this_tile->pvq_q;
+        cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
+#endif  // CONFIG_PVQ
+#if CONFIG_ANS
+        mode_bc.size = 1 << cpi->common.ans_window_size_log2;
+#endif  // CONFIG_ANS
+        aom_start_encode(&mode_bc, dst + total_size);
+        write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
+#if !CONFIG_LV_MAP
+#if !CONFIG_PVQ
+        assert(tok == tok_end);
+#endif  // !CONFIG_PVQ
+#endif  // !CONFIG_LV_MAP
+        aom_stop_encode(&mode_bc);
+        tile_size = mode_bc.pos;
+#if CONFIG_PVQ
+        cpi->td.mb.pvq_q = NULL;
+#endif
+        assert(tile_size > 0);
+
+        curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 0 : 4));
+        buf->size = tile_size;
+
+        if (!is_last_tile) {
+          *max_tile_size = AOMMAX(*max_tile_size, tile_size);
+        }
+        if (!is_last_tile_in_tg) {
+          // size of this tile
+          mem_put_le32(buf->data, tile_size);
+        } else {
+          // write current tile group size
+          mem_put_le32(data, curr_tg_data_size);
+        }
+
+        total_size += tile_size;
+      }
+    }
+#if CONFIG_EXT_TILE
+  }
+#endif  // CONFIG_EXT_TILE
+  return (uint32_t)total_size;
+}
+
+#endif
 
 void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
   uint8_t *data = dst;
   uint32_t data_size;
 #if CONFIG_EXT_TILE
   AV1_COMMON *const cm = &cpi->common;
-  uint32_t compressed_header_size = 0;
-  uint32_t uncompressed_header_size;
+  uint32_t compressed_hdr_size = 0;
+  uint32_t uncompressed_hdr_size;
   struct aom_write_bit_buffer saved_wb;
   struct aom_write_bit_buffer wb = { data, 0 };
   const int have_tiles = cm->tile_cols * cm->tile_rows > 1;
@@ -4905,15 +5829,59 @@ void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
 #endif  // CONFIG_EXT_TILE
   unsigned int max_tile_size;
   unsigned int max_tile_col_size;
+#if CONFIG_OBU
+#if !CONFIG_EXT_TILE
+  AV1_COMMON *const cm = &cpi->common;
+#endif
+  uint32_t obu_size;
+  uint8_t *frame_header_location;
+  uint32_t frame_header_size;
+#endif
 
 #if CONFIG_BITSTREAM_DEBUG
   bitstream_queue_reset_write();
 #endif
 
+#if CONFIG_OBU
+  // write temporal delimiter obu, preceded by 4-byte size
+  obu_size = write_obu_header(OBU_TD, 0, data + 4);
+  obu_size += write_temporal_delimiter_obu(/*data + 4 + obu_size*/);
+  mem_put_le32(data, obu_size);
+  data += obu_size + 4;
+
+  // write sequence header obu if KEY_FRAME, preceded by 4-byte size
+  if (cm->frame_type == KEY_FRAME) {
+    obu_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, data + 4);
+    obu_size += write_sequence_header_obu(cpi, data + 4 + obu_size);
+    mem_put_le32(data, obu_size);
+    data += obu_size + 4;
+  }
+
+  // write frame header obu, preceded by 4-byte size
+  frame_header_location = data + 4;
+  obu_size = write_obu_header(OBU_FRAME_HEADER, 0, frame_header_location);
+  frame_header_size = write_frame_header_obu(cpi, data + 4 + obu_size);
+  obu_size += frame_header_size;
+  mem_put_le32(data, obu_size);
+  data += obu_size + 4;
+
+  if (cm->show_existing_frame) {
+    data_size = 0;
+  } else {
+    //  Each tile group obu will be preceded by 4-byte size of the tile group
+    //  obu
+    data_size =
+        write_tiles_in_tg_obus(cpi, data, &max_tile_size, &max_tile_col_size,
+                               frame_header_location - 4, obu_size + 4,
+                               1 /* cm->error_resilient_mode */);
+  }
+
+#endif
+
 #if CONFIG_EXT_TILE
   if (cm->large_scale_tile) {
     // Write the uncompressed header
-    write_uncompressed_header(cpi, &wb);
+    write_uncompressed_header_frame(cpi, &wb);
 
 #if CONFIG_EXT_REFS
     if (cm->show_existing_frame) {
@@ -4934,23 +5902,29 @@ void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
       // Number of bytes in tile size - 1
       aom_wb_write_literal(&wb, 0, 2);
     }
-    // Size of compressed header
-    aom_wb_write_literal(&wb, 0, 16);
-
-    uncompressed_header_size = (uint32_t)aom_wb_bytes_written(&wb);
-    data += uncompressed_header_size;
 
-    aom_clear_system_state();
-
-    // Write the compressed header
-    compressed_header_size = write_compressed_header(cpi, data);
-    data += compressed_header_size;
+    if (!use_compressed_header(cm)) {
+      uncompressed_hdr_size = (uint32_t)aom_wb_bytes_written(&wb);
+      aom_clear_system_state();
+      compressed_hdr_size = 0;
+    } else {
+      // Size of compressed header
+      aom_wb_write_literal(&wb, 0, 16);
+      uncompressed_hdr_size = (uint32_t)aom_wb_bytes_written(&wb);
+      aom_clear_system_state();
+      // Write the compressed header
+      compressed_hdr_size =
+          write_compressed_header(cpi, data + uncompressed_hdr_size);
+    }
+    data += uncompressed_hdr_size + compressed_hdr_size;
 
     // Write the encoded tile data
     data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size);
   } else {
 #endif  // CONFIG_EXT_TILE
+#if !CONFIG_OBU
     data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size);
+#endif
 #if CONFIG_EXT_TILE
   }
 #endif  // CONFIG_EXT_TILE
@@ -4972,9 +5946,9 @@ void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
       assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
       aom_wb_write_literal(&saved_wb, tile_size_bytes - 1, 2);
     }
-    // TODO(jbb): Figure out what to do if compressed_header_size > 16 bits.
-    assert(compressed_header_size <= 0xffff);
-    aom_wb_write_literal(&saved_wb, compressed_header_size, 16);
+    // TODO(jbb): Figure out what to do if compressed_hdr_size > 16 bits.
+    assert(compressed_hdr_size <= 0xffff);
+    aom_wb_write_literal(&saved_wb, compressed_hdr_size, 16);
   } else {
 #endif  // CONFIG_EXT_TILE
     data += data_size;
diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h
index 29c930356..76eb85116 100644
--- a/third_party/aom/av1/encoder/bitstream.h
+++ b/third_party/aom/av1/encoder/bitstream.h
@@ -18,12 +18,11 @@ extern "C" {
 
 #include "av1/encoder/encoder.h"
 
+struct aom_write_bit_buffer;
+
 #if CONFIG_REFERENCE_BUFFER
-void write_sequence_header(
-#if CONFIG_EXT_TILE
-    AV1_COMMON *const cm,
-#endif  // CONFIG_EXT_TILE
-    SequenceHeader *seq_params);
+void write_sequence_header(AV1_COMMON *const cm,
+                           struct aom_write_bit_buffer *wb);
 #endif
 
 void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size);
diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h
index 7b6eb0b0e..8b6627825 100644
--- a/third_party/aom/av1/encoder/block.h
+++ b/third_party/aom/av1/encoder/block.h
@@ -18,6 +18,10 @@
 #include "av1/encoder/encint.h"
 #endif
 #include "av1/common/mvref_common.h"
+#include "av1/encoder/hash.h"
+#if CONFIG_DIST_8X8
+#include "aom/aomcx.h"
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -60,28 +64,52 @@ typedef struct macroblock_plane {
 #endif  // CONFIG_NEW_QUANT
 } MACROBLOCK_PLANE;
 
-/* The [2] dimension is for whether we skip the EOB node (i.e. if previous
- * coefficient in this block was zero) or not. */
-typedef unsigned int av1_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][2]
-                                   [COEFF_CONTEXTS][ENTROPY_TOKENS];
+typedef int av1_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
+                          [TAIL_TOKENS];
 
-typedef struct {
-  int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
-  int16_t mode_context[MODE_CTX_REF_FRAMES];
 #if CONFIG_LV_MAP
-  // TODO(angiebird): Reduce the buffer size according to sb_type
+typedef struct {
+  int txb_skip_cost[TXB_SKIP_CONTEXTS][2];
+  int nz_map_cost[SIG_COEF_CONTEXTS][2];
+  int eob_cost[EOB_COEF_CONTEXTS][2];
+  int dc_sign_cost[DC_SIGN_CONTEXTS][2];
+  int base_cost[NUM_BASE_LEVELS][COEFF_BASE_CONTEXTS][2];
+#if BR_NODE
+  int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1];
+  int br_cost[BASE_RANGE_SETS][LEVEL_CONTEXTS][2];
+#else   // BR_NODE
+  int lps_cost[LEVEL_CONTEXTS][2];
+#endif  // BR_NODE
+#if CONFIG_CTX1D
+  int eob_mode_cost[TX_CLASSES][2];
+  int empty_line_cost[TX_CLASSES][EMPTY_LINE_CONTEXTS][2];
+  int hv_eob_cost[TX_CLASSES][HV_EOB_CONTEXTS][2];
+#endif
+} LV_MAP_COEFF_COST;
+
+typedef struct {
   tran_low_t tcoeff[MAX_MB_PLANE][MAX_SB_SQUARE];
   uint16_t eobs[MAX_MB_PLANE][MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
   uint8_t txb_skip_ctx[MAX_MB_PLANE]
                       [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
   int dc_sign_ctx[MAX_MB_PLANE]
                  [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+} CB_COEFF_BUFFER;
+#endif
+
+typedef struct {
+  int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+  int16_t mode_context[MODE_CTX_REF_FRAMES];
+#if CONFIG_LV_MAP
+  // TODO(angiebird): Reduce the buffer size according to sb_type
+  tran_low_t *tcoeff[MAX_MB_PLANE];
+  uint16_t *eobs[MAX_MB_PLANE];
+  uint8_t *txb_skip_ctx[MAX_MB_PLANE];
+  int *dc_sign_ctx[MAX_MB_PLANE];
 #endif
   uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
   CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
-#if CONFIG_EXT_INTER
   int16_t compound_mode_context[MODE_CTX_REF_FRAMES];
-#endif  // CONFIG_EXT_INTER
 } MB_MODE_INFO_EXT;
 
 typedef struct {
@@ -91,17 +119,41 @@ typedef struct {
   int row_max;
 } MvLimits;
 
-#if CONFIG_PALETTE
 typedef struct {
   uint8_t best_palette_color_map[MAX_SB_SQUARE];
   float kmeans_data_buf[2 * MAX_SB_SQUARE];
 } PALETTE_BUFFER;
-#endif  // CONFIG_PALETTE
+
+typedef struct {
+  TX_TYPE tx_type;
+  TX_SIZE tx_size;
+#if CONFIG_VAR_TX
+  TX_SIZE min_tx_size;
+  TX_SIZE inter_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
+  uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
+#endif  // CONFIG_VAR_TX
+#if CONFIG_TXK_SEL
+  TX_TYPE txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+#endif  // CONFIG_TXK_SEL
+  RD_STATS rd_stats;
+  uint32_t hash_value;
+} TX_RD_INFO;
+
+#define RD_RECORD_BUFFER_LEN 8
+typedef struct {
+  TX_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN];  // Circular buffer.
+  int index_start;
+  int num;
+  CRC_CALCULATOR crc_calculator;  // Hash function.
+} TX_RD_RECORD;
 
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
   struct macroblock_plane plane[MAX_MB_PLANE];
 
+  // Save the transform RD search info.
+  TX_RD_RECORD tx_rd_record;
+
   MACROBLOCKD e_mbd;
   MB_MODE_INFO_EXT *mbmi_ext;
   int skip_block;
@@ -150,9 +202,7 @@ struct macroblock {
   uint8_t *left_pred_buf;
 #endif  // CONFIG_MOTION_VAR
 
-#if CONFIG_PALETTE
   PALETTE_BUFFER *palette_buffer;
-#endif  // CONFIG_PALETTE
 
   // These define limits to motion vector components to prevent them
   // from extending outside the UMV borders
@@ -169,8 +219,92 @@ struct macroblock {
   int skip_chroma_rd;
 #endif
 
-  // note that token_costs is the cost when eob node is skipped
-  av1_coeff_cost token_costs[TX_SIZES];
+#if CONFIG_LV_MAP
+  LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES];
+  uint16_t cb_offset;
+#endif
+
+  av1_coeff_cost token_head_costs[TX_SIZES];
+  av1_coeff_cost token_tail_costs[TX_SIZES];
+
+  // mode costs
+  int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
+  int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
+  int zeromv_mode_cost[ZEROMV_MODE_CONTEXTS][2];
+  int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
+  int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
+
+  int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+  int compound_type_cost[BLOCK_SIZES_ALL][COMPOUND_TYPES];
+#if CONFIG_COMPOUND_SINGLEREF
+  int inter_singleref_comp_mode_cost[INTER_MODE_CONTEXTS]
+                                    [INTER_SINGLEREF_COMP_MODES];
+#endif  // CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_INTERINTRA
+  int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+#endif  // CONFIG_INTERINTRA
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+  int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES];
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+  int motion_mode_cost1[BLOCK_SIZES_ALL][2];
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+  int motion_mode_cost2[BLOCK_SIZES_ALL][OBMC_FAMILY_MODES];
+#endif
+#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT
+  int ncobmc_mode_cost[ADAPT_OVERLAP_BLOCKS][MAX_NCOBMC_MODES];
+#endif  // CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT
+#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+  int intra_uv_mode_cost[INTRA_MODES][UV_INTRA_MODES];
+  int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+  int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+#if CONFIG_EXT_PARTITION_TYPES
+  int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX]
+                    [EXT_PARTITION_TYPES];
+#else
+  int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX]
+                    [PARTITION_TYPES];
+#endif  // CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_MRC_TX
+  int mrc_mask_inter_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+                         [PALETTE_COLORS];
+  int mrc_mask_intra_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+                         [PALETTE_COLORS];
+#endif  // CONFIG_MRC_TX
+  int palette_y_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
+  int palette_uv_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
+  int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+                          [PALETTE_COLORS];
+  int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+                           [PALETTE_COLORS];
+#if CONFIG_CFL
+  // The rate associated with each alpha codeword
+  int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE];
+#endif  // CONFIG_CFL
+  int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
+#if CONFIG_EXT_TX
+#if CONFIG_LGT_FROM_PRED
+  int intra_lgt_cost[LGT_SIZES][INTRA_MODES][2];
+  int inter_lgt_cost[LGT_SIZES][2];
+#endif
+  int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+  int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+                         [TX_TYPES];
+#else
+  int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
+  int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES];
+#endif  // CONFIG_EXT_TX
+#if CONFIG_EXT_INTRA
+#if CONFIG_INTRA_INTERP
+  int intra_filter_cost[INTRA_FILTERS + 1][INTRA_FILTERS];
+#endif  // CONFIG_INTRA_INTERP
+#endif  // CONFIG_EXT_INTRA
+#if CONFIG_LOOP_RESTORATION
+  int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES];
+#endif  // CONFIG_LOOP_RESTORATION
+#if CONFIG_INTRABC
+  int intrabc_cost[2];
+#endif  // CONFIG_INTRABC
 
   int optimize;
 
@@ -206,6 +340,8 @@ struct macroblock {
   int pvq_coded;  // Indicates whether pvq_info needs be stored to tokenize
 #endif
 #if CONFIG_DIST_8X8
+  int using_dist_8x8;
+  aom_tune_metric tune_metric;
 #if CONFIG_CB4X4
 #if CONFIG_HIGHBITDEPTH
   DECLARE_ALIGNED(16, uint16_t, decoded_8x8[8 * 8]);
@@ -214,10 +350,6 @@ struct macroblock {
 #endif
 #endif  // CONFIG_CB4X4
 #endif  // CONFIG_DIST_8X8
-#if CONFIG_CFL
-  // Whether luma needs to be stored during RDO.
-  int cfl_store_y;
-#endif
 };
 
 #ifdef __cplusplus
diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c
index b1c01b28e..4bbf0e5fb 100644
--- a/third_party/aom/av1/encoder/context_tree.c
+++ b/third_party/aom/av1/encoder/context_tree.c
@@ -22,19 +22,14 @@ static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {
 #endif  // CONFIG_EXT_PARTITION
 };
 
-static void alloc_mode_context(AV1_COMMON *cm, int num_4x4_blk,
+static void alloc_mode_context(AV1_COMMON *cm, int num_pix,
 #if CONFIG_EXT_PARTITION_TYPES
                                PARTITION_TYPE partition,
 #endif
                                PICK_MODE_CONTEXT *ctx) {
-  const int num_blk = (num_4x4_blk < 4 ? 4 : num_4x4_blk);
-  const int num_pix = num_blk * tx_size_2d[0];
   int i;
-#if CONFIG_CB4X4 && CONFIG_VAR_TX
-  ctx->num_4x4_blk = num_blk / 4;
-#else
+  const int num_blk = num_pix / 16;
   ctx->num_4x4_blk = num_blk;
-#endif
 
 #if CONFIG_EXT_PARTITION_TYPES
   ctx->partition = partition;
@@ -64,13 +59,15 @@ static void alloc_mode_context(AV1_COMMON *cm, int num_4x4_blk,
 #endif
   }
 
-#if CONFIG_PALETTE
   for (i = 0; i < 2; ++i) {
     CHECK_MEM_ERROR(
         cm, ctx->color_index_map[i],
         aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
   }
-#endif  // CONFIG_PALETTE
+#if CONFIG_MRC_TX
+  CHECK_MEM_ERROR(cm, ctx->mrc_mask,
+                  aom_memalign(32, num_pix * sizeof(*ctx->mrc_mask)));
+#endif  // CONFIG_MRC_TX
 }
 
 static void free_mode_context(PICK_MODE_CONTEXT *ctx) {
@@ -98,80 +95,63 @@ static void free_mode_context(PICK_MODE_CONTEXT *ctx) {
 #endif
   }
 
-#if CONFIG_PALETTE
   for (i = 0; i < 2; ++i) {
     aom_free(ctx->color_index_map[i]);
     ctx->color_index_map[i] = 0;
   }
-#endif  // CONFIG_PALETTE
+#if CONFIG_MRC_TX
+  aom_free(ctx->mrc_mask);
+  ctx->mrc_mask = 0;
+#endif  // CONFIG_MRC_TX
 }
 
-static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree,
-                                int num_4x4_blk) {
+static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree, int num_pix) {
 #if CONFIG_EXT_PARTITION_TYPES
-  alloc_mode_context(cm, num_4x4_blk, PARTITION_NONE, &tree->none);
-  alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ, &tree->horizontal[0]);
-  alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT, &tree->vertical[0]);
-  alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT, &tree->horizontal[1]);
-  alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT, &tree->vertical[1]);
+  alloc_mode_context(cm, num_pix, PARTITION_NONE, &tree->none);
+  alloc_mode_context(cm, num_pix / 2, PARTITION_HORZ, &tree->horizontal[0]);
+  alloc_mode_context(cm, num_pix / 2, PARTITION_VERT, &tree->vertical[0]);
+  alloc_mode_context(cm, num_pix / 2, PARTITION_VERT, &tree->horizontal[1]);
+  alloc_mode_context(cm, num_pix / 2, PARTITION_VERT, &tree->vertical[1]);
 
-  alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_A,
-                     &tree->horizontala[0]);
-  alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_A,
-                     &tree->horizontala[1]);
-  alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ_A,
-                     &tree->horizontala[2]);
-  alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ_B,
-                     &tree->horizontalb[0]);
-  alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_B,
-                     &tree->horizontalb[1]);
-  alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_B,
-                     &tree->horizontalb[2]);
-  alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_A,
-                     &tree->verticala[0]);
-  alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_A,
-                     &tree->verticala[1]);
-  alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT_A,
-                     &tree->verticala[2]);
-  alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT_B,
-                     &tree->verticalb[0]);
-  alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_B,
-                     &tree->verticalb[1]);
-  alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_B,
-                     &tree->verticalb[2]);
+  alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_A, &tree->horizontala[0]);
+  alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_A, &tree->horizontala[1]);
+  alloc_mode_context(cm, num_pix / 2, PARTITION_HORZ_A, &tree->horizontala[2]);
+  alloc_mode_context(cm, num_pix / 2, PARTITION_HORZ_B, &tree->horizontalb[0]);
+  alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_B, &tree->horizontalb[1]);
+  alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_B, &tree->horizontalb[2]);
+  alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_A, &tree->verticala[0]);
+  alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_A, &tree->verticala[1]);
+  alloc_mode_context(cm, num_pix / 2, PARTITION_VERT_A, &tree->verticala[2]);
+  alloc_mode_context(cm, num_pix / 2, PARTITION_VERT_B, &tree->verticalb[0]);
+  alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_B, &tree->verticalb[1]);
+  alloc_mode_context(cm, num_pix / 4, PARTITION_VERT_B, &tree->verticalb[2]);
   for (int i = 0; i < 4; ++i) {
-    alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_4,
+    alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_4,
                        &tree->horizontal4[i]);
-    alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_4,
-                       &tree->vertical4[i]);
+    alloc_mode_context(cm, num_pix / 4, PARTITION_HORZ_4, &tree->vertical4[i]);
   }
 #if CONFIG_SUPERTX
-  alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ,
-                     &tree->horizontal_supertx);
-  alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT, &tree->vertical_supertx);
-  alloc_mode_context(cm, num_4x4_blk, PARTITION_SPLIT, &tree->split_supertx);
-  alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ_A,
-                     &tree->horizontala_supertx);
-  alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ_B,
-                     &tree->horizontalb_supertx);
-  alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT_A,
-                     &tree->verticala_supertx);
-  alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT_B,
-                     &tree->verticalb_supertx);
+  alloc_mode_context(cm, num_pix, PARTITION_HORZ, &tree->horizontal_supertx);
+  alloc_mode_context(cm, num_pix, PARTITION_VERT, &tree->vertical_supertx);
+  alloc_mode_context(cm, num_pix, PARTITION_SPLIT, &tree->split_supertx);
+  alloc_mode_context(cm, num_pix, PARTITION_HORZ_A, &tree->horizontala_supertx);
+  alloc_mode_context(cm, num_pix, PARTITION_HORZ_B, &tree->horizontalb_supertx);
+  alloc_mode_context(cm, num_pix, PARTITION_VERT_A, &tree->verticala_supertx);
+  alloc_mode_context(cm, num_pix, PARTITION_VERT_B, &tree->verticalb_supertx);
 #endif  // CONFIG_SUPERTX
 #else
-  alloc_mode_context(cm, num_4x4_blk, &tree->none);
-  alloc_mode_context(cm, num_4x4_blk / 2, &tree->horizontal[0]);
-  alloc_mode_context(cm, num_4x4_blk / 2, &tree->vertical[0]);
+  alloc_mode_context(cm, num_pix, &tree->none);
+  alloc_mode_context(cm, num_pix / 2, &tree->horizontal[0]);
+  alloc_mode_context(cm, num_pix / 2, &tree->vertical[0]);
 #if CONFIG_SUPERTX
-  alloc_mode_context(cm, num_4x4_blk, &tree->horizontal_supertx);
-  alloc_mode_context(cm, num_4x4_blk, &tree->vertical_supertx);
-  alloc_mode_context(cm, num_4x4_blk, &tree->split_supertx);
+  alloc_mode_context(cm, num_pix, &tree->horizontal_supertx);
+  alloc_mode_context(cm, num_pix, &tree->vertical_supertx);
+  alloc_mode_context(cm, num_pix, &tree->split_supertx);
 #endif
 
-  if (num_4x4_blk > 4) {
-    alloc_mode_context(cm, num_4x4_blk / 2, &tree->horizontal[1]);
-    alloc_mode_context(cm, num_4x4_blk / 2, &tree->vertical[1]);
+  if (num_pix > 16) {
+    alloc_mode_context(cm, num_pix / 2, &tree->horizontal[1]);
+    alloc_mode_context(cm, num_pix / 2, &tree->vertical[1]);
   } else {
     memset(&tree->horizontal[1], 0, sizeof(tree->horizontal[1]));
     memset(&tree->vertical[1], 0, sizeof(tree->vertical[1]));
@@ -217,8 +197,6 @@ static void free_tree_contexts(PC_TREE *tree) {
 // represents the state of our search.
 void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
   int i, j;
-// TODO(jingning): The pc_tree allocation is redundant. We can take out all
-// the leaf nodes after cb4x4 mode is enabled.
 #if CONFIG_CB4X4
 #if CONFIG_EXT_PARTITION
   const int tree_nodes_inc = 1024;
@@ -239,20 +217,21 @@ void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
 #endif  // CONFIG_EXT_PARTITION
   int pc_tree_index = 0;
   PC_TREE *this_pc;
-  PICK_MODE_CONTEXT *this_leaf;
   int square_index = 1;
   int nodes;
 
+#if !CONFIG_CB4X4
   aom_free(td->leaf_tree);
   CHECK_MEM_ERROR(cm, td->leaf_tree,
                   aom_calloc(leaf_nodes, sizeof(*td->leaf_tree)));
+  PICK_MODE_CONTEXT *this_leaf = &td->leaf_tree[0];
+#endif
   aom_free(td->pc_tree);
   CHECK_MEM_ERROR(cm, td->pc_tree,
                   aom_calloc(tree_nodes, sizeof(*td->pc_tree)));
-
   this_pc = &td->pc_tree[0];
-  this_leaf = &td->leaf_tree[0];
 
+#if !CONFIG_CB4X4
   // 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same
   // context so we only need to allocate 1 for each 8x8 block.
   for (i = 0; i < leaf_nodes; ++i) {
@@ -262,6 +241,7 @@ void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
     alloc_mode_context(cm, 16, &td->leaf_tree[i]);
 #endif
   }
+#endif
 
   // Sets up all the leaf nodes in the tree.
   for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
@@ -272,8 +252,10 @@ void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
 #else
     alloc_tree_contexts(cm, tree, 4);
 #endif
+#if !CONFIG_CB4X4
     tree->leaf_split[0] = this_leaf++;
     for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0];
+#endif
   }
 
   // Each node has 4 leaf nodes, fill each block_size level of the tree
@@ -311,29 +293,28 @@ void av1_free_pc_tree(ThreadData *td) {
 #else
   const int tree_nodes_inc = 256;
 #endif  // CONFIG_EXT_PARTITION
-  const int leaf_factor = 4;
 #else
   const int tree_nodes_inc = 0;
-  const int leaf_factor = 1;
 #endif
 
 #if CONFIG_EXT_PARTITION
-  const int leaf_nodes = 256 * leaf_factor;
   const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
 #else
-  const int leaf_nodes = 64 * leaf_factor;
   const int tree_nodes = tree_nodes_inc + 64 + 16 + 4 + 1;
 #endif  // CONFIG_EXT_PARTITION
   int i;
-
-  // Set up all 4x4 mode contexts
-  for (i = 0; i < leaf_nodes; ++i) free_mode_context(&td->leaf_tree[i]);
-
-  // Sets up all the leaf nodes in the tree.
   for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]);
-
   aom_free(td->pc_tree);
   td->pc_tree = NULL;
+#if !CONFIG_CB4X4
+  const int leaf_factor = 1;
+#if CONFIG_EXT_PARTITION
+  const int leaf_nodes = 256 * leaf_factor;
+#else
+  const int leaf_nodes = 64 * leaf_factor;
+#endif  // CONFIG_EXT_PARTITION
+  for (i = 0; i < leaf_nodes; ++i) free_mode_context(&td->leaf_tree[i]);
   aom_free(td->leaf_tree);
   td->leaf_tree = NULL;
+#endif
 }
diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h
index bcfcc274a..38052ba27 100644
--- a/third_party/aom/av1/encoder/context_tree.h
+++ b/third_party/aom/av1/encoder/context_tree.h
@@ -27,9 +27,10 @@ struct ThreadData;
 typedef struct {
   MODE_INFO mic;
   MB_MODE_INFO_EXT mbmi_ext;
-#if CONFIG_PALETTE
   uint8_t *color_index_map[2];
-#endif  // CONFIG_PALETTE
+#if CONFIG_MRC_TX
+  uint8_t *mrc_mask;
+#endif  // CONFIG_MRC_TX
 #if CONFIG_VAR_TX
   uint8_t *blk_skip[MAX_MB_PLANE];
 #endif
@@ -84,6 +85,7 @@ typedef struct PC_TREE {
   PICK_MODE_CONTEXT horizontal4[4];
   PICK_MODE_CONTEXT vertical4[4];
 #endif
+  // TODO(jingning): remove leaf_split[] when cb4x4 experiment flag is removed.
   union {
     struct PC_TREE *split[4];
     PICK_MODE_CONTEXT *leaf_split[4];
diff --git a/third_party/aom/av1/encoder/dct.c b/third_party/aom/av1/encoder/dct.c
index 850b84ca9..a04d46b72 100644
--- a/third_party/aom/av1/encoder/dct.c
+++ b/third_party/aom/av1/encoder/dct.c
@@ -21,7 +21,8 @@
 #include "av1/common/av1_fwd_txfm1d.h"
 #include "av1/common/av1_fwd_txfm1d_cfg.h"
 #include "av1/common/idct.h"
-#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8
+#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
+    CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64
 #include "av1/common/daala_tx.h"
 #endif
 
@@ -42,18 +43,6 @@ static INLINE void range_check(const tran_low_t *input, const int size,
 #endif
 }
 
-#if CONFIG_DAALA_DCT4
-static void fdct4(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[4];
-  od_coeff y[4];
-  for (i = 0; i < 4; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdct4(y, x, 1);
-  for (i = 0; i < 4; i++) output[i] = (tran_low_t)y[i];
-}
-
-#else
-
 static void fdct4(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[4];
@@ -89,19 +78,6 @@ static void fdct4(const tran_low_t *input, tran_low_t *output) {
 
   range_check(output, 4, 16);
 }
-#endif
-
-#if CONFIG_DAALA_DCT8
-static void fdct8(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[8];
-  od_coeff y[8];
-  for (i = 0; i < 8; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdct8(y, x, 1);
-  for (i = 0; i < 8; i++) output[i] = (tran_low_t)y[i];
-}
-
-#else
 
 static void fdct8(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
@@ -180,7 +156,6 @@ static void fdct8(const tran_low_t *input, tran_low_t *output) {
 
   range_check(output, 8, 16);
 }
-#endif
 
 static void fdct16(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
@@ -755,7 +730,6 @@ static void fdct32(const tran_low_t *input, tran_low_t *output) {
 }
 
 #ifndef AV1_DCT_GTEST
-
 static void fadst4(const tran_low_t *input, tran_low_t *output) {
   tran_high_t x0, x1, x2, x3;
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
@@ -796,18 +770,6 @@ static void fadst4(const tran_low_t *input, tran_low_t *output) {
   output[3] = (tran_low_t)fdct_round_shift(s3);
 }
 
-#if CONFIG_DAALA_DCT8
-static void fadst8(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  od_coeff x[8];
-  od_coeff y[8];
-  for (i = 0; i < 8; i++) x[i] = (od_coeff)input[i];
-  od_bin_fdst8(y, x, 1);
-  for (i = 0; i < 8; i++) output[i] = (tran_low_t)y[i];
-}
-
-#else
-
 static void fadst8(const tran_low_t *input, tran_low_t *output) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
 
@@ -878,7 +840,6 @@ static void fadst8(const tran_low_t *input, tran_low_t *output) {
   output[6] = (tran_low_t)x5;
   output[7] = (tran_low_t)-x1;
 }
-#endif
 
 static void fadst16(const tran_low_t *input, tran_low_t *output) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
@@ -1066,9 +1027,27 @@ static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
 #if CONFIG_MRC_TX
 static void get_masked_residual32(const int16_t **input, int *input_stride,
                                   const uint8_t *pred, int pred_stride,
-                                  int16_t *masked_input) {
-  int mrc_mask[32 * 32];
-  get_mrc_mask(pred, pred_stride, mrc_mask, 32, 32, 32);
+                                  int16_t *masked_input,
+                                  TxfmParam *txfm_param) {
+  int n_masked_vals = 0;
+  uint8_t *mrc_mask;
+  uint8_t mask_tmp[32 * 32];
+  if ((txfm_param->is_inter && SIGNAL_MRC_MASK_INTER) ||
+      (!txfm_param->is_inter && SIGNAL_MRC_MASK_INTRA)) {
+    mrc_mask = txfm_param->mask;
+    n_masked_vals = get_mrc_diff_mask(*input, *input_stride, mrc_mask, 32, 32,
+                                      32, txfm_param->is_inter);
+  } else {
+    mrc_mask = mask_tmp;
+    n_masked_vals = get_mrc_pred_mask(pred, pred_stride, mrc_mask, 32, 32, 32,
+                                      txfm_param->is_inter);
+  }
+
+  // Do not use MRC_DCT if mask is invalid. DCT_DCT will be used instead.
+  if (!is_valid_mrc_mask(n_masked_vals, 32, 32)) {
+    *txfm_param->valid_mask = 0;
+    return;
+  }
   int32_t sum = 0;
   int16_t avg;
   // Get the masked average of the prediction
@@ -1077,7 +1056,7 @@ static void get_masked_residual32(const int16_t **input, int *input_stride,
       sum += mrc_mask[i * 32 + j] * (*input)[i * (*input_stride) + j];
     }
   }
-  avg = ROUND_POWER_OF_TWO_SIGNED(sum, 10);
+  avg = sum / n_masked_vals;
   // Replace all of the unmasked pixels in the prediction with the average
   // of the masked pixels
   for (int i = 0; i < 32; ++i) {
@@ -1087,16 +1066,24 @@ static void get_masked_residual32(const int16_t **input, int *input_stride,
   }
   *input = masked_input;
   *input_stride = 32;
+  *txfm_param->valid_mask = 1;
 }
 #endif  // CONFIG_MRC_TX
 
-#if CONFIG_LGT
+#if CONFIG_LGT || CONFIG_LGT_FROM_PRED
 static void flgt4(const tran_low_t *input, tran_low_t *output,
                   const tran_high_t *lgtmtx) {
-  if (!(input[0] | input[1] | input[2] | input[3])) {
-    output[0] = output[1] = output[2] = output[3] = 0;
+  if (!lgtmtx) assert(0);
+#if CONFIG_LGT_FROM_PRED
+  // For DCT/ADST, use butterfly implementations
+  if (lgtmtx[0] == DCT4) {
+    fdct4(input, output);
+    return;
+  } else if (lgtmtx[0] == ADST4) {
+    fadst4(input, output);
     return;
   }
+#endif  // CONFIG_LGT_FROM_PRED
 
   // evaluate s[j] = sum of all lgtmtx[j][i]*input[i] over i=1,...,4
   tran_high_t s[4] = { 0 };
@@ -1108,6 +1095,18 @@ static void flgt4(const tran_low_t *input, tran_low_t *output,
 
 static void flgt8(const tran_low_t *input, tran_low_t *output,
                   const tran_high_t *lgtmtx) {
+  if (!lgtmtx) assert(0);
+#if CONFIG_LGT_FROM_PRED
+  // For DCT/ADST, use butterfly implementations
+  if (lgtmtx[0] == DCT8) {
+    fdct8(input, output);
+    return;
+  } else if (lgtmtx[0] == ADST8) {
+    fadst8(input, output);
+    return;
+  }
+#endif  // CONFIG_LGT_FROM_PRED
+
   // evaluate s[j] = sum of all lgtmtx[j][i]*input[i] over i=1,...,8
   tran_high_t s[8] = { 0 };
   for (int i = 0; i < 8; ++i)
@@ -1115,30 +1114,140 @@ static void flgt8(const tran_low_t *input, tran_low_t *output,
 
   for (int i = 0; i < 8; ++i) output[i] = (tran_low_t)fdct_round_shift(s[i]);
 }
+#endif  // CONFIG_LGT || CONFIG_LGT_FROM_PRED
 
-// The get_fwd_lgt functions return 1 if LGT is chosen to apply, and 0 otherwise
-int get_fwd_lgt4(transform_1d tx_orig, TxfmParam *txfm_param,
-                 const tran_high_t *lgtmtx[], int ntx) {
-  // inter/intra split
-  if (tx_orig == &fadst4) {
-    for (int i = 0; i < ntx; ++i)
-      lgtmtx[i] = txfm_param->is_inter ? &lgt4_170[0][0] : &lgt4_140[0][0];
-    return 1;
+#if CONFIG_LGT_FROM_PRED
+// Forward 1D transform of length 16 or 32 for the LGT-from-prediction path.
+// lgtmtx[0] is compared against DCT16/ADST16/DCT32/ADST32 to pick the kernel.
+static void flgt16up(const tran_low_t *input, tran_low_t *output,
+                     const tran_high_t *lgtmtx) {
+  if (!lgtmtx) assert(0);  // same guard as flgt4() and flgt8()
+  if (lgtmtx[0] == DCT16) {
+    fdct16(input, output);
+  } else if (lgtmtx[0] == ADST16) {
+    fadst16(input, output);
+  } else if (lgtmtx[0] == DCT32) {
+    fdct32(input, output);
+  } else if (lgtmtx[0] == ADST32) {
+    // The ADST32 case is served by the half-right 32-point kernel.
+    fhalfright32(input, output);
+  } else {
+    assert(0);  // no other value of lgtmtx[0] is handled at these lengths
+  }
+}
+
+// Dispatch tables below are indexed by log2(transform length) - 2.
+typedef void (*FlgtFunc)(const tran_low_t *input, tran_low_t *output,
+                         const tran_high_t *lgtmtx);
+static const FlgtFunc flgt_func[4] = { flgt4, flgt8, flgt16up, flgt16up };
+
+typedef void (*GetLgtFunc)(const TxfmParam *txfm_param, int is_col,
+                           const tran_high_t *lgtmtx[], int ntx);
+static const GetLgtFunc get_lgt_func[4] = {
+  get_lgt4_from_pred, get_lgt8_from_pred, get_lgt16up_from_pred,
+  get_lgt16up_from_pred
+};
+
+// Applies the input up-scaling that the av1_fht* functions perform before
+// their first 1D transform; uses multiplies because val may be negative.
+static INLINE tran_low_t fwd_upscale_wrt_txsize(const tran_high_t val,
+                                                const TX_SIZE tx_size) {
+  switch (tx_size) {
+    case TX_4X4: return (tran_low_t)(val * 16);
+    case TX_8X8:
+    case TX_4X16:
+    case TX_16X4:
+    case TX_8X32:
+    case TX_32X8: return (tran_low_t)(val * 4);
+    case TX_4X8:
+    case TX_8X4:
+    case TX_8X16:
+    case TX_16X8: return (tran_low_t)fdct_round_shift(val * 4 * Sqrt2);
+    default: assert(0); break;  // remaining sizes are not handled here
   }
   return 0;
 }
 
-int get_fwd_lgt8(transform_1d tx_orig, TxfmParam *txfm_param,
-                 const tran_high_t *lgtmtx[], int ntx) {
-  // inter/intra split
-  if (tx_orig == &fadst8) {
-    for (int i = 0; i < ntx; ++i)
-      lgtmtx[i] = txfm_param->is_inter ? &lgt8_170[0][0] : &lgt8_150[0][0];
-    return 1;
+// Applies the output down-scaling that the av1_fht* functions perform after
+// their second 1D transform; the amount depends on tx_size.
+static INLINE tran_low_t fwd_downscale_wrt_txsize(const tran_low_t val,
+                                                  const TX_SIZE tx_size) {
+  switch (tx_size) {
+    case TX_4X4: return (val + 1) >> 2;
+    case TX_4X8:
+    case TX_8X4:
+    case TX_8X8:
+    case TX_4X16:
+    case TX_16X4: return (val + (val < 0)) >> 1;
+    case TX_8X16:
+    case TX_16X8: return val;  // passed through unchanged
+    case TX_8X32:
+    case TX_32X8: return ROUND_POWER_OF_TWO_SIGNED(val, 2);
+    default: assert(0); break;  // remaining sizes are not handled here
   }
   return 0;
 }
-#endif  // CONFIG_LGT
+// Forward 2D transform using LGTs chosen by the get_lgt*_from_pred helpers.
+void flgt2d_from_pred_c(const int16_t *input, tran_low_t *output, int stride,
+                        TxfmParam *txfm_param) {
+  const TX_SIZE tx_size = txfm_param->tx_size;
+  const int w = tx_size_wide[tx_size];
+  const int h = tx_size_high[tx_size];
+  const int wlog2 = tx_size_wide_log2[tx_size];
+  const int hlog2 = tx_size_high_log2[tx_size];
+  assert(w <= 8 || h <= 8);  // at least one dimension must be 8 or less
+
+  int i, j;
+  tran_low_t out[256];  // intermediate block; max size: 8x32 and 32x8
+  tran_low_t temp_in[32], temp_out[32];  // one row or one column at a time
+  const tran_high_t *lgtmtx_col[1];
+  const tran_high_t *lgtmtx_row[1];
+  get_lgt_func[hlog2 - 2](txfm_param, 1, lgtmtx_col, w);  // is_col = 1
+  get_lgt_func[wlog2 - 2](txfm_param, 0, lgtmtx_row, h);  // is_col = 0
+
+  // For forward transforms, to be consistent with the av1_fht functions, the
+  // short transform is applied first and the long transform second.
+  if (w < h) {
+    // Row transforms (length w), one per input row
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j)
+        temp_in[j] = fwd_upscale_wrt_txsize(input[i * stride + j], tx_size);
+      flgt_func[wlog2 - 2](temp_in, temp_out, lgtmtx_row[0]);
+      // 2-bit right shift as in fht8x16/fht16x8; out[] is stored transposed
+      for (j = 0; j < w; ++j)
+        out[j * h + i] = (tx_size == TX_16X8 || tx_size == TX_8X16)
+                             ? ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2)
+                             : temp_out[j];
+    }
+    // Column transforms (length h); column i is contiguous in out[]
+    for (i = 0; i < w; ++i) {
+      for (j = 0; j < h; ++j) temp_in[j] = out[j + i * h];
+      flgt_func[hlog2 - 2](temp_in, temp_out, lgtmtx_col[0]);
+      for (j = 0; j < h; ++j)
+        output[j * w + i] = fwd_downscale_wrt_txsize(temp_out[j], tx_size);
+    }
+  } else {
+    // Column transforms (length h), one per input column
+    for (i = 0; i < w; ++i) {
+      for (j = 0; j < h; ++j)
+        temp_in[j] = fwd_upscale_wrt_txsize(input[j * stride + i], tx_size);
+      flgt_func[hlog2 - 2](temp_in, temp_out, lgtmtx_col[0]);
+      // fht8x16 and fht16x8 have right shift of 2 bits here; out[] is row-major
+      for (j = 0; j < h; ++j)
+        out[j * w + i] = (tx_size == TX_16X8 || tx_size == TX_8X16)
+                             ? ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2)
+                             : temp_out[j];
+    }
+    // Row transforms (length w); row i is contiguous in out[]
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) temp_in[j] = out[j + i * w];
+      flgt_func[wlog2 - 2](temp_in, temp_out, lgtmtx_row[0]);
+      for (j = 0; j < w; ++j)
+        output[j + i * w] = fwd_downscale_wrt_txsize(temp_out[j], tx_size);
+    }
+  }
+}
+#endif  // CONFIG_LGT_FROM_PRED
 
 #if CONFIG_EXT_TX
 // TODO(sarahparker) these functions will be removed once the highbitdepth
@@ -1148,34 +1257,29 @@ int get_fwd_lgt8(transform_1d tx_orig, TxfmParam *txfm_param,
 static void fidtx4(const tran_low_t *input, tran_low_t *output) {
   int i;
   for (i = 0; i < 4; ++i) {
-#if CONFIG_DAALA_DCT4
-    output[i] = input[i];
-#else
     output[i] = (tran_low_t)fdct_round_shift(input[i] * Sqrt2);
-#endif
   }
 }
 
 static void fidtx8(const tran_low_t *input, tran_low_t *output) {
   int i;
   for (i = 0; i < 8; ++i) {
-#if CONFIG_DAALA_DCT8
-    output[i] = input[i];
-#else
     output[i] = input[i] * 2;
-#endif
   }
 }
 
 static void fidtx16(const tran_low_t *input, tran_low_t *output) {
   int i;
-  for (i = 0; i < 16; ++i)
+  for (i = 0; i < 16; ++i) {
     output[i] = (tran_low_t)fdct_round_shift(input[i] * 2 * Sqrt2);
+  }
 }
 
 static void fidtx32(const tran_low_t *input, tran_low_t *output) {
   int i;
-  for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
+  for (i = 0; i < 32; ++i) {
+    output[i] = input[i] * 4;
+  }
 }
 
 static void copy_block(const int16_t *src, int src_stride, int l, int w,
@@ -1238,7 +1342,7 @@ static void copy_fliplrud(const int16_t *src, int src_stride, int l, int w,
 }
 
 static void maybe_flip_input(const int16_t **src, int *src_stride, int l, int w,
-                             int16_t *buff, int tx_type) {
+                             int16_t *buff, TX_TYPE tx_type) {
   switch (tx_type) {
 #if CONFIG_MRC_TX
     case MRC_DCT:
@@ -1278,7 +1382,7 @@ static void maybe_flip_input(const int16_t **src, int *src_stride, int l, int w,
 
 void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
                   TxfmParam *txfm_param) {
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif
@@ -1293,6 +1397,26 @@ void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
 #endif
   {
     static const transform_2d FHT[] = {
+#if CONFIG_DAALA_DCT4
+      { daala_fdct4, daala_fdct4 },  // DCT_DCT
+      { daala_fdst4, daala_fdct4 },  // ADST_DCT
+      { daala_fdct4, daala_fdst4 },  // DCT_ADST
+      { daala_fdst4, daala_fdst4 },  // ADST_ADST
+#if CONFIG_EXT_TX
+      { daala_fdst4, daala_fdct4 },  // FLIPADST_DCT
+      { daala_fdct4, daala_fdst4 },  // DCT_FLIPADST
+      { daala_fdst4, daala_fdst4 },  // FLIPADST_FLIPADST
+      { daala_fdst4, daala_fdst4 },  // ADST_FLIPADST
+      { daala_fdst4, daala_fdst4 },  // FLIPADST_ADST
+      { daala_idtx4, daala_idtx4 },  // IDTX
+      { daala_fdct4, daala_idtx4 },  // V_DCT
+      { daala_idtx4, daala_fdct4 },  // H_DCT
+      { daala_fdst4, daala_idtx4 },  // V_ADST
+      { daala_idtx4, daala_fdst4 },  // H_ADST
+      { daala_fdst4, daala_idtx4 },  // V_FLIPADST
+      { daala_idtx4, daala_fdst4 },  // H_FLIPADST
+#endif
+#else
       { fdct4, fdct4 },    // DCT_DCT
       { fadst4, fdct4 },   // ADST_DCT
       { fdct4, fadst4 },   // DCT_ADST
@@ -1311,6 +1435,7 @@ void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
       { fadst4, fidtx4 },  // V_FLIPADST
       { fidtx4, fadst4 },  // H_FLIPADST
 #endif
+#endif
     };
     const transform_2d ht = FHT[tx_type];
     tran_low_t out[4 * 4];
@@ -1325,10 +1450,10 @@ void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
 #if CONFIG_LGT
     // Choose LGT adaptive to the prediction. We may apply different LGTs for
     // different rows/columns, indicated by the pointers to 2D arrays
-    const tran_high_t *lgtmtx_col[4];
-    const tran_high_t *lgtmtx_row[4];
-    int use_lgt_col = get_fwd_lgt4(ht.cols, txfm_param, lgtmtx_col, 4);
-    int use_lgt_row = get_fwd_lgt4(ht.rows, txfm_param, lgtmtx_row, 4);
+    const tran_high_t *lgtmtx_col[1];
+    const tran_high_t *lgtmtx_row[1];
+    int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
+    int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
 #endif
 
     // Columns
@@ -1340,7 +1465,7 @@ void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
 #endif
 #if CONFIG_LGT
       if (use_lgt_col)
-        flgt4(temp_in, temp_out, lgtmtx_col[i]);
+        flgt4(temp_in, temp_out, lgtmtx_col[0]);
       else
 #endif
         ht.cols(temp_in, temp_out);
@@ -1352,7 +1477,7 @@ void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
       for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4];
 #if CONFIG_LGT
       if (use_lgt_row)
-        flgt4(temp_in, temp_out, lgtmtx_row[i]);
+        flgt4(temp_in, temp_out, lgtmtx_row[0]);
       else
 #endif
         ht.rows(temp_in, temp_out);
@@ -1369,7 +1494,7 @@ void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
 
 void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
                   TxfmParam *txfm_param) {
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif  // CONFIG_MRC_TX
@@ -1408,10 +1533,10 @@ void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
 #endif
 
 #if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[4];
-  const tran_high_t *lgtmtx_row[8];
-  int use_lgt_col = get_fwd_lgt8(ht.cols, txfm_param, lgtmtx_col, 4);
-  int use_lgt_row = get_fwd_lgt4(ht.rows, txfm_param, lgtmtx_row, 8);
+  const tran_high_t *lgtmtx_col[1];
+  const tran_high_t *lgtmtx_row[1];
+  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
+  int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
 #endif
 
   // Rows
@@ -1421,7 +1546,7 @@ void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
           (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
 #if CONFIG_LGT
     if (use_lgt_row)
-      flgt4(temp_in, temp_out, lgtmtx_row[i]);
+      flgt4(temp_in, temp_out, lgtmtx_row[0]);
     else
 #endif
       ht.rows(temp_in, temp_out);
@@ -1433,7 +1558,7 @@ void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
 #if CONFIG_LGT
     if (use_lgt_col)
-      flgt8(temp_in, temp_out, lgtmtx_col[i]);
+      flgt8(temp_in, temp_out, lgtmtx_col[0]);
     else
 #endif
       ht.cols(temp_in, temp_out);
@@ -1445,7 +1570,7 @@ void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
 
 void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
                   TxfmParam *txfm_param) {
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif  // CONFIG_MRC_TX
@@ -1484,10 +1609,10 @@ void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
 #endif
 
 #if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[8];
-  const tran_high_t *lgtmtx_row[4];
-  int use_lgt_col = get_fwd_lgt4(ht.cols, txfm_param, lgtmtx_col, 8);
-  int use_lgt_row = get_fwd_lgt8(ht.rows, txfm_param, lgtmtx_row, 4);
+  const tran_high_t *lgtmtx_col[1];
+  const tran_high_t *lgtmtx_row[1];
+  int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
+  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
 #endif
 
   // Columns
@@ -1497,7 +1622,7 @@ void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
           (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
 #if CONFIG_LGT
     if (use_lgt_col)
-      flgt4(temp_in, temp_out, lgtmtx_col[i]);
+      flgt4(temp_in, temp_out, lgtmtx_col[0]);
     else
 #endif
       ht.cols(temp_in, temp_out);
@@ -1509,7 +1634,7 @@ void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
 #if CONFIG_LGT
     if (use_lgt_row)
-      flgt8(temp_in, temp_out, lgtmtx_row[i]);
+      flgt8(temp_in, temp_out, lgtmtx_row[0]);
     else
 #endif
       ht.rows(temp_in, temp_out);
@@ -1521,7 +1646,7 @@ void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
 
 void av1_fht4x16_c(const int16_t *input, tran_low_t *output, int stride,
                    TxfmParam *txfm_param) {
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif  // CONFIG_MRC_TX
@@ -1560,8 +1685,8 @@ void av1_fht4x16_c(const int16_t *input, tran_low_t *output, int stride,
 #endif
 
 #if CONFIG_LGT
-  const tran_high_t *lgtmtx_row[16];
-  int use_lgt_row = get_fwd_lgt4(ht.rows, txfm_param, lgtmtx_row, 16);
+  const tran_high_t *lgtmtx_row[1];
+  int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
 #endif
 
   // Rows
@@ -1569,7 +1694,7 @@ void av1_fht4x16_c(const int16_t *input, tran_low_t *output, int stride,
     for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4;
 #if CONFIG_LGT
     if (use_lgt_row)
-      flgt4(temp_in, temp_out, lgtmtx_row[i]);
+      flgt4(temp_in, temp_out, lgtmtx_row[0]);
     else
 #endif
       ht.rows(temp_in, temp_out);
@@ -1588,7 +1713,7 @@ void av1_fht4x16_c(const int16_t *input, tran_low_t *output, int stride,
 
 void av1_fht16x4_c(const int16_t *input, tran_low_t *output, int stride,
                    TxfmParam *txfm_param) {
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif  // CONFIG_MRC_TX
@@ -1627,8 +1752,8 @@ void av1_fht16x4_c(const int16_t *input, tran_low_t *output, int stride,
 #endif
 
 #if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[16];
-  int use_lgt_col = get_fwd_lgt4(ht.cols, txfm_param, lgtmtx_col, 16);
+  const tran_high_t *lgtmtx_col[1];
+  int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
 #endif
 
   // Columns
@@ -1636,7 +1761,7 @@ void av1_fht16x4_c(const int16_t *input, tran_low_t *output, int stride,
     for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4;
 #if CONFIG_LGT
     if (use_lgt_col)
-      flgt4(temp_in, temp_out, lgtmtx_col[i]);
+      flgt4(temp_in, temp_out, lgtmtx_col[0]);
     else
 #endif
       ht.cols(temp_in, temp_out);
@@ -1655,7 +1780,7 @@ void av1_fht16x4_c(const int16_t *input, tran_low_t *output, int stride,
 
 void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
                    TxfmParam *txfm_param) {
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif  // CONFIG_MRC_TX
@@ -1694,8 +1819,8 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
 #endif
 
 #if CONFIG_LGT
-  const tran_high_t *lgtmtx_row[16];
-  int use_lgt_row = get_fwd_lgt8(ht.rows, txfm_param, lgtmtx_row, 16);
+  const tran_high_t *lgtmtx_row[1];
+  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
 #endif
 
   // Rows
@@ -1705,7 +1830,7 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
           (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
 #if CONFIG_LGT
     if (use_lgt_row)
-      flgt8(temp_in, temp_out, lgtmtx_row[i]);
+      flgt8(temp_in, temp_out, lgtmtx_row[0]);
     else
 #endif
       ht.rows(temp_in, temp_out);
@@ -1724,7 +1849,7 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
 
 void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
                    TxfmParam *txfm_param) {
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif  // CONFIG_MRC_TX
@@ -1763,8 +1888,8 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
 #endif
 
 #if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[16];
-  int use_lgt_col = get_fwd_lgt8(ht.cols, txfm_param, lgtmtx_col, 16);
+  const tran_high_t *lgtmtx_col[1];
+  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
 #endif
 
   // Columns
@@ -1774,7 +1899,7 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
           (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
 #if CONFIG_LGT
     if (use_lgt_col)
-      flgt8(temp_in, temp_out, lgtmtx_col[i]);
+      flgt8(temp_in, temp_out, lgtmtx_col[0]);
     else
 #endif
       ht.cols(temp_in, temp_out);
@@ -1793,7 +1918,7 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
 
 void av1_fht8x32_c(const int16_t *input, tran_low_t *output, int stride,
                    TxfmParam *txfm_param) {
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif  // CONFIG_MRC_TX
@@ -1832,8 +1957,8 @@ void av1_fht8x32_c(const int16_t *input, tran_low_t *output, int stride,
 #endif
 
 #if CONFIG_LGT
-  const tran_high_t *lgtmtx_row[32];
-  int use_lgt_row = get_fwd_lgt8(ht.rows, txfm_param, lgtmtx_row, 32);
+  const tran_high_t *lgtmtx_row[1];
+  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
 #endif
 
   // Rows
@@ -1841,7 +1966,7 @@ void av1_fht8x32_c(const int16_t *input, tran_low_t *output, int stride,
     for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4;
 #if CONFIG_LGT
     if (use_lgt_row)
-      flgt8(temp_in, temp_out, lgtmtx_row[i]);
+      flgt8(temp_in, temp_out, lgtmtx_row[0]);
     else
 #endif
       ht.rows(temp_in, temp_out);
@@ -1855,12 +1980,12 @@ void av1_fht8x32_c(const int16_t *input, tran_low_t *output, int stride,
     for (j = 0; j < n4; ++j)
       output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
   }
-  // Note: overall scale factor of transform is 4 times unitary
+  // Note: overall scale factor of transform is 8 times unitary
 }
 
 void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,
                    TxfmParam *txfm_param) {
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif  // CONFIG_MRC_TX
@@ -1899,8 +2024,8 @@ void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,
 #endif
 
 #if CONFIG_LGT
-  const tran_high_t *lgtmtx_col[32];
-  int use_lgt_col = get_fwd_lgt8(ht.cols, txfm_param, lgtmtx_col, 32);
+  const tran_high_t *lgtmtx_col[1];
+  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
 #endif
 
   // Columns
@@ -1908,7 +2033,7 @@ void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,
     for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4;
 #if CONFIG_LGT
     if (use_lgt_col)
-      flgt8(temp_in, temp_out, lgtmtx_col[i]);
+      flgt8(temp_in, temp_out, lgtmtx_col[0]);
     else
 #endif
       ht.cols(temp_in, temp_out);
@@ -1922,12 +2047,12 @@ void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,
     for (j = 0; j < n4; ++j)
       output[j + i * n4] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
   }
-  // Note: overall scale factor of transform is 4 times unitary
+  // Note: overall scale factor of transform is 8 times unitary
 }
 
 void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
                     TxfmParam *txfm_param) {
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif  // CONFIG_MRC_TX
@@ -1986,7 +2111,7 @@ void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
 
 void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
                     TxfmParam *txfm_param) {
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif  // CONFIG_MRC_TX
@@ -2043,134 +2168,9 @@ void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
   // Note: overall scale factor of transform is 4 times unitary
 }
 
-void av1_fdct8x8_quant_c(const int16_t *input, int stride,
-                         tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         int skip_block, const int16_t *zbin_ptr,
-                         const int16_t *round_ptr, const int16_t *quant_ptr,
-                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan,
-                         const int16_t *iscan
-#if CONFIG_AOM_QM
-                         ,
-                         const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
-#endif
-                         ) {
-  int eob = -1;
-
-  int i, j;
-  tran_low_t intermediate[64];
-
-  // Transform columns
-  {
-    tran_low_t *output = intermediate;
-    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-    tran_high_t t0, t1, t2, t3;                  // needs32
-    tran_high_t x0, x1, x2, x3;                  // canbe16
-
-    for (i = 0; i < 8; i++) {
-      // stage 1
-      s0 = (input[0 * stride] + input[7 * stride]) * 4;
-      s1 = (input[1 * stride] + input[6 * stride]) * 4;
-      s2 = (input[2 * stride] + input[5 * stride]) * 4;
-      s3 = (input[3 * stride] + input[4 * stride]) * 4;
-      s4 = (input[3 * stride] - input[4 * stride]) * 4;
-      s5 = (input[2 * stride] - input[5 * stride]) * 4;
-      s6 = (input[1 * stride] - input[6 * stride]) * 4;
-      s7 = (input[0 * stride] - input[7 * stride]) * 4;
-
-      // fdct4(step, step);
-      x0 = s0 + s3;
-      x1 = s1 + s2;
-      x2 = s1 - s2;
-      x3 = s0 - s3;
-      t0 = (x0 + x1) * cospi_16_64;
-      t1 = (x0 - x1) * cospi_16_64;
-      t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
-      t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
-      output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
-      output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
-      output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
-      output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
-
-      // stage 2
-      t0 = (s6 - s5) * cospi_16_64;
-      t1 = (s6 + s5) * cospi_16_64;
-      t2 = fdct_round_shift(t0);
-      t3 = fdct_round_shift(t1);
-
-      // stage 3
-      x0 = s4 + t2;
-      x1 = s4 - t2;
-      x2 = s7 - t3;
-      x3 = s7 + t3;
-
-      // stage 4
-      t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
-      t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
-      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-      t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
-      output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
-      output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
-      output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
-      output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
-      input++;
-      output++;
-    }
-  }
-
-  // Rows
-  for (i = 0; i < 8; ++i) {
-    fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
-    for (j = 0; j < 8; ++j) coeff_ptr[j + i * 8] /= 2;
-  }
-
-  // TODO(jingning) Decide the need of these arguments after the
-  // quantization process is completed.
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-#if CONFIG_AOM_QM
-      const qm_val_t wt = qm_ptr[rc];
-      const qm_val_t iwt = iqm_ptr[rc];
-      const int dequant =
-          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
-          AOM_QM_BITS;
-#endif
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-      int64_t tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-      int tmp32;
-#if CONFIG_AOM_QM
-      tmp32 = (int)((tmp * quant_ptr[rc != 0] * wt) >> (16 + AOM_QM_BITS));
-      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
-#else
-      tmp32 = (int)((tmp * quant_ptr[rc != 0]) >> 16);
-      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-#endif
-
-      if (tmp32) eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
 void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
                   TxfmParam *txfm_param) {
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif  // CONFIG_MRC_TX
@@ -2185,6 +2185,26 @@ void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
 #endif
   {
     static const transform_2d FHT[] = {
+#if CONFIG_DAALA_DCT8
+      { daala_fdct8, daala_fdct8 },  // DCT_DCT
+      { daala_fdst8, daala_fdct8 },  // ADST_DCT
+      { daala_fdct8, daala_fdst8 },  // DCT_ADST
+      { daala_fdst8, daala_fdst8 },  // ADST_ADST
+#if CONFIG_EXT_TX
+      { daala_fdst8, daala_fdct8 },  // FLIPADST_DCT
+      { daala_fdct8, daala_fdst8 },  // DCT_FLIPADST
+      { daala_fdst8, daala_fdst8 },  // FLIPADST_FLIPADST
+      { daala_fdst8, daala_fdst8 },  // ADST_FLIPADST
+      { daala_fdst8, daala_fdst8 },  // FLIPADST_ADST
+      { daala_idtx8, daala_idtx8 },  // IDTX
+      { daala_fdct8, daala_idtx8 },  // V_DCT
+      { daala_idtx8, daala_fdct8 },  // H_DCT
+      { daala_fdst8, daala_idtx8 },  // V_ADST
+      { daala_idtx8, daala_fdst8 },  // H_ADST
+      { daala_fdst8, daala_idtx8 },  // V_FLIPADST
+      { daala_idtx8, daala_fdst8 },  // H_FLIPADST
+#endif
+#else
       { fdct8, fdct8 },    // DCT_DCT
       { fadst8, fdct8 },   // ADST_DCT
       { fdct8, fadst8 },   // DCT_ADST
@@ -2203,6 +2223,7 @@ void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
       { fadst8, fidtx8 },  // V_FLIPADST
       { fidtx8, fadst8 },  // H_FLIPADST
 #endif
+#endif
     };
     const transform_2d ht = FHT[tx_type];
     tran_low_t out[64];
@@ -2215,10 +2236,10 @@ void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
 #endif
 
 #if CONFIG_LGT
-    const tran_high_t *lgtmtx_col[8];
-    const tran_high_t *lgtmtx_row[8];
-    int use_lgt_col = get_fwd_lgt8(ht.cols, txfm_param, lgtmtx_col, 8);
-    int use_lgt_row = get_fwd_lgt8(ht.rows, txfm_param, lgtmtx_row, 8);
+    const tran_high_t *lgtmtx_col[1];
+    const tran_high_t *lgtmtx_row[1];
+    int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
+    int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
 #endif
 
     // Columns
@@ -2230,7 +2251,7 @@ void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
 #endif
 #if CONFIG_LGT
       if (use_lgt_col)
-        flgt8(temp_in, temp_out, lgtmtx_col[i]);
+        flgt8(temp_in, temp_out, lgtmtx_col[0]);
       else
 #endif
         ht.cols(temp_in, temp_out);
@@ -2242,7 +2263,7 @@ void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
       for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8];
 #if CONFIG_LGT
       if (use_lgt_row)
-        flgt8(temp_in, temp_out, lgtmtx_row[i]);
+        flgt8(temp_in, temp_out, lgtmtx_row[0]);
       else
 #endif
         ht.rows(temp_in, temp_out);
@@ -2315,7 +2336,7 @@ void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
 
 void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
                     TxfmParam *txfm_param) {
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif  // CONFIG_MRC_TX
@@ -2323,6 +2344,26 @@ void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d FHT[] = {
+#if CONFIG_DAALA_DCT16
+    { daala_fdct16, daala_fdct16 },  // DCT_DCT
+    { daala_fdst16, daala_fdct16 },  // ADST_DCT
+    { daala_fdct16, daala_fdst16 },  // DCT_ADST
+    { daala_fdst16, daala_fdst16 },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { daala_fdst16, daala_fdct16 },  // FLIPADST_DCT
+    { daala_fdct16, daala_fdst16 },  // DCT_FLIPADST
+    { daala_fdst16, daala_fdst16 },  // FLIPADST_FLIPADST
+    { daala_fdst16, daala_fdst16 },  // ADST_FLIPADST
+    { daala_fdst16, daala_fdst16 },  // FLIPADST_ADST
+    { daala_idtx16, daala_idtx16 },  // IDTX
+    { daala_fdct16, daala_idtx16 },  // V_DCT
+    { daala_idtx16, daala_fdct16 },  // H_DCT
+    { daala_fdst16, daala_idtx16 },  // V_ADST
+    { daala_idtx16, daala_fdst16 },  // H_ADST
+    { daala_fdst16, daala_idtx16 },  // V_FLIPADST
+    { daala_idtx16, daala_fdst16 },  // H_FLIPADST
+#endif
+#else
     { fdct16, fdct16 },    // DCT_DCT
     { fadst16, fdct16 },   // ADST_DCT
     { fdct16, fadst16 },   // DCT_ADST
@@ -2341,6 +2382,7 @@ void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
     { fadst16, fidtx16 },  // V_FLIPADST
     { fidtx16, fadst16 },  // H_FLIPADST
 #endif
+#endif
   };
   const transform_2d ht = FHT[tx_type];
   tran_low_t out[256];
@@ -2354,17 +2396,34 @@ void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
 
   // Columns
   for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4;
+    for (j = 0; j < 16; ++j) {
+#if CONFIG_DAALA_DCT16
+      temp_in[j] = input[j * stride + i] * 16;
+#else
+      temp_in[j] = input[j * stride + i] * 4;
+#endif
+    }
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
+    for (j = 0; j < 16; ++j) {
+#if CONFIG_DAALA_DCT16
+      out[j * 16 + i] = temp_out[j];
+#else
       out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+#endif
+    }
   }
 
   // Rows
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j];
+    for (j = 0; j < 16; ++j) {
+#if CONFIG_DAALA_DCT16
+      output[j + i * 16] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+#else
+      output[j + i * 16] = temp_out[j];
+#endif
+    }
   }
 }
 
@@ -2375,12 +2434,32 @@ void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
 
 void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
                     TxfmParam *txfm_param) {
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_DCT_ONLY
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d FHT[] = {
-    { fdct32, fdct32 },  // DCT_DCT
+#if CONFIG_DAALA_DCT32
+    { daala_fdct32, daala_fdct32 },  // DCT_DCT
+#if CONFIG_EXT_TX
+    { daala_fdst32, daala_fdct32 },  // ADST_DCT
+    { daala_fdct32, daala_fdst32 },  // DCT_ADST
+    { daala_fdst32, daala_fdst32 },  // ADST_ADST
+    { daala_fdst32, daala_fdct32 },  // FLIPADST_DCT
+    { daala_fdct32, daala_fdst32 },  // DCT_FLIPADST
+    { daala_fdst32, daala_fdst32 },  // FLIPADST_FLIPADST
+    { daala_fdst32, daala_fdst32 },  // ADST_FLIPADST
+    { daala_fdst32, daala_fdst32 },  // FLIPADST_ADST
+    { daala_idtx32, daala_idtx32 },  // IDTX
+    { daala_fdct32, daala_idtx32 },  // V_DCT
+    { daala_idtx32, daala_fdct32 },  // H_DCT
+    { daala_fdst32, daala_idtx32 },  // V_ADST
+    { daala_idtx32, daala_fdst32 },  // H_ADST
+    { daala_fdst32, daala_idtx32 },  // V_FLIPADST
+    { daala_idtx32, daala_fdst32 },  // H_FLIPADST
+#endif
+#else
+    { fdct32, fdct32 },              // DCT_DCT
 #if CONFIG_EXT_TX
     { fhalfright32, fdct32 },        // ADST_DCT
     { fdct32, fhalfright32 },        // DCT_ADST
@@ -2398,6 +2477,7 @@ void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
     { fhalfright32, fidtx32 },       // V_FLIPADST
     { fidtx32, fhalfright32 },       // H_FLIPADST
 #endif
+#endif
 #if CONFIG_MRC_TX
     { fdct32, fdct32 },  // MRC_TX
 #endif                   // CONFIG_MRC_TX
@@ -2416,27 +2496,41 @@ void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
   if (tx_type == MRC_DCT) {
     int16_t masked_input[32 * 32];
     get_masked_residual32(&input, &stride, txfm_param->dst, txfm_param->stride,
-                          masked_input);
+                          masked_input, txfm_param);
   }
 #endif  // CONFIG_MRC_TX
 
   // Columns
   for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
+    for (j = 0; j < 32; ++j) {
+#if CONFIG_DAALA_DCT32
+      temp_in[j] = input[j * stride + i] * 16;
+#else
+      temp_in[j] = input[j * stride + i] * 4;
+#endif
+    }
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < 32; ++j)
+    for (j = 0; j < 32; ++j) {
+#if CONFIG_DAALA_DCT32
+      out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#else
       out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
+#endif
+    }
   }
 
   // Rows
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) output[j + i * 32] = temp_out[j];
+    for (j = 0; j < 32; ++j) {
+      output[j + i * 32] = temp_out[j];
+    }
   }
 }
 
 #if CONFIG_TX64X64
+#if !CONFIG_DAALA_DCT64
 #if CONFIG_EXT_TX
 static void fidtx64(const tran_low_t *input, tran_low_t *output) {
   int i;
@@ -2475,10 +2569,11 @@ static void fdct64_row(const tran_low_t *input, tran_low_t *output) {
   av1_fdct64_new(in, out, fwd_cos_bit_row_dct_64, fwd_stage_range_row_dct_64);
   for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
 }
+#endif
 
 void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
                     TxfmParam *txfm_param) {
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif  // CONFIG_MRC_TX
@@ -2486,7 +2581,27 @@ void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d FHT[] = {
-    { fdct64_col, fdct64_row },  // DCT_DCT
+#if CONFIG_DAALA_DCT64
+    { daala_fdct64, daala_fdct64 },  // DCT_DCT
+#if CONFIG_EXT_TX
+    { daala_fdst64, daala_fdct64 },  // ADST_DCT
+    { daala_fdct64, daala_fdst64 },  // DCT_ADST
+    { daala_fdst64, daala_fdst64 },  // ADST_ADST
+    { daala_fdst64, daala_fdct64 },  // FLIPADST_DCT
+    { daala_fdct64, daala_fdst64 },  // DCT_FLIPADST
+    { daala_fdst64, daala_fdst64 },  // FLIPADST_FLIPADST
+    { daala_fdst64, daala_fdst64 },  // ADST_FLIPADST
+    { daala_fdst64, daala_fdst64 },  // FLIPADST_ADST
+    { daala_idtx64, daala_idtx64 },  // IDTX
+    { daala_fdct64, daala_idtx64 },  // V_DCT
+    { daala_idtx64, daala_fdct64 },  // H_DCT
+    { daala_fdst64, daala_idtx64 },  // V_ADST
+    { daala_idtx64, daala_fdst64 },  // H_ADST
+    { daala_fdst64, daala_idtx64 },  // V_FLIPADST
+    { daala_idtx64, daala_fdst64 },  // H_FLIPADST
+#endif                               // CONFIG_EXT_TX
+#else
+    { fdct64_col, fdct64_row },      // DCT_DCT
 #if CONFIG_EXT_TX
     { fhalfright64, fdct64_row },    // ADST_DCT
     { fdct64_col, fhalfright64 },    // DCT_ADST
@@ -2503,7 +2618,8 @@ void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
     { fidtx64, fhalfright64 },       // H_ADST
     { fhalfright64, fidtx64 },       // V_FLIPADST
     { fidtx64, fhalfright64 },       // H_FLIPADST
-#endif
+#endif  // CONFIG_EXT_TX
+#endif  // CONFIG_DAALA_DCT64
   };
   const transform_2d ht = FHT[tx_type];
   tran_low_t out[4096];
@@ -2516,10 +2632,18 @@ void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
 
   // Columns
   for (i = 0; i < 64; ++i) {
+#if CONFIG_DAALA_DCT64
+    for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i] * 16;
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 64; ++j)
+      out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 3;
+
+#else
     for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 64; ++j)
       out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+#endif
   }
 
   // Rows
@@ -2527,8 +2651,129 @@ void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
     for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64];
     ht.rows(temp_in, temp_out);
     for (j = 0; j < 64; ++j)
+#if CONFIG_DAALA_DCT64
+      output[j + i * 64] = temp_out[j];
+#else
       output[j + i * 64] =
           (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+#endif
+  }
+}
+
+void av1_fht64x32_c(const int16_t *input, tran_low_t *output, int stride,
+                    TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif  // CONFIG_MRC_TX
+#if CONFIG_DCT_ONLY
+  assert(tx_type == DCT_DCT);
+#endif
+  static const transform_2d FHT[] = {
+    { fdct32, fdct64_row },  // DCT_DCT
+#if CONFIG_EXT_TX
+    { fhalfright32, fdct64_row },    // ADST_DCT
+    { fdct32, fhalfright64 },        // DCT_ADST
+    { fhalfright32, fhalfright64 },  // ADST_ADST
+    { fhalfright32, fdct64_row },    // FLIPADST_DCT
+    { fdct32, fhalfright64 },        // DCT_FLIPADST
+    { fhalfright32, fhalfright64 },  // FLIPADST_FLIPADST
+    { fhalfright32, fhalfright64 },  // ADST_FLIPADST
+    { fhalfright32, fhalfright64 },  // FLIPADST_ADST
+    { fidtx32, fidtx64 },            // IDTX
+    { fdct32, fidtx64 },             // V_DCT
+    { fidtx32, fdct64_row },         // H_DCT
+    { fhalfright32, fidtx64 },       // V_ADST
+    { fidtx32, fhalfright64 },       // H_ADST
+    { fhalfright32, fidtx64 },       // V_FLIPADST
+    { fidtx32, fhalfright64 },       // H_FLIPADST
+#endif                               // CONFIG_EXT_TX
+  };
+  const transform_2d ht = FHT[tx_type];
+  tran_low_t out[2048];
+  int i, j;
+  tran_low_t temp_in[64], temp_out[64];
+  const int n = 32;
+  const int n2 = 64;
+#if CONFIG_EXT_TX
+  int16_t flipped_input[32 * 64];
+  maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
+#endif
+
+  // Columns
+  for (i = 0; i < n2; ++i) {
+    for (j = 0; j < n; ++j)
+      temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * Sqrt2);
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < n; ++j)
+      out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+  }
+
+  // Rows
+  for (i = 0; i < n; ++i) {
+    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+    ht.rows(temp_in, temp_out);
+    for (j = 0; j < n2; ++j)
+      output[j + i * n2] =
+          (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+  }
+}
+
+void av1_fht32x64_c(const int16_t *input, tran_low_t *output, int stride,
+                    TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+#if CONFIG_MRC_TX
+  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
+#endif  // CONFIG_MRC_TX
+#if CONFIG_DCT_ONLY
+  assert(tx_type == DCT_DCT);
+#endif
+  static const transform_2d FHT[] = {
+    { fdct64_row, fdct32 },  // DCT_DCT
+#if CONFIG_EXT_TX
+    { fhalfright64, fdct32 },        // ADST_DCT
+    { fdct64_row, fhalfright32 },    // DCT_ADST
+    { fhalfright64, fhalfright32 },  // ADST_ADST
+    { fhalfright64, fdct32 },        // FLIPADST_DCT
+    { fdct64_row, fhalfright32 },    // DCT_FLIPADST
+    { fhalfright64, fhalfright32 },  // FLIPADST_FLIPADST
+    { fhalfright64, fhalfright32 },  // ADST_FLIPADST
+    { fhalfright64, fhalfright32 },  // FLIPADST_ADST
+    { fidtx64, fidtx32 },            // IDTX
+    { fdct64_row, fidtx32 },         // V_DCT
+    { fidtx64, fdct32 },             // H_DCT
+    { fhalfright64, fidtx32 },       // V_ADST
+    { fidtx64, fhalfright32 },       // H_ADST
+    { fhalfright64, fidtx32 },       // V_FLIPADST
+    { fidtx64, fhalfright32 },       // H_FLIPADST
+#endif                               // CONFIG_EXT_TX
+  };
+  const transform_2d ht = FHT[tx_type];
+  tran_low_t out[32 * 64];
+  int i, j;
+  tran_low_t temp_in[64], temp_out[64];
+  const int n = 32;
+  const int n2 = 64;
+#if CONFIG_EXT_TX
+  int16_t flipped_input[32 * 64];
+  maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
+#endif
+
+  // Rows
+  for (i = 0; i < n2; ++i) {
+    for (j = 0; j < n; ++j)
+      temp_in[j] = (tran_low_t)fdct_round_shift(input[i * stride + j] * Sqrt2);
+    ht.rows(temp_in, temp_out);
+    for (j = 0; j < n; ++j)
+      out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+  }
+
+  // Columns
+  for (i = 0; i < n; ++i) {
+    for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < n2; ++j)
+      output[i + j * n] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
   }
 }
 #endif  // CONFIG_TX64X64
@@ -2536,110 +2781,17 @@ void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
 #if CONFIG_EXT_TX
 // Forward identity transform.
 void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
-                    int bs, int tx_type) {
+                    int bsx, int bsy, TX_TYPE tx_type) {
   int r, c;
-  const int shift = bs < 32 ? 3 : (bs < 64 ? 2 : 1);
+  const int pels = bsx * bsy;
+  const int shift = 3 - ((pels > 256) + (pels > 1024));
   if (tx_type == IDTX) {
-    for (r = 0; r < bs; ++r) {
-      for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] * (1 << shift);
+    for (r = 0; r < bsy; ++r) {
+      for (c = 0; c < bsx; ++c) coeff[c] = src_diff[c] * (1 << shift);
       src_diff += stride;
-      coeff += bs;
+      coeff += bsx;
     }
   }
 }
 #endif  // CONFIG_EXT_TX
-
-#if CONFIG_DPCM_INTRA
-void av1_dpcm_ft4_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,
-                    tran_low_t *output) {
-  assert(tx_type < TX_TYPES_1D);
-  static const transform_1d FHT[] = { fdct4, fadst4, fadst4, fidtx4 };
-  const transform_1d ft = FHT[tx_type];
-  tran_low_t temp_in[4];
-  for (int i = 0; i < 4; ++i)
-    temp_in[i] = (tran_low_t)fdct_round_shift(input[i * stride] * 4 * Sqrt2);
-  ft(temp_in, output);
-}
-
-void av1_dpcm_ft8_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,
-                    tran_low_t *output) {
-  assert(tx_type < TX_TYPES_1D);
-  static const transform_1d FHT[] = { fdct8, fadst8, fadst8, fidtx8 };
-  const transform_1d ft = FHT[tx_type];
-  tran_low_t temp_in[8];
-  for (int i = 0; i < 8; ++i) temp_in[i] = input[i * stride] * 4;
-  ft(temp_in, output);
-}
-
-void av1_dpcm_ft16_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,
-                     tran_low_t *output) {
-  assert(tx_type < TX_TYPES_1D);
-  static const transform_1d FHT[] = { fdct16, fadst16, fadst16, fidtx16 };
-  const transform_1d ft = FHT[tx_type];
-  tran_low_t temp_in[16];
-  for (int i = 0; i < 16; ++i)
-    temp_in[i] = (tran_low_t)fdct_round_shift(input[i * stride] * 2 * Sqrt2);
-  ft(temp_in, output);
-}
-
-void av1_dpcm_ft32_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,
-                     tran_low_t *output) {
-  assert(tx_type < TX_TYPES_1D);
-  static const transform_1d FHT[] = { fdct32, fhalfright32, fhalfright32,
-                                      fidtx32 };
-  const transform_1d ft = FHT[tx_type];
-  tran_low_t temp_in[32];
-  for (int i = 0; i < 32; ++i) temp_in[i] = input[i * stride];
-  ft(temp_in, output);
-}
-
-#if CONFIG_HIGHBITDEPTH
-void av1_hbd_dpcm_ft4_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,
-                        tran_low_t *output, int dir) {
-  (void)dir;
-  assert(tx_type < TX_TYPES_1D);
-  static const transform_1d FHT[] = { fdct4, fadst4, fadst4, fidtx4 };
-  const transform_1d ft = FHT[tx_type];
-  tran_low_t temp_in[4];
-  for (int i = 0; i < 4; ++i)
-    temp_in[i] = (tran_low_t)fdct_round_shift(input[i * stride] * 4 * Sqrt2);
-  ft(temp_in, output);
-}
-
-void av1_hbd_dpcm_ft8_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,
-                        tran_low_t *output, int dir) {
-  (void)dir;
-  assert(tx_type < TX_TYPES_1D);
-  static const transform_1d FHT[] = { fdct8, fadst8, fadst8, fidtx8 };
-  const transform_1d ft = FHT[tx_type];
-  tran_low_t temp_in[8];
-  for (int i = 0; i < 8; ++i) temp_in[i] = input[i * stride] * 4;
-  ft(temp_in, output);
-}
-
-void av1_hbd_dpcm_ft16_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,
-                         tran_low_t *output, int dir) {
-  (void)dir;
-  assert(tx_type < TX_TYPES_1D);
-  static const transform_1d FHT[] = { fdct16, fadst16, fadst16, fidtx16 };
-  const transform_1d ft = FHT[tx_type];
-  tran_low_t temp_in[16];
-  for (int i = 0; i < 16; ++i)
-    temp_in[i] = (tran_low_t)fdct_round_shift(input[i * stride] * 2 * Sqrt2);
-  ft(temp_in, output);
-}
-
-void av1_hbd_dpcm_ft32_c(const int16_t *input, int stride, TX_TYPE_1D tx_type,
-                         tran_low_t *output, int dir) {
-  (void)dir;
-  assert(tx_type < TX_TYPES_1D);
-  static const transform_1d FHT[] = { fdct32, fhalfright32, fhalfright32,
-                                      fidtx32 };
-  const transform_1d ft = FHT[tx_type];
-  tran_low_t temp_in[32];
-  for (int i = 0; i < 32; ++i) temp_in[i] = input[i * stride];
-  ft(temp_in, output);
-}
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_DPCM_INTRA
 #endif  // !AV1_DCT_GTEST
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
index d13eb42fb..f79a678fb 100644
--- a/third_party/aom/av1/encoder/encodeframe.c
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -81,10 +81,8 @@ static int check_intra_sb(const AV1_COMP *cpi, const TileInfo *const tile,
                           int mi_row, int mi_col, BLOCK_SIZE bsize,
                           PC_TREE *pc_tree);
 static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td,
-#if CONFIG_EXT_INTER
-                               int mi_row_ori, int mi_col_ori,
-#endif  // CONFIG_EXT_INTER
-                               int mi_row_pred, int mi_col_pred, int plane,
+                               int mi_row_ori, int mi_col_ori, int mi_row_pred,
+                               int mi_col_pred, int plane,
                                BLOCK_SIZE bsize_pred, int b_sub8x8, int block);
 static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
                             PC_TREE *pc_tree);
@@ -273,6 +271,7 @@ static void set_offsets_without_segment_id(const AV1_COMP *const cpi,
   const int mi_height = mi_size_high[bsize];
 
   set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+
   set_skip_context(xd, mi_row, mi_col);
 #if CONFIG_VAR_TX
   xd->above_txfm_context =
@@ -455,16 +454,17 @@ static void set_segment_id_supertx(const AV1_COMP *const cpi,
 #if CONFIG_DUAL_FILTER
 static void reset_intmv_filter_type(const AV1_COMMON *const cm, MACROBLOCKD *xd,
                                     MB_MODE_INFO *mbmi) {
-  int dir;
-  for (dir = 0; dir < 2; ++dir) {
-    if (!has_subpel_mv_component(xd->mi[0], xd, dir) &&
-        (mbmi->ref_frame[1] == NONE_FRAME ||
-         !has_subpel_mv_component(xd->mi[0], xd, dir + 2)))
-      mbmi->interp_filter[dir] = (cm->interp_filter == SWITCHABLE)
-                                     ? EIGHTTAP_REGULAR
-                                     : cm->interp_filter;
-    mbmi->interp_filter[dir + 2] = mbmi->interp_filter[dir];
+  InterpFilter filters[2];
+  InterpFilter default_filter = av1_unswitchable_filter(cm->interp_filter);
+
+  for (int dir = 0; dir < 2; ++dir) {
+    filters[dir] = ((!has_subpel_mv_component(xd->mi[0], xd, dir) &&
+                     (mbmi->ref_frame[1] == NONE_FRAME ||
+                      !has_subpel_mv_component(xd->mi[0], xd, dir + 2)))
+                        ? default_filter
+                        : av1_extract_interp_filter(mbmi->interp_filters, dir));
   }
+  mbmi->interp_filters = av1_make_interp_filters(filters[0], filters[1]);
 }
 
 static void update_filter_type_count(FRAME_COUNTS *counts,
@@ -476,7 +476,11 @@ static void update_filter_type_count(FRAME_COUNTS *counts,
         (mbmi->ref_frame[1] > INTRA_FRAME &&
          has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
       const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
-      ++counts->switchable_interp[ctx][mbmi->interp_filter[dir]];
+      InterpFilter filter =
+          av1_extract_interp_filter(mbmi->interp_filters, dir);
+      ++counts->switchable_interp[ctx][filter];
+      update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter,
+                 SWITCHABLE_FILTERS);
     }
   }
 }
@@ -485,11 +489,7 @@ static void update_filter_type_count(FRAME_COUNTS *counts,
 static void update_global_motion_used(PREDICTION_MODE mode, BLOCK_SIZE bsize,
                                       const MB_MODE_INFO *mbmi,
                                       RD_COUNTS *rdc) {
-  if (mode == ZEROMV
-#if CONFIG_EXT_INTER
-      || mode == ZERO_ZEROMV
-#endif
-      ) {
+  if (mode == ZEROMV || mode == ZERO_ZEROMV) {
     const int num_4x4s =
         num_4x4_blocks_wide_lookup[bsize] * num_4x4_blocks_high_lookup[bsize];
     int ref;
@@ -521,7 +521,6 @@ static void set_ref_and_pred_mvs(MACROBLOCK *const x, int_mv *const mi_pred_mv,
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   CANDIDATE_MV *const curr_ref_mv_stack = mbmi_ext->ref_mv_stack[rf_type];
 
-#if CONFIG_EXT_INTER
   if (has_second_ref(mbmi)) {
     // Special case: NEAR_NEWMV and NEW_NEARMV modes use 1 + mbmi->ref_mv_idx
     // (like NEARMV) instead
@@ -557,7 +556,6 @@ static void set_ref_and_pred_mvs(MACROBLOCK *const x, int_mv *const mi_pred_mv,
     }
 #endif  // CONFIG_COMPOUND_SINGLEREF
   } else {
-#endif  // CONFIG_EXT_INTER
     if (mbmi->mode == NEWMV) {
       int i;
       for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
@@ -569,9 +567,7 @@ static void set_ref_and_pred_mvs(MACROBLOCK *const x, int_mv *const mi_pred_mv,
         mi_pred_mv[i] = this_mv;
       }
     }
-#if CONFIG_EXT_INTER
   }
-#endif  // CONFIG_EXT_INTER
 }
 
 static void update_state(const AV1_COMP *const cpi, ThreadData *td,
@@ -590,11 +586,6 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td,
   const struct segmentation *const seg = &cm->seg;
   const int bw = mi_size_wide[mi->mbmi.sb_type];
   const int bh = mi_size_high[mi->mbmi.sb_type];
-  const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
-  const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
-  MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
-  int w, h;
-
   const int mis = cm->mi_stride;
   const int mi_width = mi_size_wide[bsize];
   const int mi_height = mi_size_high[bsize];
@@ -649,9 +640,10 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td,
     p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
 #endif  // CONFIG_LV_MAP
   }
-#if CONFIG_PALETTE
   for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
-#endif  // CONFIG_PALETTE
+#if CONFIG_MRC_TX
+  xd->mrc_mask = ctx->mrc_mask;
+#endif  // CONFIG_MRC_TX
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
   for (y = 0; y < mi_height; y++)
@@ -661,7 +653,7 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td,
         xd->mi[x_idx + y * mis] = mi_addr;
       }
 
-#if CONFIG_DELTA_Q && !CONFIG_EXT_DELTA_Q
+#if !CONFIG_EXT_DELTA_Q
   if (cpi->oxcf.aq_mode > NO_AQ && cpi->oxcf.aq_mode < DELTA_AQ)
     av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
 #else
@@ -699,13 +691,11 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td,
         THR_D153_PRED /*D153_PRED*/,
         THR_D207_PRED /*D207_PRED*/,
         THR_D63_PRED /*D63_PRED*/,
-#if CONFIG_ALT_INTRA
         THR_SMOOTH, /*SMOOTH_PRED*/
 #if CONFIG_SMOOTH_HV
         THR_SMOOTH_V, /*SMOOTH_V_PRED*/
         THR_SMOOTH_H, /*SMOOTH_H_PRED*/
 #endif                // CONFIG_SMOOTH_HV
-#endif                // CONFIG_ALT_INTRA
         THR_TM /*TM_PRED*/,
       };
       ++mode_chosen_counts[kf_mode_index[mbmi->mode]];
@@ -747,7 +737,9 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td,
         update_filter_type_count(td->counts, xd, mbmi);
 #else
         const int switchable_ctx = av1_get_pred_context_switchable_interp(xd);
-        ++td->counts->switchable_interp[switchable_ctx][mbmi->interp_filter];
+        const InterpFilter filter =
+            av1_extract_interp_filter(mbmi->interp_filters, 0);
+        ++td->counts->switchable_interp[switchable_ctx][filter];
 #endif
       }
     }
@@ -757,16 +749,9 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td,
     rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
   }
 
-  for (h = 0; h < y_mis; ++h) {
-    MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
-    for (w = 0; w < x_mis; ++w) {
-      MV_REF *const mv = frame_mv + w;
-      mv->ref_frame[0] = mi->mbmi.ref_frame[0];
-      mv->ref_frame[1] = mi->mbmi.ref_frame[1];
-      mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
-      mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
-    }
-  }
+  const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
+  av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
 }
 
 #if CONFIG_SUPERTX
@@ -788,12 +773,7 @@ static void update_state_supertx(const AV1_COMP *const cpi, ThreadData *td,
   const int mis = cm->mi_stride;
   const int mi_width = mi_size_wide[bsize];
   const int mi_height = mi_size_high[bsize];
-  const int x_mis = AOMMIN(mi_width, cm->mi_cols - mi_col);
-  const int y_mis = AOMMIN(mi_height, cm->mi_rows - mi_row);
   const int unify_bsize = CONFIG_CB4X4;
-  MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
-  int w, h;
-
   int8_t rf_type;
 
   *mi_addr = *mi;
@@ -915,16 +895,9 @@ static void update_state_supertx(const AV1_COMP *const cpi, ThreadData *td,
     rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
   }
 
-  for (h = 0; h < y_mis; ++h) {
-    MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
-    for (w = 0; w < x_mis; ++w) {
-      MV_REF *const mv = frame_mv + w;
-      mv->ref_frame[0] = mi->mbmi.ref_frame[0];
-      mv->ref_frame[1] = mi->mbmi.ref_frame[1];
-      mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
-      mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
-    }
-  }
+  const int x_mis = AOMMIN(mi_width, cm->mi_cols - mi_col);
+  const int y_mis = AOMMIN(mi_height, cm->mi_rows - mi_row);
+  av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
 }
 
 static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td,
@@ -1005,6 +978,9 @@ static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td,
       pmc = &pc_tree->split_supertx;
       break;
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION_TYPES_AB
+#error HORZ/VERT_A/B partitions not yet updated in superres code
+#endif
     case PARTITION_HORZ_A:
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
       update_state_supertx(cpi, td, &pc_tree->horizontala[0], mi_row, mi_col,
@@ -1138,6 +1114,9 @@ static void update_supertx_param_sb(const AV1_COMP *const cpi, ThreadData *td,
       }
       break;
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION_TYPES_AB
+#error HORZ/VERT_A/B partitions not yet updated in superres code
+#endif
     case PARTITION_HORZ_A:
       for (i = 0; i < 3; i++)
         update_supertx_param(td, &pc_tree->horizontala[i], best_tx,
@@ -1162,7 +1141,7 @@ static void update_supertx_param_sb(const AV1_COMP *const cpi, ThreadData *td,
 }
 #endif  // CONFIG_SUPERTX
 
-#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT)
+#if CONFIG_MOTION_VAR && NC_MODE_INFO
 static void set_mode_info_b(const AV1_COMP *const cpi,
                             const TileInfo *const tile, ThreadData *td,
                             int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -1229,6 +1208,9 @@ static void set_mode_info_sb(const AV1_COMP *const cpi, ThreadData *td,
       }
       break;
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION_TYPES_AB
+#error NC_MODE_INFO+MOTION_VAR not yet supported for new HORZ/VERT_AB partitions
+#endif
     case PARTITION_HORZ_A:
       set_mode_info_b(cpi, tile, td, mi_row, mi_col, bsize2,
                       &pc_tree->horizontala[0]);
@@ -1283,7 +1265,60 @@ static void set_mode_info_sb(const AV1_COMP *const cpi, ThreadData *td,
     default: assert(0 && "Invalid partition type."); break;
   }
 }
-#endif
+
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+static void av1_get_ncobmc_mode_rd(const AV1_COMP *const cpi,
+                                   MACROBLOCK *const x, MACROBLOCKD *const xd,
+                                   int bsize, const int mi_row,
+                                   const int mi_col, NCOBMC_MODE *mode) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+
+  assert(bsize >= BLOCK_8X8);
+
+  reset_xd_boundary(xd, mi_row, mi_height, mi_col, mi_width, cm->mi_rows,
+                    cm->mi_cols);
+
+  // Set up x's source planes at (mi_row, mi_col) before get_ncobmc_mode().
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col);
+
+  *mode = get_ncobmc_mode(cpi, x, xd, mi_row, mi_col, bsize);
+}
+static void get_ncobmc_intrpl_pred(const AV1_COMP *const cpi, ThreadData *td,
+                                   int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const int hbs = AOMMAX(mi_size_wide[bsize] / 2, mi_size_high[bsize] / 2);
+  const BLOCK_SIZE sqr_blk = bsize_2_sqr_bsize[bsize];
+
+  if (mi_width > mi_height) {
+    // Wide block: one mode per sqr_blk unit, at mi_col and at mi_col + hbs.
+    av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col,
+                           &mbmi->ncobmc_mode[0]);
+    xd->mi += hbs;
+    av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col + hbs,
+                           &mbmi->ncobmc_mode[1]);
+  } else if (mi_height > mi_width) {
+    // Tall block: one mode per sqr_blk unit, at mi_row and at mi_row + hbs.
+    av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col,
+                           &mbmi->ncobmc_mode[0]);
+    xd->mi += hbs * xd->mi_stride;
+    av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row + hbs, mi_col,
+                           &mbmi->ncobmc_mode[1]);
+  } else {
+    av1_get_ncobmc_mode_rd(cpi, x, xd, sqr_blk, mi_row, mi_col,
+                           &mbmi->ncobmc_mode[0]);
+  }
+  // Re-run source-plane and mode-info offset setup for (mi_row, mi_col).
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col);
+  set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+}
+#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
+#endif  // CONFIG_MOTION_VAR && NC_MODE_INFO
 
 void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
                           int mi_row, int mi_col) {
@@ -1384,10 +1419,6 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
   mbmi->mi_row = mi_row;
   mbmi->mi_col = mi_col;
 #endif
-#if CONFIG_CFL
-  // Don't store luma during RDO. Only store luma when best luma is known
-  x->cfl_store_y = 0;
-#endif
 #if CONFIG_SUPERTX
   // We set tx_size here as skip blocks would otherwise not set it.
   // tx_size needs to be set at this point as supertx_enable in
@@ -1413,9 +1444,10 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
 #endif
   }
 
-#if CONFIG_PALETTE
   for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
-#endif  // CONFIG_PALETTE
+#if CONFIG_MRC_TX
+  xd->mrc_mask = ctx->mrc_mask;
+#endif  // CONFIG_MRC_TX
 
   ctx->skippable = 0;
 
@@ -1491,6 +1523,9 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
   if ((rd_cost->rate != INT_MAX) && (aq_mode == COMPLEXITY_AQ) &&
       (bsize >= BLOCK_16X16) &&
       (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
+#if CONFIG_EXT_REFS
+       cpi->refresh_alt2_ref_frame ||
+#endif  // CONFIG_EXT_REFS
        (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) {
     av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
   }
@@ -1542,23 +1577,19 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
                          int supertx_enabled
 #endif
                          ) {
-#if CONFIG_DELTA_Q
   MACROBLOCK *x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-#else
-  const MACROBLOCK *x = &td->mb;
-  const MACROBLOCKD *const xd = &x->e_mbd;
-#endif
   const MODE_INFO *const mi = xd->mi[0];
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const BLOCK_SIZE bsize = mbmi->sb_type;
+  FRAME_CONTEXT *fc = xd->tile_ctx;
 
-#if CONFIG_DELTA_Q
   // delta quant applies to both intra and inter
-  const int super_block_upper_left = ((mi_row & 7) == 0) && ((mi_col & 7) == 0);
+  int super_block_upper_left =
+      ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0);
 
-  if (cm->delta_q_present_flag && (bsize != BLOCK_64X64 || !mbmi->skip) &&
+  if (cm->delta_q_present_flag && (bsize != cm->sb_size || !mbmi->skip) &&
       super_block_upper_left) {
     const int dq = (mbmi->current_q_index - xd->prev_qindex) / cm->delta_q_res;
     const int absdq = abs(dq);
@@ -1569,6 +1600,35 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
     if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++;
     xd->prev_qindex = mbmi->current_q_index;
 #if CONFIG_EXT_DELTA_Q
+#if CONFIG_LOOPFILTER_LEVEL
+    if (cm->delta_lf_present_flag) {
+      if (cm->delta_lf_multi) {
+        for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) {
+          const int delta_lf =
+              (mbmi->curr_delta_lf[lf_id] - xd->prev_delta_lf[lf_id]) /
+              cm->delta_lf_res;
+          const int abs_delta_lf = abs(delta_lf);
+          for (i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+            td->counts->delta_lf_multi[lf_id][i][1]++;
+          }
+          if (abs_delta_lf < DELTA_LF_SMALL)
+            td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++;
+          xd->prev_delta_lf[lf_id] = mbmi->curr_delta_lf[lf_id];
+        }
+      } else {
+        const int delta_lf =
+            (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
+            cm->delta_lf_res;
+        const int abs_delta_lf = abs(delta_lf);
+        for (i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+          td->counts->delta_lf[i][1]++;
+        }
+        if (abs_delta_lf < DELTA_LF_SMALL)
+          td->counts->delta_lf[abs_delta_lf][0]++;
+        xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
+      }
+    }
+#else
     if (cm->delta_lf_present_flag) {
       const int dlf =
           (mbmi->current_delta_lf_from_base - xd->prev_delta_lf_from_base) /
@@ -1580,12 +1640,9 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
       if (absdlf < DELTA_LF_SMALL) td->counts->delta_lf[absdlf][0]++;
       xd->prev_delta_lf_from_base = mbmi->current_delta_lf_from_base;
     }
+#endif  // CONFIG_LOOPFILTER_LEVEL
 #endif
   }
-#else
-  (void)mi_row;
-  (void)mi_col;
-#endif
   if (!frame_is_intra_only(cm)) {
     FRAME_COUNTS *const counts = td->counts;
     RD_COUNTS *rdc = &td->rd_counts;
@@ -1597,6 +1654,10 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
       if (!supertx_enabled)
 #endif
         counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
+#if CONFIG_NEW_MULTISYMBOL
+      update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)],
+                 inter_block, 2);
+#endif
       // If the segment reference feature is enabled we have only a single
       // reference frame allowed for the segment so exclude it from
       // the reference frame counts used to work out probabilities.
@@ -1613,14 +1674,14 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
           else
             // This flag is also updated for 4x4 blocks
             rdc->single_ref_used_flag = 1;
-#if !SUB8X8_COMP_REF
-          if (mbmi->sb_type != BLOCK_4X4)
+          if (is_comp_ref_allowed(mbmi->sb_type)) {
             counts->comp_inter[av1_get_reference_mode_context(cm, xd)]
                               [has_second_ref(mbmi)]++;
-#else
-          counts->comp_inter[av1_get_reference_mode_context(cm, xd)]
-                            [has_second_ref(mbmi)]++;
-#endif
+#if CONFIG_NEW_MULTISYMBOL
+            update_cdf(av1_get_reference_mode_cdf(cm, xd), has_second_ref(mbmi),
+                       2);
+#endif  // CONFIG_NEW_MULTISYMBOL
+          }
         }
 
         if (has_second_ref(mbmi)) {
@@ -1664,6 +1725,9 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
 
             counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(cm, xd)][0]
                                [ref1 == ALTREF_FRAME]++;
+            if (ref1 != ALTREF_FRAME)
+              counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(cm, xd)]
+                                 [1][ref1 == ALTREF2_FRAME]++;
 #else   // !CONFIG_EXT_REFS
           counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0]
                           [ref0 == GOLDEN_FRAME]++;
@@ -1673,12 +1737,16 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
 #endif  // CONFIG_EXT_COMP_REFS
         } else {
 #if CONFIG_EXT_REFS
-          const int bit = (ref0 == ALTREF_FRAME || ref0 == BWDREF_FRAME);
+          const int bit = (ref0 >= BWDREF_FRAME);
 
           counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++;
           if (bit) {
+            assert(ref0 <= ALTREF_FRAME);
             counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
-                              [ref0 != BWDREF_FRAME]++;
+                              [ref0 == ALTREF_FRAME]++;
+            if (ref0 != ALTREF_FRAME)
+              counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5]
+                                [ref0 == ALTREF2_FRAME]++;
           } else {
             const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
             counts
@@ -1701,7 +1769,6 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
 #endif  // CONFIG_EXT_REFS
         }
 
-#if CONFIG_EXT_INTER
 #if CONFIG_COMPOUND_SINGLEREF
         if (!has_second_ref(mbmi))
           counts->comp_inter_mode[av1_get_inter_mode_context(xd)]
@@ -1717,31 +1784,32 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
           const int bsize_group = size_group_lookup[bsize];
           if (mbmi->ref_frame[1] == INTRA_FRAME) {
             counts->interintra[bsize_group][1]++;
+#if CONFIG_NEW_MULTISYMBOL
+            update_cdf(fc->interintra_cdf[bsize_group], 1, 2);
+#endif
             counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
-            if (is_interintra_wedge_used(bsize))
+            update_cdf(fc->interintra_mode_cdf[bsize_group],
+                       mbmi->interintra_mode, INTERINTRA_MODES);
+            if (is_interintra_wedge_used(bsize)) {
               counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+#if CONFIG_NEW_MULTISYMBOL
+              update_cdf(fc->wedge_interintra_cdf[bsize],
+                         mbmi->use_wedge_interintra, 2);
+#endif
+            }
           } else {
             counts->interintra[bsize_group][0]++;
+#if CONFIG_NEW_MULTISYMBOL
+            update_cdf(fc->interintra_cdf[bsize_group], 0, 2);
+#endif
           }
         }
 #endif  // CONFIG_INTERINTRA
-#endif  // CONFIG_EXT_INTER
 
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 #if CONFIG_WARPED_MOTION
         set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
 #endif
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-        const MOTION_MODE motion_allowed =
-            motion_mode_allowed_wrapper(0,
-#if CONFIG_GLOBAL_MOTION
-                                        0, xd->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
-                                        xd,
-#endif
-                                        mi);
-#else
         const MOTION_MODE motion_allowed = motion_mode_allowed(
 #if CONFIG_GLOBAL_MOTION
             0, xd->global_motion,
@@ -1750,23 +1818,41 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
             xd,
 #endif
             mi);
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
 #if CONFIG_SUPERTX
         if (!supertx_enabled)
 #endif  // CONFIG_SUPERTX
-#if CONFIG_EXT_INTER
           if (mbmi->ref_frame[1] != INTRA_FRAME)
-#endif  // CONFIG_EXT_INTER
 #if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
           {
-            if (motion_allowed == WARPED_CAUSAL)
+            if (motion_allowed == WARPED_CAUSAL) {
               counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++;
-            else if (motion_allowed == OBMC_CAUSAL)
+              update_cdf(fc->motion_mode_cdf[mbmi->sb_type], mbmi->motion_mode,
+                         MOTION_MODES);
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+            } else if (motion_allowed == NCOBMC_ADAPT_WEIGHT) {
+              counts->ncobmc[mbmi->sb_type][mbmi->motion_mode]++;
+              update_cdf(fc->ncobmc_cdf[mbmi->sb_type], mbmi->motion_mode,
+                         OBMC_FAMILY_MODES);
+            } else if (motion_allowed == OBMC_CAUSAL) {
+              counts->obmc[mbmi->sb_type][mbmi->motion_mode == OBMC_CAUSAL]++;
+              update_cdf(fc->obmc_cdf[mbmi->sb_type], mbmi->motion_mode, 2);
+            }
+#else
+            } else if (motion_allowed == OBMC_CAUSAL) {
               counts->obmc[mbmi->sb_type][mbmi->motion_mode == OBMC_CAUSAL]++;
+#if CONFIG_NEW_MULTISYMBOL
+              update_cdf(fc->obmc_cdf[mbmi->sb_type],
+                         mbmi->motion_mode == OBMC_CAUSAL, 2);
+#endif
+            }
+#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
           }
 #else
-        if (motion_allowed > SIMPLE_TRANSLATION)
-          counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++;
+          if (motion_allowed > SIMPLE_TRANSLATION) {
+            counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++;
+            update_cdf(fc->motion_mode_cdf[mbmi->sb_type], mbmi->motion_mode,
+                       MOTION_MODES);
+          }
 #endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
 
 #if CONFIG_NCOBMC_ADAPT_WEIGHT
@@ -1774,15 +1860,18 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
           ADAPT_OVERLAP_BLOCK ao_block =
               adapt_overlap_block_lookup[mbmi->sb_type];
           ++counts->ncobmc_mode[ao_block][mbmi->ncobmc_mode[0]];
+          update_cdf(fc->ncobmc_mode_cdf[ao_block], mbmi->ncobmc_mode[0],
+                     MAX_NCOBMC_MODES);
           if (mi_size_wide[mbmi->sb_type] != mi_size_high[mbmi->sb_type]) {
             ++counts->ncobmc_mode[ao_block][mbmi->ncobmc_mode[1]];
+            update_cdf(fc->ncobmc_mode_cdf[ao_block], mbmi->ncobmc_mode[1],
+                       MAX_NCOBMC_MODES);
           }
         }
 #endif
 
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 
-#if CONFIG_EXT_INTER
         if (
 #if CONFIG_COMPOUND_SINGLEREF
             is_inter_anyref_comp_mode(mbmi->mode)
@@ -1794,9 +1883,19 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
             && mbmi->motion_mode == SIMPLE_TRANSLATION
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
             ) {
-          counts->compound_interinter[bsize][mbmi->interinter_compound_type]++;
+#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
+#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
+          if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#endif
+            counts
+                ->compound_interinter[bsize][mbmi->interinter_compound_type]++;
+            update_cdf(fc->compound_type_cdf[bsize],
+                       mbmi->interinter_compound_type, COMPOUND_TYPES);
+#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
+          }
+#endif
+#endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
         }
-#endif  // CONFIG_EXT_INTER
       }
     }
 
@@ -1804,10 +1903,11 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
         !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
       int16_t mode_ctx;
       const PREDICTION_MODE mode = mbmi->mode;
-#if CONFIG_EXT_INTER
       if (has_second_ref(mbmi)) {
         mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
         ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+        update_cdf(fc->inter_compound_mode_cdf[mode_ctx],
+                   INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES);
 #if CONFIG_COMPOUND_SINGLEREF
       } else if (is_inter_singleref_comp_mode(mode)) {
         mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
@@ -1815,24 +1915,17 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
                                            [INTER_SINGLEREF_COMP_OFFSET(mode)];
 #endif  // CONFIG_COMPOUND_SINGLEREF
       } else {
-#endif  // CONFIG_EXT_INTER
         mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
                                              mbmi->ref_frame, bsize, -1);
         update_inter_mode_stats(counts, mode, mode_ctx);
-#if CONFIG_EXT_INTER
       }
-#endif  // CONFIG_EXT_INTER
 
-#if CONFIG_EXT_INTER
+      int mode_allowed = (mbmi->mode == NEWMV);
+      mode_allowed |= (mbmi->mode == NEW_NEWMV);
 #if CONFIG_COMPOUND_SINGLEREF
-      if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV ||
-          mbmi->mode == SR_NEW_NEWMV) {
-#else   // !CONFIG_COMPOUND_SINGLEREF
-      if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+      mode_allowed |= (mbmi->mode == SR_NEW_NEWMV);
 #endif  // CONFIG_COMPOUND_SINGLEREF
-#else   // !CONFIG_EXT_INTER
-      if (mbmi->mode == NEWMV) {
-#endif  // CONFIG_EXT_INTER
+      if (mode_allowed) {
         uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
         int idx;
 
@@ -1847,11 +1940,7 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
         }
       }
 
-#if CONFIG_EXT_INTER
       if (have_nearmv_in_inter_mode(mbmi->mode)) {
-#else
-      if (mbmi->mode == NEARMV) {
-#endif
         uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
         int idx;
 
@@ -1868,7 +1957,7 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
     }
 #if CONFIG_INTRABC
   } else {
-    if (cm->allow_screen_content_tools && bsize >= BLOCK_8X8) {
+    if (av1_allow_intrabc(bsize, cm)) {
       FRAME_COUNTS *const counts = td->counts;
       ++counts->intrabc[mbmi->use_intrabc];
     } else {
@@ -1992,7 +2081,8 @@ static void encode_b(const AV1_COMP *const cpi, const TileInfo *const tile,
 #endif
                      PICK_MODE_CONTEXT *ctx, int *rate) {
   MACROBLOCK *const x = &td->mb;
-#if (CONFIG_MOTION_VAR && CONFIG_NCOBMC) | CONFIG_EXT_DELTA_Q
+#if (CONFIG_MOTION_VAR && CONFIG_NCOBMC) | CONFIG_EXT_DELTA_Q | \
+    CONFIG_NCOBMC_ADAPT_WEIGHT
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
 #if CONFIG_MOTION_VAR && CONFIG_NCOBMC
@@ -2005,11 +2095,14 @@ static void encode_b(const AV1_COMP *const cpi, const TileInfo *const tile,
   x->e_mbd.mi[0]->mbmi.partition = partition;
 #endif
   update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
-#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT)
   mbmi = &xd->mi[0]->mbmi;
 #if CONFIG_WARPED_MOTION
   set_ref_ptrs(&cpi->common, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
 #endif
+#endif
+
+#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT)
   const MOTION_MODE motion_allowed = motion_mode_allowed(
 #if CONFIG_GLOBAL_MOTION
       0, xd->global_motion,
@@ -2018,6 +2111,9 @@ static void encode_b(const AV1_COMP *const cpi, const TileInfo *const tile,
       xd,
 #endif
       xd->mi[0]);
+#endif  // CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT)
+
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
   check_ncobmc = is_inter_block(mbmi) && motion_allowed >= OBMC_CAUSAL;
   if (!dry_run && check_ncobmc) {
     av1_check_ncobmc_rd(cpi, x, mi_row, mi_col);
@@ -2025,13 +2121,38 @@ static void encode_b(const AV1_COMP *const cpi, const TileInfo *const tile,
                          get_frame_new_buffer(&cpi->common), mi_row, mi_col);
   }
 #endif
+
+#if CONFIG_LV_MAP
+  av1_set_coeff_buffer(cpi, x, mi_row, mi_col);
+#endif
+
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+  if (dry_run == OUTPUT_ENABLED && !frame_is_intra_only(&cpi->common)) {
+    if (motion_allowed >= NCOBMC_ADAPT_WEIGHT && is_inter_block(mbmi)) {
+      get_ncobmc_intrpl_pred(cpi, td, mi_row, mi_col, bsize);
+      av1_check_ncobmc_adapt_weight_rd(cpi, x, mi_row, mi_col);
+    }
+    av1_setup_dst_planes(x->e_mbd.plane, bsize,
+                         get_frame_new_buffer(&cpi->common), mi_row, mi_col);
+  }
+#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
+
   encode_superblock(cpi, td, tp, dry_run, mi_row, mi_col, bsize, rate);
 
+#if CONFIG_LV_MAP
+  if (dry_run == 0)
+    x->cb_offset += block_size_wide[bsize] * block_size_high[bsize];
+#endif
+
   if (!dry_run) {
 #if CONFIG_EXT_DELTA_Q
     mbmi = &xd->mi[0]->mbmi;
-    if (bsize == BLOCK_64X64 && mbmi->skip == 1 && is_inter_block(mbmi) &&
+    if (bsize == cpi->common.sb_size && mbmi->skip == 1 &&
         cpi->common.delta_lf_present_flag) {
+#if CONFIG_LOOPFILTER_LEVEL
+      for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id)
+        mbmi->curr_delta_lf[lf_id] = xd->prev_delta_lf[lf_id];
+#endif  // CONFIG_LOOPFILTER_LEVEL
       mbmi->current_delta_lf_from_base = xd->prev_delta_lf_from_base;
     }
 #endif
@@ -2051,6 +2172,9 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int hbs = mi_size_wide[bsize] / 2;
+#if CONFIG_EXT_PARTITION_TYPES && CONFIG_EXT_PARTITION_TYPES_AB
+  const int qbs = mi_size_wide[bsize] / 4;
+#endif
   const int is_partition_root = bsize >= BLOCK_8X8;
   const int ctx = is_partition_root
                       ? partition_plane_context(xd, mi_row, mi_col,
@@ -2063,9 +2187,11 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
   const PARTITION_TYPE partition = pc_tree->partitioning;
   const BLOCK_SIZE subsize = get_subsize(bsize, partition);
 #if CONFIG_EXT_PARTITION_TYPES
-  const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
   int quarter_step = mi_size_wide[bsize] / 4;
   int i;
+#if !CONFIG_EXT_PARTITION_TYPES_AB
+  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
 #endif
 
 #if CONFIG_CB4X4
@@ -2077,11 +2203,6 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
-#if CONFIG_SPEED_REFS
-  // First scanning pass of an SB is dry run only.
-  if (cpi->sb_scanning_pass_idx == 0) assert(dry_run == DRY_RUN_NORMAL);
-#endif  // CONFIG_SPEED_REFS
-
   if (!dry_run && ctx >= 0) td->counts->partition[ctx][partition]++;
 
 #if CONFIG_SUPERTX
@@ -2138,6 +2259,7 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
         td->counts->supertx[partition_supertx_context_lookup[partition]]
                            [supertx_size][1]++;
         td->counts->supertx_size[supertx_size]++;
+#if CONFIG_ENTROPY_STATS
 #if CONFIG_EXT_TX
         if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) >
                 1 &&
@@ -2154,6 +2276,7 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
           ++td->counts->inter_ext_tx[supertx_size][xd->mi[0]->mbmi.tx_type];
         }
 #endif  // CONFIG_EXT_TX
+#endif  // CONFIG_ENTROPY_STATS
       }
 #if CONFIG_EXT_PARTITION_TYPES
       update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize,
@@ -2230,7 +2353,53 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
                   subsize, pc_tree->split[3], rate);
       }
       break;
+
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION_TYPES_AB
+    case PARTITION_HORZ_A:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run,
+               get_subsize(bsize, PARTITION_HORZ_4), partition,
+               &pc_tree->horizontala[0], rate);
+      encode_b(cpi, tile, td, tp, mi_row + qbs, mi_col, dry_run,
+               get_subsize(bsize, PARTITION_HORZ_4), partition,
+               &pc_tree->horizontala[1], rate);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+               partition, &pc_tree->horizontala[2], rate);
+      break;
+    case PARTITION_HORZ_B:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
+               &pc_tree->horizontalb[0], rate);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run,
+               get_subsize(bsize, PARTITION_HORZ_4), partition,
+               &pc_tree->horizontalb[1], rate);
+      if (mi_row + 3 * qbs < cm->mi_rows)
+        encode_b(cpi, tile, td, tp, mi_row + 3 * qbs, mi_col, dry_run,
+                 get_subsize(bsize, PARTITION_HORZ_4), partition,
+                 &pc_tree->horizontalb[2], rate);
+      break;
+    case PARTITION_VERT_A:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run,
+               get_subsize(bsize, PARTITION_VERT_4), partition,
+               &pc_tree->verticala[0], rate);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col + qbs, dry_run,
+               get_subsize(bsize, PARTITION_VERT_4), partition,
+               &pc_tree->verticala[1], rate);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+               partition, &pc_tree->verticala[2], rate);
+
+      break;
+    case PARTITION_VERT_B:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
+               &pc_tree->verticalb[0], rate);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run,
+               get_subsize(bsize, PARTITION_VERT_4), partition,
+               &pc_tree->verticalb[1], rate);
+      if (mi_col + 3 * qbs < cm->mi_cols)
+        encode_b(cpi, tile, td, tp, mi_row, mi_col + 3 * qbs, dry_run,
+                 get_subsize(bsize, PARTITION_VERT_4), partition,
+                 &pc_tree->verticalb[2], rate);
+      break;
+#else
     case PARTITION_HORZ_A:
       encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition,
                &pc_tree->horizontala[0], rate);
@@ -2264,6 +2433,7 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
       encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
                partition, &pc_tree->verticalb[2], rate);
       break;
+#endif
     case PARTITION_HORZ_4:
       for (i = 0; i < 4; ++i) {
         int this_mi_row = mi_row + i * quarter_step;
@@ -2468,10 +2638,10 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
                        bsize, ctx_none, INT64_MAX);
 
       if (none_rdc.rate < INT_MAX) {
-        none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+        none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
         none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
 #if CONFIG_SUPERTX
-        none_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE];
+        none_rate_nocoef += x->partition_cost[pl][PARTITION_NONE];
 #endif
       }
 
@@ -2647,11 +2817,11 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
   }
 
   if (last_part_rdc.rate < INT_MAX) {
-    last_part_rdc.rate += cpi->partition_cost[pl][partition];
+    last_part_rdc.rate += x->partition_cost[pl][partition];
     last_part_rdc.rdcost =
         RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
 #if CONFIG_SUPERTX
-    last_part_rate_nocoef += cpi->partition_cost[pl][partition];
+    last_part_rate_nocoef += x->partition_cost[pl][partition];
 #endif
   }
 
@@ -2726,16 +2896,16 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
         encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx,
                   OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
 
-      chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+      chosen_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
 #if CONFIG_SUPERTX
-      chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_SPLIT];
+      chosen_rate_nocoef += x->partition_cost[pl][PARTITION_SPLIT];
 #endif
     }
     if (chosen_rdc.rate < INT_MAX) {
-      chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+      chosen_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
       chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist);
 #if CONFIG_SUPERTX
-      chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE];
+      chosen_rate_nocoef += x->partition_cost[pl][PARTITION_NONE];
 #endif
     }
   }
@@ -2803,8 +2973,11 @@ static const BLOCK_SIZE min_partition_size[BLOCK_SIZES_ALL] = {
 #if CONFIG_EXT_PARTITION
   BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,  // 64x128, 128x64, 128x128
 #endif  // CONFIG_EXT_PARTITION
-  BLOCK_4X4,   BLOCK_4X4,   BLOCK_8X8,      //   4x16,   16x4,    8x32
-  BLOCK_8X8                                 //   32x8
+  BLOCK_4X4,   BLOCK_4X4,   BLOCK_8X8,    //   4x16,   16x4,    8x32
+  BLOCK_8X8,   BLOCK_16X16, BLOCK_16X16,  //   32x8,  16x64,   64x16
+#if CONFIG_EXT_PARTITION
+  BLOCK_16X16, BLOCK_16X16                // 32x128, 128x32
+#endif  // CONFIG_EXT_PARTITION
 };
 
 static const BLOCK_SIZE max_partition_size[BLOCK_SIZES_ALL] = {
@@ -2820,7 +2993,10 @@ static const BLOCK_SIZE max_partition_size[BLOCK_SIZES_ALL] = {
   BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST,  // 64x128, 128x64, 128x128
 #endif  // CONFIG_EXT_PARTITION
   BLOCK_16X16,   BLOCK_16X16,   BLOCK_32X32,    //   4x16,   16x4,    8x32
-  BLOCK_32X32                                   //   32x8
+  BLOCK_32X32,   BLOCK_LARGEST, BLOCK_LARGEST,  //   32x8,  16x64,   64x16
+#if CONFIG_EXT_PARTITION
+  BLOCK_LARGEST, BLOCK_LARGEST                  // 32x128, 128x32
+#endif  // CONFIG_EXT_PARTITION
 };
 
 // Next square block size less or equal than current block size.
@@ -2837,7 +3013,10 @@ static const BLOCK_SIZE next_square_size[BLOCK_SIZES_ALL] = {
   BLOCK_64X64, BLOCK_64X64, BLOCK_128X128,  // 64x128, 128x64, 128x128
 #endif  // CONFIG_EXT_PARTITION
   BLOCK_4X4,   BLOCK_4X4,   BLOCK_8X8,      //   4x16,   16x4,    8x32
-  BLOCK_8X8                                 //   32x8
+  BLOCK_8X8,   BLOCK_16X16, BLOCK_16X16,    //   32x8,  16x64,   64x16
+#if CONFIG_EXT_PARTITION
+  BLOCK_32X32, BLOCK_32X32                  // 32x128, 128x32
+#endif  // CONFIG_EXT_PARTITION
 };
 /* clang-format on */
 
@@ -2953,7 +3132,7 @@ static void set_partition_range(const AV1_COMMON *const cm,
 
   const int idx_str = cm->mi_stride * mi_row + mi_col;
   MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[idx_str];
-  BLOCK_SIZE min_size = BLOCK_64X64;  // default values
+  BLOCK_SIZE min_size = cm->sb_size;  // default values
   BLOCK_SIZE max_size = BLOCK_4X4;
 
   if (prev_mi) {
@@ -3004,66 +3183,24 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
 
 #if CONFIG_FP_MB_STATS
 const int qindex_skip_threshold_lookup[BLOCK_SIZES] = {
-  0,
-  10,
-  10,
-  30,
-  40,
-  40,
-  60,
-  80,
-  80,
-  90,
-  100,
-  100,
-  120,
+  0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120,
 #if CONFIG_EXT_PARTITION
   // TODO(debargha): What are the correct numbers here?
-  130,
-  130,
-  150
+  130, 130, 150
 #endif  // CONFIG_EXT_PARTITION
 };
 const int qindex_split_threshold_lookup[BLOCK_SIZES] = {
-  0,
-  3,
-  3,
-  7,
-  15,
-  15,
-  30,
-  40,
-  40,
-  60,
-  80,
-  80,
-  120,
+  0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120,
 #if CONFIG_EXT_PARTITION
   // TODO(debargha): What are the correct numbers here?
-  160,
-  160,
-  240
+  160, 160, 240
 #endif  // CONFIG_EXT_PARTITION
 };
 const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = {
-  1,
-  1,
-  1,
-  1,
-  1,
-  1,
-  1,
-  1,
-  1,
-  1,
-  4,
-  4,
-  6,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6,
 #if CONFIG_EXT_PARTITION
   // TODO(debargha): What are the correct numbers here?
-  8,
-  8,
-  10
+  8, 8, 10
 #endif  // CONFIG_EXT_PARTITION
 };
 
@@ -3101,6 +3238,78 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
 #endif
 
 #if CONFIG_EXT_PARTITION_TYPES
+// Search for an encoding of the given subblock and accumulate its rate, dist
+// and rdcost into sum_rdc. Returns 0 once the accumulated rdcost reaches the
+// limit (telling the caller to skip any further subblocks) and 1 otherwise.
+static int rd_try_subblock(const AV1_COMP *const cpi, ThreadData *td,
+                           TileDataEnc *tile_data, TOKENEXTRA **tp,
+                           int is_first, int is_last, int mi_row, int mi_col,
+                           BLOCK_SIZE subsize, RD_STATS *best_rdc,
+                           RD_STATS *sum_rdc, RD_STATS *this_rdc,
+#if CONFIG_SUPERTX
+                           int64_t best_rd, int *sum_rate_nocoef,
+                           int *this_rate_nocoef, int *abort_flag,
+#endif
+                           PARTITION_TYPE partition,
+                           PICK_MODE_CONTEXT *prev_ctx,
+                           PICK_MODE_CONTEXT *this_ctx) {
+#if CONFIG_SUPERTX
+#define RTS_X_RATE_NOCOEF_ARG ((is_first) ? sum_rate_nocoef : this_rate_nocoef),
+#define RTS_MAX_RDCOST INT64_MAX
+#else
+#define RTS_X_RATE_NOCOEF_ARG
+#define RTS_MAX_RDCOST best_rdc->rdcost
+#endif
+
+  MACROBLOCK *const x = &td->mb;
+
+  if (cpi->sf.adaptive_motion_search) load_pred_mv(x, prev_ctx);
+
+  // For the first subblock, let rd_pick_sb_modes write directly into sum_rdc
+  // and count the cost spent so far as zero; this way the caller never has to
+  // zero-initialise sum_rdc.
+  if (is_first) this_rdc = sum_rdc;
+  const int64_t spent_rdcost = is_first ? 0 : sum_rdc->rdcost;
+  const int64_t rdcost_remaining = best_rdc->rdcost - spent_rdcost;
+
+  rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc,
+                   RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx,
+                   rdcost_remaining);
+
+#if CONFIG_SUPERTX
+  if (is_first) *abort_flag = sum_rdc->rdcost >= best_rd;
+#endif
+
+  if (!is_first) {
+    if (this_rdc->rate == INT_MAX) {
+      sum_rdc->rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+      *sum_rate_nocoef = INT_MAX;
+#endif
+    } else {
+      sum_rdc->rate += this_rdc->rate;
+      sum_rdc->dist += this_rdc->dist;
+      sum_rdc->rdcost += this_rdc->rdcost;
+#if CONFIG_SUPERTX
+      *sum_rate_nocoef += *this_rate_nocoef;
+#endif
+    }
+  }
+
+  if (sum_rdc->rdcost >= RTS_MAX_RDCOST) return 0;
+
+  if (!is_last) {
+    update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1);
+    encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+                      NULL);
+  }
+
+  return 1;
+
+#undef RTS_X_RATE_NOCOEF_ARG
+#undef RTS_MAX_RDCOST
+}
+
 static void rd_test_partition3(
     const AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
     TOKENEXTRA **tp, PC_TREE *pc_tree, RD_STATS *best_rdc,
@@ -3113,172 +3322,165 @@ static void rd_test_partition3(
     BLOCK_SIZE subsize1, int mi_row2, int mi_col2, BLOCK_SIZE subsize2) {
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  RD_STATS this_rdc, sum_rdc;
-#if CONFIG_SUPERTX
+  RD_STATS sum_rdc, this_rdc;
+#if CONFIG_UNPOISON_PARTITION_CTX
   const AV1_COMMON *const cm = &cpi->common;
+  const int hbs = mi_size_wide[bsize] / 2;
+  const int has_rows = mi_row + hbs < cm->mi_rows;
+  const int has_cols = mi_col + hbs < cm->mi_cols;
+#endif  // CONFIG_UNPOISON_PARTITION_CTX
+#if CONFIG_SUPERTX || CONFIG_EXT_PARTITION_TYPES_AB
+  const AV1_COMMON *const cm = &cpi->common;
+#endif
+#if CONFIG_SUPERTX
   TileInfo *const tile_info = &tile_data->tile_info;
-  int this_rate_nocoef, sum_rate_nocoef;
+  int sum_rate_nocoef, this_rate_nocoef;
   int abort_flag;
   const int supertx_allowed = !frame_is_intra_only(cm) &&
                               bsize <= MAX_SUPERTX_BLOCK_SIZE &&
                               !xd->lossless[0];
-#endif
-  if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
-
-  rd_pick_sb_modes(cpi, tile_data, x, mi_row0, mi_col0, &sum_rdc,
-#if CONFIG_SUPERTX
-                   &sum_rate_nocoef,
-#endif
-#if CONFIG_EXT_PARTITION_TYPES
-                   partition,
-#endif
-                   subsize0, &ctxs[0], best_rdc->rdcost);
-#if CONFIG_SUPERTX
-  abort_flag = sum_rdc.rdcost >= best_rd;
-#endif
 
-#if CONFIG_SUPERTX
-  if (sum_rdc.rdcost < INT64_MAX) {
+#define RTP_STX_TRY_ARGS \
+  best_rd, &sum_rate_nocoef, &this_rate_nocoef, &abort_flag,
 #else
-  if (sum_rdc.rdcost < best_rdc->rdcost) {
+#define RTP_STX_TRY_ARGS
 #endif
-    PICK_MODE_CONTEXT *ctx_0 = &ctxs[0];
-    update_state(cpi, td, ctx_0, mi_row0, mi_col0, subsize0, 1);
-    encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row0, mi_col0, subsize0,
-                      NULL);
 
-    if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_0);
+  if (!rd_try_subblock(cpi, td, tile_data, tp, 1, 0, mi_row0, mi_col0, subsize0,
+                       best_rdc, &sum_rdc, &this_rdc,
+                       RTP_STX_TRY_ARGS partition, ctx, &ctxs[0]))
+    return;
 
-#if CONFIG_SUPERTX
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row1, mi_col1, &this_rdc,
-                     &this_rate_nocoef,
-#if CONFIG_EXT_PARTITION_TYPES
-                     partition,
-#endif
-                     subsize1, &ctxs[1], INT64_MAX - sum_rdc.rdcost);
+  if (!rd_try_subblock(cpi, td, tile_data, tp, 0, 0, mi_row1, mi_col1, subsize1,
+                       best_rdc, &sum_rdc, &this_rdc,
+                       RTP_STX_TRY_ARGS partition, &ctxs[0], &ctxs[1]))
+    return;
+
+// With the new layout of mixed partitions for PARTITION_HORZ_B and
+// PARTITION_VERT_B, the last subblock might start past halfway through the
+// main block, so we might signal it even though the subblock lies strictly
+// outside the image. In that case, we won't spend any bits coding it and the
+// difference (obviously) doesn't contribute to the error.
+#if CONFIG_EXT_PARTITION_TYPES_AB
+  const int try_block2 = mi_row2 < cm->mi_rows && mi_col2 < cm->mi_cols;
 #else
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row1, mi_col1, &this_rdc,
-#if CONFIG_EXT_PARTITION_TYPES
-                     partition,
+  const int try_block2 = 1;
 #endif
-                     subsize1, &ctxs[1], best_rdc->rdcost - sum_rdc.rdcost);
-#endif  // CONFIG_SUPERTX
+  if (try_block2 &&
+      !rd_try_subblock(cpi, td, tile_data, tp, 0, 1, mi_row2, mi_col2, subsize2,
+                       best_rdc, &sum_rdc, &this_rdc,
+                       RTP_STX_TRY_ARGS partition, &ctxs[1], &ctxs[2]))
+    return;
 
-    if (this_rdc.rate == INT_MAX) {
-      sum_rdc.rdcost = INT64_MAX;
-#if CONFIG_SUPERTX
-      sum_rate_nocoef = INT_MAX;
-#endif
-    } else {
-      sum_rdc.rate += this_rdc.rate;
-      sum_rdc.dist += this_rdc.dist;
-      sum_rdc.rdcost += this_rdc.rdcost;
 #if CONFIG_SUPERTX
-      sum_rate_nocoef += this_rate_nocoef;
-#endif
-    }
+  if (supertx_allowed && !abort_flag && sum_rdc.rdcost < INT64_MAX) {
+    TX_SIZE supertx_size = max_txsize_lookup[bsize];
+    const PARTITION_TYPE best_partition = pc_tree->partitioning;
+    pc_tree->partitioning = partition;
+    sum_rdc.rate += av1_cost_bit(
+        cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
+                            [supertx_size],
+        0);
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
 
-#if CONFIG_SUPERTX
-    if (sum_rdc.rdcost < INT64_MAX) {
-#else
-    if (sum_rdc.rdcost < best_rdc->rdcost) {
-#endif
-      PICK_MODE_CONTEXT *ctx_1 = &ctxs[1];
-      update_state(cpi, td, ctx_1, mi_row1, mi_col1, subsize1, 1);
-      encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row1, mi_col1, subsize1,
-                        NULL);
+    if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+      TX_TYPE best_tx = DCT_DCT;
+      RD_STATS tmp_rdc = { sum_rate_nocoef, 0, 0 };
 
-      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_1);
+      restore_context(x, x_ctx, mi_row, mi_col, bsize);
 
-#if CONFIG_SUPERTX
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row2, mi_col2, &this_rdc,
-                       &this_rate_nocoef,
-#if CONFIG_EXT_PARTITION_TYPES
-                       partition,
-#endif
-                       subsize2, &ctxs[2], INT64_MAX - sum_rdc.rdcost);
-#else
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row2, mi_col2, &this_rdc,
-#if CONFIG_EXT_PARTITION_TYPES
-                       partition,
+      rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate,
+                    &tmp_rdc.dist, &best_tx, pc_tree);
+
+      tmp_rdc.rate += av1_cost_bit(
+          cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
+                              [supertx_size],
+          1);
+      tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
+      if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+        sum_rdc = tmp_rdc;
+        update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+                                supertx_size, pc_tree);
+      }
+    }
+
+    pc_tree->partitioning = best_partition;
+  }
 #endif
-                       subsize2, &ctxs[2], best_rdc->rdcost - sum_rdc.rdcost);
-#endif  // CONFIG_SUPERTX
 
-      if (this_rdc.rate == INT_MAX) {
-        sum_rdc.rdcost = INT64_MAX;
-#if CONFIG_SUPERTX
-        sum_rate_nocoef = INT_MAX;
+  if (sum_rdc.rdcost >= best_rdc->rdcost) return;
+
+  int pl = partition_plane_context(xd, mi_row, mi_col,
+#if CONFIG_UNPOISON_PARTITION_CTX
+                                   has_rows, has_cols,
 #endif
-      } else {
-        sum_rdc.rate += this_rdc.rate;
-        sum_rdc.dist += this_rdc.dist;
-        sum_rdc.rdcost += this_rdc.rdcost;
+                                   bsize);
+  sum_rdc.rate += x->partition_cost[pl][partition];
+  sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
 #if CONFIG_SUPERTX
-        sum_rate_nocoef += this_rate_nocoef;
+  sum_rate_nocoef += x->partition_cost[pl][partition];
 #endif
-      }
+
+  if (sum_rdc.rdcost >= best_rdc->rdcost) return;
 
 #if CONFIG_SUPERTX
-      if (supertx_allowed && !abort_flag && sum_rdc.rdcost < INT64_MAX) {
-        TX_SIZE supertx_size = max_txsize_lookup[bsize];
-        const PARTITION_TYPE best_partition = pc_tree->partitioning;
-        pc_tree->partitioning = partition;
-        sum_rdc.rate += av1_cost_bit(
-            cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
-                                [supertx_size],
-            0);
-        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+  *best_rate_nocoef = sum_rate_nocoef;
+  assert(*best_rate_nocoef >= 0);
+#endif
+  *best_rdc = sum_rdc;
+  pc_tree->partitioning = partition;
 
-        if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
-          TX_TYPE best_tx = DCT_DCT;
-          RD_STATS tmp_rdc = { sum_rate_nocoef, 0, 0 };
+#undef RTP_STX_TRY_ARGS
+}
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
-          restore_context(x, x_ctx, mi_row, mi_col, bsize);
+#if CONFIG_DIST_8X8 && CONFIG_CB4X4
+static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                            uint8_t *y_src_8x8) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int64_t dist_8x8, dist_8x8_uv, total_dist;
+  const int src_stride = x->plane[0].src.stride;
+  uint8_t *decoded_8x8;
+  int plane;
 
-          rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
-                        &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree);
+#if CONFIG_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8);
+  else
+#endif
+    decoded_8x8 = (uint8_t *)x->decoded_8x8;
 
-          tmp_rdc.rate += av1_cost_bit(
-              cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
-                                  [supertx_size],
-              1);
-          tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
-          if (tmp_rdc.rdcost < sum_rdc.rdcost) {
-            sum_rdc = tmp_rdc;
-            update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
-                                    supertx_size, pc_tree);
-          }
-        }
+  dist_8x8 = av1_dist_8x8(cpi, x, y_src_8x8, src_stride, decoded_8x8, 8,
+                          BLOCK_8X8, 8, 8, 8, 8, x->qindex)
+             << 4;
 
-        pc_tree->partitioning = best_partition;
-      }
-#endif  // CONFIG_SUPERTX
+  // Compute chroma distortion for a luma 8x8 block
+  dist_8x8_uv = 0;
 
-      if (sum_rdc.rdcost < best_rdc->rdcost) {
-        int pl = partition_plane_context(xd, mi_row, mi_col,
-#if CONFIG_UNPOISON_PARTITION_CTX
-                                         has_rows, has_cols,
-#endif
-                                         bsize);
-        sum_rdc.rate += cpi->partition_cost[pl][partition];
-        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-#if CONFIG_SUPERTX
-        sum_rate_nocoef += cpi->partition_cost[pl][partition];
-#endif
-        if (sum_rdc.rdcost < best_rdc->rdcost) {
-#if CONFIG_SUPERTX
-          *best_rate_nocoef = sum_rate_nocoef;
-          assert(*best_rate_nocoef >= 0);
+  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+    const int src_stride_uv = x->plane[plane].src.stride;
+    const int dst_stride_uv = xd->plane[plane].dst.stride;
+    // The uv buffer pointers at this point (i.e. at the last sub8x8 block) are
+    // the same as those at the first sub8x8 block, because the uv buffer
+    // pointer is set only once, at the first sub8x8 block of an 8x8.
+    uint8_t *src_uv = x->plane[plane].src.buf;
+    uint8_t *dst_uv = xd->plane[plane].dst.buf;
+    unsigned sse;
+#if CONFIG_CHROMA_SUB8X8
+    const BLOCK_SIZE plane_bsize =
+        AOMMAX(BLOCK_4X4, get_plane_block_size(BLOCK_8X8, &xd->plane[plane]));
+#else
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(BLOCK_8X8, &xd->plane[plane]);
 #endif
-          *best_rdc = sum_rdc;
-          pc_tree->partitioning = partition;
-        }
-      }
-    }
+    cpi->fn_ptr[plane_bsize].vf(src_uv, src_stride_uv, dst_uv, dst_stride_uv,
+                                &sse);
+    dist_8x8_uv += (int64_t)sse << 4;
   }
+
+  return total_dist = dist_8x8 + dist_8x8_uv;
 }
-#endif  // CONFIG_EXT_PARTITION_TYPES
+#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
 
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
@@ -3327,7 +3529,8 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #endif
                                          bsize);
 #endif  // CONFIG_CB4X4
-  const int *partition_cost = cpi->partition_cost[pl];
+  const int *partition_cost =
+      pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0];
 #if CONFIG_SUPERTX
   int this_rate_nocoef, sum_rate_nocoef = 0, best_rate_nocoef = INT_MAX;
   int abort_flag;
@@ -3337,7 +3540,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #endif  // CONFIG_SUPERTX
 
   int do_rectangular_split = 1;
-#if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB
   BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
 #endif
 
@@ -3458,9 +3661,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
     src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src, mi_row,
                                                   mi_col, bsize);
   }
-#endif
 
-#if CONFIG_FP_MB_STATS
   // Decide whether we shall split directly and skip searching NONE by using
   // the first pass block statistics
   if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_square_split &&
@@ -3511,17 +3712,6 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
   }
 #endif
 
-#if CONFIG_SPEED_REFS
-  if (cpi->sb_scanning_pass_idx == 0) {
-    // NOTE: For the 1st pass of scanning, check all the subblocks of equal size
-    //       only.
-    partition_none_allowed = (bsize == MIN_SPEED_REFS_BLKSIZE);
-    partition_horz_allowed = 0;
-    partition_vert_allowed = 0;
-    do_square_split = (bsize > MIN_SPEED_REFS_BLKSIZE);
-  }
-#endif  // CONFIG_SPEED_REFS
-
   // PARTITION_NONE
   if (partition_none_allowed) {
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
@@ -3534,10 +3724,13 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
                      bsize, ctx_none, best_rdc.rdcost);
     if (this_rdc.rate != INT_MAX) {
       if (bsize_at_least_8x8) {
-        this_rdc.rate += partition_cost[PARTITION_NONE];
+        const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
+                                ? partition_cost[PARTITION_NONE]
+                                : 0;
+        this_rdc.rate += pt_cost;
         this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
 #if CONFIG_SUPERTX
-        this_rate_nocoef += partition_cost[PARTITION_NONE];
+        this_rate_nocoef += pt_cost;
 #endif
       }
 
@@ -3622,11 +3815,22 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #else
     restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
 #endif
+#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+    if (!x->skip_chroma_rd) {
+      cfl_clear_sub8x8_val(xd->cfl);
+    }
+#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
   }
 
   // store estimated motion vector
   if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
 
+#if CONFIG_SUPERTX
+  int64_t temp_best_rdcost = INT64_MAX;
+#else
+  int64_t temp_best_rdcost = best_rdc.rdcost;
+#endif
+
   // PARTITION_SPLIT
   // TODO(jingning): use the motion vectors given by the above search as
   // the starting point of motion search in the following partition type check.
@@ -3634,29 +3838,18 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
     int reached_last_index = 0;
     subsize = get_subsize(bsize, PARTITION_SPLIT);
     if (bsize == BLOCK_8X8 && !unify_bsize) {
-#if CONFIG_DUAL_FILTER
       if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
         pc_tree->leaf_split[0]->pred_interp_filter =
-            ctx_none->mic.mbmi.interp_filter[0];
-#else
-      if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
-        pc_tree->leaf_split[0]->pred_interp_filter =
-            ctx_none->mic.mbmi.interp_filter;
-#endif
-#if CONFIG_SUPERTX
+            av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0);
+
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_SUPERTX
                        &sum_rate_nocoef,
-#if CONFIG_EXT_PARTITION_TYPES
-                       PARTITION_SPLIT,
 #endif
-                       subsize, pc_tree->leaf_split[0], INT64_MAX);
-#else
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
 #if CONFIG_EXT_PARTITION_TYPES
                        PARTITION_SPLIT,
 #endif
-                       subsize, pc_tree->leaf_split[0], best_rdc.rdcost);
-#endif  // CONFIG_SUPERTX
+                       subsize, pc_tree->leaf_split[0], temp_best_rdcost);
       if (sum_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
 #if CONFIG_SUPERTX
@@ -3705,11 +3898,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
       reached_last_index = 1;
     } else {
       int idx;
-#if CONFIG_SUPERTX
-      for (idx = 0; idx < 4 && sum_rdc.rdcost < INT64_MAX; ++idx) {
-#else
-      for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) {
-#endif  // CONFIG_SUPERTX
+      for (idx = 0; idx < 4 && sum_rdc.rdcost < temp_best_rdcost; ++idx) {
         const int x_idx = (idx & 1) * mi_step;
         const int y_idx = (idx >> 1) * mi_step;
 
@@ -3719,21 +3908,14 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
         if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
 
         pc_tree->split[idx]->index = idx;
-#if CONFIG_SUPERTX
         rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
-                          mi_col + x_idx, subsize, &this_rdc, &this_rate_nocoef,
-                          INT64_MAX - sum_rdc.rdcost, pc_tree->split[idx]);
-#else
-        rd_pick_partition(
-            cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
-            &this_rdc, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[idx]);
-#endif  // CONFIG_SUPERTX
-
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        if (bsize == BLOCK_8X8 && this_rdc.rate != INT_MAX) {
-          assert(this_rdc.dist_y < INT64_MAX);
-        }
+                          mi_col + x_idx, subsize, &this_rdc,
+#if CONFIG_SUPERTX
+                          &this_rate_nocoef,
 #endif
+                          temp_best_rdcost - sum_rdc.rdcost,
+                          pc_tree->split[idx]);
+
         if (this_rdc.rate == INT_MAX) {
           sum_rdc.rdcost = INT64_MAX;
 #if CONFIG_SUPERTX
@@ -3747,37 +3929,18 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #if CONFIG_SUPERTX
           sum_rate_nocoef += this_rate_nocoef;
 #endif  // CONFIG_SUPERTX
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-          if (bsize == BLOCK_8X8) {
-            assert(this_rdc.dist_y < INT64_MAX);
-            sum_rdc.dist_y += this_rdc.dist_y;
-          }
-#endif
         }
       }
       reached_last_index = (idx == 4);
 
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (reached_last_index && sum_rdc.rdcost != INT64_MAX &&
-          bsize == BLOCK_8X8) {
-        int64_t dist_8x8;
+      if (x->using_dist_8x8 && reached_last_index &&
+          sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
         const int src_stride = x->plane[0].src.stride;
-        uint8_t *decoded_8x8;
-
-#if CONFIG_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-          decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8);
-        else
-#endif
-          decoded_8x8 = (uint8_t *)x->decoded_8x8;
-
+        int64_t dist_8x8;
         dist_8x8 =
-            av1_dist_8x8(cpi, xd, x->plane[0].src.buf - 4 * src_stride - 4,
-                         src_stride, decoded_8x8, 8, BLOCK_8X8, 8, 8, 8, 8,
-                         x->qindex)
-            << 4;
-        assert(sum_rdc.dist_y < INT64_MAX);
-        sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + dist_8x8;
+            dist_8x8_yuv(cpi, x, x->plane[0].src.buf - 4 * src_stride - 4);
+        sum_rdc.dist = dist_8x8;
         sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       }
 #endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
@@ -3823,6 +3986,11 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #endif  // CONFIG_SUPERTX
     }
 
+#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+    if (!reached_last_index && sum_rdc.rdcost >= best_rdc.rdcost)
+      cfl_clear_sub8x8_val(xd->cfl);
+#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+
     if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_SPLIT];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -3835,6 +4003,8 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #if CONFIG_SUPERTX
         best_rate_nocoef = sum_rate_nocoef;
         assert(best_rate_nocoef >= 0);
+#else
+        temp_best_rdcost = best_rdc.rdcost;
 #endif  // CONFIG_SUPERTX
         pc_tree->partitioning = PARTITION_SPLIT;
       }
@@ -3855,17 +4025,11 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
       (do_rectangular_split || av1_active_h_edge(cpi, mi_row, mi_step))) {
     subsize = get_subsize(bsize, PARTITION_HORZ);
     if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
-#if CONFIG_DUAL_FILTER
-    if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
-        partition_none_allowed)
-      pc_tree->horizontal[0].pred_interp_filter =
-          ctx_none->mic.mbmi.interp_filter[0];
-#else
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
         partition_none_allowed)
       pc_tree->horizontal[0].pred_interp_filter =
-          ctx_none->mic.mbmi.interp_filter;
-#endif
+          av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0);
+
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
 #if CONFIG_SUPERTX
                      &sum_rate_nocoef,
@@ -3879,11 +4043,9 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
     abort_flag =
         (sum_rdc.rdcost >= best_rd && (bsize > BLOCK_8X8 || unify_bsize)) ||
         (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8);
-    if (sum_rdc.rdcost < INT64_MAX &&
-#else
-    if (sum_rdc.rdcost < best_rdc.rdcost &&
-#endif  // CONFIG_SUPERTX
-        !force_horz_split && (bsize > BLOCK_8X8 || unify_bsize)) {
+#endif
+    if (sum_rdc.rdcost < temp_best_rdcost && !force_horz_split &&
+        (bsize > BLOCK_8X8 || unify_bsize)) {
       PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
       update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
       encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
@@ -3891,17 +4053,11 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 
       if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
 
-#if CONFIG_DUAL_FILTER
-      if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
-          partition_none_allowed)
-        pc_tree->horizontal[1].pred_interp_filter =
-            ctx_h->mic.mbmi.interp_filter[0];
-#else
       if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
           partition_none_allowed)
         pc_tree->horizontal[1].pred_interp_filter =
-            ctx_none->mic.mbmi.interp_filter;
-#endif
+            av1_extract_interp_filter(ctx_h->mic.mbmi.interp_filters, 0);
+
 #if CONFIG_SUPERTX
       rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
                        &this_rate_nocoef,
@@ -3919,7 +4075,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #endif  // CONFIG_SUPERTX
 
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
+      if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
         update_state(cpi, td, &pc_tree->horizontal[1], mi_row + mi_step, mi_col,
                      subsize, DRY_RUN_NORMAL);
         encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row + mi_step, mi_col,
@@ -3939,28 +4095,14 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #if CONFIG_SUPERTX
         sum_rate_nocoef += this_rate_nocoef;
 #endif  // CONFIG_SUPERTX
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        sum_rdc.dist_y += this_rdc.dist_y;
-#endif
       }
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
-        int64_t dist_8x8;
+      if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
+          bsize == BLOCK_8X8) {
         const int src_stride = x->plane[0].src.stride;
-        uint8_t *decoded_8x8;
-
-#if CONFIG_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-          decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8);
-        else
-#endif
-          decoded_8x8 = (uint8_t *)x->decoded_8x8;
-
-        dist_8x8 = av1_dist_8x8(cpi, xd, x->plane[0].src.buf - 4 * src_stride,
-                                src_stride, decoded_8x8, 8, BLOCK_8X8, 8, 8, 8,
-                                8, x->qindex)
-                   << 4;
-        sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + dist_8x8;
+        int64_t dist_8x8;
+        dist_8x8 = dist_8x8_yuv(cpi, x, x->plane[0].src.buf - 4 * src_stride);
+        sum_rdc.dist = dist_8x8;
         sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       }
 #endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
@@ -4007,6 +4149,9 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
     }
 #endif  // CONFIG_SUPERTX
 
+#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+    cfl_clear_sub8x8_val(xd->cfl);
+#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_HORZ];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -4036,17 +4181,11 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 
     if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
 
-#if CONFIG_DUAL_FILTER
-    if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
-        partition_none_allowed)
-      pc_tree->vertical[0].pred_interp_filter =
-          ctx_none->mic.mbmi.interp_filter[0];
-#else
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
         partition_none_allowed)
       pc_tree->vertical[0].pred_interp_filter =
-          ctx_none->mic.mbmi.interp_filter;
-#endif
+          av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0);
+
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
 #if CONFIG_SUPERTX
                      &sum_rate_nocoef,
@@ -4059,28 +4198,23 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
     abort_flag =
         (sum_rdc.rdcost >= best_rd && (bsize > BLOCK_8X8 || unify_bsize)) ||
         (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8);
-    if (sum_rdc.rdcost < INT64_MAX &&
+    const int64_t vert_max_rdcost = INT64_MAX;
 #else
-    if (sum_rdc.rdcost < best_rdc.rdcost &&
+    const int64_t vert_max_rdcost = best_rdc.rdcost;
 #endif  // CONFIG_SUPERTX
-        !force_vert_split && (bsize > BLOCK_8X8 || unify_bsize)) {
+    if (sum_rdc.rdcost < vert_max_rdcost && !force_vert_split &&
+        (bsize > BLOCK_8X8 || unify_bsize)) {
       update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1);
       encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
                         NULL);
 
       if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
 
-#if CONFIG_DUAL_FILTER
       if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
           partition_none_allowed)
         pc_tree->vertical[1].pred_interp_filter =
-            ctx_none->mic.mbmi.interp_filter[0];
-#else
-      if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
-          partition_none_allowed)
-        pc_tree->vertical[1].pred_interp_filter =
-            ctx_none->mic.mbmi.interp_filter;
-#endif
+            av1_extract_interp_filter(ctx_none->mic.mbmi.interp_filters, 0);
+
 #if CONFIG_SUPERTX
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
                        &this_rate_nocoef,
@@ -4099,7 +4233,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #endif  // CONFIG_SUPERTX
 
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
+      if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
         update_state(cpi, td, &pc_tree->vertical[1], mi_row, mi_col + mi_step,
                      subsize, DRY_RUN_NORMAL);
         encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col + mi_step,
@@ -4119,28 +4253,13 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #if CONFIG_SUPERTX
         sum_rate_nocoef += this_rate_nocoef;
 #endif  // CONFIG_SUPERTX
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        sum_rdc.dist_y += this_rdc.dist_y;
-#endif
       }
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
+      if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
+          bsize == BLOCK_8X8) {
         int64_t dist_8x8;
-        const int src_stride = x->plane[0].src.stride;
-        uint8_t *decoded_8x8;
-
-#if CONFIG_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-          decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8);
-        else
-#endif
-          decoded_8x8 = (uint8_t *)x->decoded_8x8;
-
-        dist_8x8 =
-            av1_dist_8x8(cpi, xd, x->plane[0].src.buf - 4, src_stride,
-                         decoded_8x8, 8, BLOCK_8X8, 8, 8, 8, 8, x->qindex)
-            << 4;
-        sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + dist_8x8;
+        dist_8x8 = dist_8x8_yuv(cpi, x, x->plane[0].src.buf - 4);
+        sum_rdc.dist = dist_8x8;
         sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       }
 #endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
@@ -4186,6 +4305,10 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
     }
 #endif  // CONFIG_SUPERTX
 
+#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+    cfl_clear_sub8x8_val(xd->cfl);
+#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_VERT];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -4209,9 +4332,31 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
   }
 
 #if CONFIG_EXT_PARTITION_TYPES
+  const int ext_partition_allowed =
+      do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed;
+
+#if CONFIG_EXT_PARTITION && CONFIG_EXT_PARTITION_TYPES_AB
+  // Don't allow A/B partitions on 128x128 blocks for now (support for
+  // 128x32 and 32x128 blocks doesn't yet exist).
+  const int ab_partition_allowed =
+      ext_partition_allowed && bsize < BLOCK_128X128;
+#else
+  const int ab_partition_allowed = ext_partition_allowed;
+#endif
+
   // PARTITION_HORZ_A
-  if (partition_horz_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
-      partition_none_allowed) {
+  if (partition_horz_allowed && ab_partition_allowed) {
+#if CONFIG_EXT_PARTITION_TYPES_AB
+    rd_test_partition3(
+        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontala,
+        ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_A,
+#if CONFIG_SUPERTX
+        best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+        mi_row, mi_col, get_subsize(bsize, PARTITION_HORZ_4),
+        mi_row + mi_step / 2, mi_col, get_subsize(bsize, PARTITION_HORZ_4),
+        mi_row + mi_step, mi_col, get_subsize(bsize, PARTITION_HORZ));
+#else
     subsize = get_subsize(bsize, PARTITION_HORZ_A);
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize,
@@ -4221,11 +4366,26 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #endif
                        mi_row, mi_col, bsize2, mi_row, mi_col + mi_step, bsize2,
                        mi_row + mi_step, mi_col, subsize);
+#endif
+#if !CONFIG_PVQ
     restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif  // !CONFIG_PVQ
   }
   // PARTITION_HORZ_B
-  if (partition_horz_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
-      partition_none_allowed) {
+  if (partition_horz_allowed && ab_partition_allowed) {
+#if CONFIG_EXT_PARTITION_TYPES_AB
+    rd_test_partition3(
+        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontalb,
+        ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_B,
+#if CONFIG_SUPERTX
+        best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+        mi_row, mi_col, get_subsize(bsize, PARTITION_HORZ), mi_row + mi_step,
+        mi_col, get_subsize(bsize, PARTITION_HORZ_4), mi_row + 3 * mi_step / 2,
+        mi_col, get_subsize(bsize, PARTITION_HORZ_4));
+#else
     subsize = get_subsize(bsize, PARTITION_HORZ_B);
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize,
@@ -4235,11 +4395,26 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #endif
                        mi_row, mi_col, subsize, mi_row + mi_step, mi_col,
                        bsize2, mi_row + mi_step, mi_col + mi_step, bsize2);
+#endif
+#if !CONFIG_PVQ
     restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif  // !CONFIG_PVQ
   }
   // PARTITION_VERT_A
-  if (partition_vert_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
-      partition_none_allowed) {
+  if (partition_vert_allowed && ab_partition_allowed) {
+#if CONFIG_EXT_PARTITION_TYPES_AB
+    rd_test_partition3(
+        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticala,
+        ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_A,
+#if CONFIG_SUPERTX
+        best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+        mi_row, mi_col, get_subsize(bsize, PARTITION_VERT_4), mi_row,
+        mi_col + mi_step / 2, get_subsize(bsize, PARTITION_VERT_4), mi_row,
+        mi_col + mi_step, get_subsize(bsize, PARTITION_VERT));
+#else
     subsize = get_subsize(bsize, PARTITION_VERT_A);
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->verticala, ctx_none, mi_row, mi_col, bsize,
@@ -4249,11 +4424,26 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #endif
                        mi_row, mi_col, bsize2, mi_row + mi_step, mi_col, bsize2,
                        mi_row, mi_col + mi_step, subsize);
+#endif
+#if !CONFIG_PVQ
     restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif  // !CONFIG_PVQ
   }
   // PARTITION_VERT_B
-  if (partition_vert_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
-      partition_none_allowed) {
+  if (partition_vert_allowed && ab_partition_allowed) {
+#if CONFIG_EXT_PARTITION_TYPES_AB
+    rd_test_partition3(
+        cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticalb,
+        ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_B,
+#if CONFIG_SUPERTX
+        best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+        mi_row, mi_col, get_subsize(bsize, PARTITION_VERT), mi_row,
+        mi_col + mi_step, get_subsize(bsize, PARTITION_VERT_4), mi_row,
+        mi_col + 3 * mi_step / 2, get_subsize(bsize, PARTITION_VERT_4));
+#else
     subsize = get_subsize(bsize, PARTITION_VERT_B);
     rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
                        pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize,
@@ -4263,52 +4453,47 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #endif
                        mi_row, mi_col, subsize, mi_row, mi_col + mi_step,
                        bsize2, mi_row + mi_step, mi_col + mi_step, bsize2);
+#endif
+#if !CONFIG_PVQ
     restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif  // !CONFIG_PVQ
   }
 
+#if CONFIG_EXT_PARTITION
+  const int can_partition_4 = (bsize == BLOCK_128X128 || bsize == BLOCK_64X64 ||
+                               bsize == BLOCK_32X32 || bsize == BLOCK_16X16);
+#else
+  const int can_partition_4 =
+      (bsize == BLOCK_64X64 || bsize == BLOCK_32X32 || bsize == BLOCK_16X16);
+#endif  // CONFIG_EXT_PARTITION
+
   // PARTITION_HORZ_4
   // TODO(david.barker): For this and PARTITION_VERT_4,
   // * Add support for BLOCK_16X16 once we support 2x8 and 8x2 blocks for the
   //   chroma plane
   // * Add support for supertx
-  if (bsize == BLOCK_32X32 && partition_horz_allowed && !force_horz_split &&
+  if (can_partition_4 && partition_horz_allowed && !force_horz_split &&
       (do_rectangular_split || av1_active_h_edge(cpi, mi_row, mi_step))) {
-    int i;
     const int quarter_step = mi_size_high[bsize] / 4;
     PICK_MODE_CONTEXT *ctx_prev = ctx_none;
 
     subsize = get_subsize(bsize, PARTITION_HORZ_4);
-    av1_zero(sum_rdc);
 
-    for (i = 0; i < 4; ++i) {
+    for (int i = 0; i < 4; ++i) {
       int this_mi_row = mi_row + i * quarter_step;
 
       if (i > 0 && this_mi_row >= cm->mi_rows) break;
 
-      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_prev);
-
-      ctx_prev = &pc_tree->horizontal4[i];
+      PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i];
 
-      rd_pick_sb_modes(cpi, tile_data, x, this_mi_row, mi_col, &this_rdc,
-                       PARTITION_HORZ_4, subsize, ctx_prev,
-                       best_rdc.rdcost - sum_rdc.rdcost);
-
-      if (this_rdc.rate == INT_MAX) {
-        sum_rdc.rdcost = INT64_MAX;
+      if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3),
+                           this_mi_row, mi_col, subsize, &best_rdc, &sum_rdc,
+                           &this_rdc, PARTITION_HORZ_4, ctx_prev, ctx_this))
         break;
-      } else {
-        sum_rdc.rate += this_rdc.rate;
-        sum_rdc.dist += this_rdc.dist;
-        sum_rdc.rdcost += this_rdc.rdcost;
-      }
 
-      if (sum_rdc.rdcost >= best_rdc.rdcost) break;
-
-      if (i < 3) {
-        update_state(cpi, td, ctx_prev, this_mi_row, mi_col, subsize, 1);
-        encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, this_mi_row, mi_col,
-                          subsize, NULL);
-      }
+      ctx_prev = ctx_this;
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
@@ -4326,43 +4511,26 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #endif
   }
   // PARTITION_VERT_4
-  if (bsize == BLOCK_32X32 && partition_vert_allowed && !force_vert_split &&
+  if (can_partition_4 && partition_vert_allowed && !force_vert_split &&
       (do_rectangular_split || av1_active_v_edge(cpi, mi_row, mi_step))) {
-    int i;
     const int quarter_step = mi_size_wide[bsize] / 4;
     PICK_MODE_CONTEXT *ctx_prev = ctx_none;
 
     subsize = get_subsize(bsize, PARTITION_VERT_4);
-    av1_zero(sum_rdc);
 
-    for (i = 0; i < 4; ++i) {
+    for (int i = 0; i < 4; ++i) {
       int this_mi_col = mi_col + i * quarter_step;
 
       if (i > 0 && this_mi_col >= cm->mi_cols) break;
 
-      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_prev);
+      PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i];
 
-      ctx_prev = &pc_tree->vertical4[i];
-
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, this_mi_col, &this_rdc,
-                       PARTITION_VERT_4, subsize, ctx_prev,
-                       best_rdc.rdcost - sum_rdc.rdcost);
-
-      if (this_rdc.rate == INT_MAX) {
-        sum_rdc.rdcost = INT64_MAX;
-      } else {
-        sum_rdc.rate += this_rdc.rate;
-        sum_rdc.dist += this_rdc.dist;
-        sum_rdc.rdcost += this_rdc.rdcost;
-      }
-
-      if (sum_rdc.rdcost >= best_rdc.rdcost) break;
+      if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 0), (i == 3), mi_row,
+                           this_mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
+                           PARTITION_VERT_4, ctx_prev, ctx_this))
+        break;
 
-      if (i < 3) {
-        update_state(cpi, td, ctx_prev, mi_row, this_mi_col, subsize, 1);
-        encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, this_mi_col,
-                          subsize, NULL);
-      }
+      ctx_prev = ctx_this;
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
@@ -4381,11 +4549,6 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
   }
 #endif  // CONFIG_EXT_PARTITION_TYPES
 
-#if CONFIG_SPEED_REFS
-  // First scanning is done.
-  if (cpi->sb_scanning_pass_idx == 0 && bsize == cm->sb_size) return;
-#endif  // CONFIG_SPEED_REFS
-
   // TODO(jbb): This code added so that we avoid static analysis
   // warning related to the fact that best_rd isn't used after this
   // point.  This code should be refactored so that the duplicate
@@ -4393,25 +4556,24 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
   (void)best_rd;
   *rd_cost = best_rdc;
 
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  if (bsize <= BLOCK_8X8 && rd_cost->rate != INT_MAX) {
-    assert(rd_cost->dist_y < INT64_MAX);
-  }
-#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
 #if CONFIG_SUPERTX
   *rate_nocoef = best_rate_nocoef;
 #endif  // CONFIG_SUPERTX
 
-#if CONFIG_CFL
-  // Store the luma for the best mode
-  x->cfl_store_y = 1;
-#endif
   if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
       pc_tree->index != 3) {
     if (bsize == cm->sb_size) {
-#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+#if CONFIG_MOTION_VAR && NC_MODE_INFO
       set_mode_info_sb(cpi, td, tile_info, tp, mi_row, mi_col, bsize, pc_tree);
 #endif
+
+#if CONFIG_LV_MAP
+      x->cb_offset = 0;
+#endif
+
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+      set_sb_mi_boundaries(cm, xd, mi_row, mi_col);
+#endif
       encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
                 pc_tree, NULL);
     } else {
@@ -4419,13 +4581,10 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
                 pc_tree, NULL);
     }
   }
-#if CONFIG_CFL
-  x->cfl_store_y = 0;
-#endif
 
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
-      bsize == BLOCK_4X4 && pc_tree->index == 3) {
+  if (x->using_dist_8x8 && best_rdc.rate < INT_MAX &&
+      best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) {
     encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
               pc_tree, NULL);
   }
@@ -4442,22 +4601,6 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
   }
 }
 
-#if CONFIG_SPEED_REFS
-static void restore_mi(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                       int mi_row, int mi_col) {
-  const AV1_COMMON *cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
-  int x_idx, y;
-  for (y = 0; y < mi_size_high[cm->sb_size]; y++)
-    for (x_idx = 0; x_idx < mi_size_wide[cm->sb_size]; x_idx++)
-      if (mi_col + x_idx < cm->mi_cols && mi_row + y < cm->mi_rows) {
-        memset(xd->mi + y * cm->mi_stride + x_idx, 0, sizeof(*xd->mi));
-        memset(x->mbmi_ext + y * cm->mi_cols + x_idx, 0, sizeof(*x->mbmi_ext));
-      }
-}
-#endif  // CONFIG_SPEED_REFS
-
 static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
                              TileDataEnc *tile_data, int mi_row,
                              TOKENEXTRA **tp) {
@@ -4476,14 +4619,18 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
   // Initialize the left context for the new SB row
   av1_zero_left_context(xd);
 
-#if CONFIG_DELTA_Q
   // Reset delta for every tile
   if (cm->delta_q_present_flag)
     if (mi_row == tile_info->mi_row_start) xd->prev_qindex = cm->base_qindex;
 #if CONFIG_EXT_DELTA_Q
-  if (cm->delta_lf_present_flag)
+  if (cm->delta_lf_present_flag) {
+#if CONFIG_LOOPFILTER_LEVEL
+    if (mi_row == tile_info->mi_row_start)
+      for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id)
+        xd->prev_delta_lf[lf_id] = 0;
+#endif  // CONFIG_LOOPFILTER_LEVEL
     if (mi_row == tile_info->mi_row_start) xd->prev_delta_lf_from_base = 0;
-#endif
+  }
 #endif
 
   // Code each SB in the row
@@ -4503,9 +4650,21 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
     MODE_INFO **mi = cm->mi_grid_visible + idx_str;
     PC_TREE *const pc_root = td->pc_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
 
+#if CONFIG_LV_MAP && LV_MAP_PROB
+    av1_fill_coeff_costs(&td->mb, xd->tile_ctx);
+#else
+    av1_fill_token_costs_from_cdf(x->token_head_costs,
+                                  x->e_mbd.tile_ctx->coef_head_cdfs);
+    av1_fill_token_costs_from_cdf(x->token_tail_costs,
+                                  x->e_mbd.tile_ctx->coef_tail_cdfs);
+#endif
+    av1_fill_mode_rates(cm, x, xd->tile_ctx);
+
     if (sf->adaptive_pred_interp_filter) {
+#if !CONFIG_CB4X4
       for (i = 0; i < leaf_nodes; ++i)
         td->leaf_tree[i].pred_interp_filter = SWITCHABLE;
+#endif
 
       for (i = 0; i < leaf_nodes; ++i) {
         td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
@@ -4515,6 +4674,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
       }
     }
 
+    x->tx_rd_record.num = x->tx_rd_record.index_start = 0;
     av1_zero(x->pred_mv);
     pc_root->index = 0;
 
@@ -4524,8 +4684,10 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
       int segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col);
       seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
     }
+#if CONFIG_AMVR
+    xd->cur_frame_mv_precision_level = cm->cur_frame_mv_precision_level;
+#endif
 
-#if CONFIG_DELTA_Q
     if (cm->delta_q_present_flag) {
       // Test mode for delta quantization
       int sb_row = mi_row >> 3;
@@ -4545,7 +4707,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
       assert(current_qindex > 0);
 
       xd->delta_qindex = current_qindex - cm->base_qindex;
-      set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
       xd->mi[0]->mbmi.current_q_index = current_qindex;
 #if !CONFIG_EXT_DELTA_Q
       xd->mi[0]->mbmi.segment_id = 0;
@@ -4564,13 +4726,19 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
         for (j = 0; j < AOMMIN(cm->mib_size, cm->mi_rows - mi_row); j++) {
           for (k = 0; k < AOMMIN(cm->mib_size, cm->mi_cols - mi_col); k++) {
             cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)]
-                .mbmi.current_delta_lf_from_base = current_delta_lf_from_base;
+                .mbmi.current_delta_lf_from_base =
+                clamp(current_delta_lf_from_base, 0, MAX_LOOP_FILTER);
+#if CONFIG_LOOPFILTER_LEVEL
+            for (int lf_id = 0; lf_id < FRAME_LF_COUNT; ++lf_id) {
+              cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)]
+                  .mbmi.curr_delta_lf[lf_id] = current_delta_lf_from_base;
+            }
+#endif  // CONFIG_LOOPFILTER_LEVEL
           }
         }
       }
 #endif  // CONFIG_EXT_DELTA_Q
     }
-#endif  // CONFIG_DELTA_Q
 
     x->source_variance = UINT_MAX;
     if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
@@ -4602,35 +4770,12 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
         rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
                                 &x->min_partition_size, &x->max_partition_size);
       }
-#if CONFIG_SPEED_REFS
-      // NOTE: Two scanning passes for the current superblock - the first pass
-      //       is only targeted to collect stats.
-      int m_search_count_backup = *(x->m_search_count_ptr);
-      for (int sb_pass_idx = 0; sb_pass_idx < 2; ++sb_pass_idx) {
-        cpi->sb_scanning_pass_idx = sb_pass_idx;
-        if (frame_is_intra_only(cm) && sb_pass_idx == 0) continue;
-
-        rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size,
-                          &dummy_rdc,
-#if CONFIG_SUPERTX
-                          &dummy_rate_nocoef,
-#endif  // CONFIG_SUPERTX
-                          INT64_MAX, pc_root);
-        if (sb_pass_idx == 0) {
-          av1_zero(x->pred_mv);
-          pc_root->index = 0;
-          restore_mi(cpi, x, mi_row, mi_col);
-          *(x->m_search_count_ptr) = m_search_count_backup;
-        }
-      }
-#else  // !CONFIG_SPEED_REFS
       rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size,
                         &dummy_rdc,
 #if CONFIG_SUPERTX
                         &dummy_rate_nocoef,
 #endif  // CONFIG_SUPERTX
                         INT64_MAX, pc_root);
-#endif  // CONFIG_SPEED_REFS
     }
   }
 }
@@ -4656,7 +4801,7 @@ static int check_dual_ref_flags(AV1_COMP *cpi) {
     return (!!(ref_flags & AOM_GOLD_FLAG) + !!(ref_flags & AOM_LAST_FLAG) +
 #if CONFIG_EXT_REFS
             !!(ref_flags & AOM_LAST2_FLAG) + !!(ref_flags & AOM_LAST3_FLAG) +
-            !!(ref_flags & AOM_BWD_FLAG) +
+            !!(ref_flags & AOM_BWD_FLAG) + !!(ref_flags & AOM_ALT2_FLAG) +
 #endif  // CONFIG_EXT_REFS
             !!(ref_flags & AOM_ALT_FLAG)) >= 2;
   }
@@ -4686,9 +4831,13 @@ static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) {
            cpi->rc.is_src_frame_ext_arf)
 #else
   else if (cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame)
-#endif
+#endif  // CONFIG_EXT_REFS
     return ALTREF_FRAME;
-  else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
+  else if (cpi->refresh_golden_frame ||
+#if CONFIG_EXT_REFS
+           cpi->refresh_alt2_ref_frame ||
+#endif  // CONFIG_EXT_REFS
+           cpi->refresh_alt_ref_frame)
     return GOLDEN_FRAME;
   else
     // TODO(zoeliu): To investigate whether a frame_type other than
@@ -4872,7 +5021,12 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
   td->mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
 #endif  // CONFIG_PVQ
 
-  av1_setup_across_tile_boundary_info(cm, tile_info);
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+  if (!cm->loop_filter_across_tiles_enabled)
+    av1_setup_across_tile_boundary_info(cm, tile_info);
+#endif
+
+  av1_crc_calculator_init(&td->mb.tx_rd_record.crc_calculator, 24, 0x5D6DCB);
 
   for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
        mi_row += cm->mib_size) {
@@ -4925,8 +5079,8 @@ static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
 
 #if CONFIG_GLOBAL_MOTION
 #define GLOBAL_TRANS_TYPES_ENC 3  // highest motion model to search
-static int gm_get_params_cost(WarpedMotionParams *gm,
-                              WarpedMotionParams *ref_gm, int allow_hp) {
+static int gm_get_params_cost(const WarpedMotionParams *gm,
+                              const WarpedMotionParams *ref_gm, int allow_hp) {
   assert(gm->wmtype < GLOBAL_TRANS_TYPES);
   int params_cost = 0;
   int trans_bits, trans_prec_diff;
@@ -5010,7 +5164,8 @@ static int do_gm_search_logic(SPEED_FEATURES *const sf, int num_refs_using_gm,
 }
 #endif  // CONFIG_GLOBAL_MOTION
 
-#if CONFIG_PALETTE
+// TODO(anybody) : remove this flag when PVQ supports palette coding tool
+#if !CONFIG_PVQ
 // Estimate if the source frame is screen content, based on the portion of
 // blocks that have no more than 4 (experimentally selected) luma colors.
 static int is_screen_content(const uint8_t *src,
@@ -5038,7 +5193,7 @@ static int is_screen_content(const uint8_t *src,
   // The threshold is 10%.
   return counts * blk_h * blk_w * 10 > width * height;
 }
-#endif  // CONFIG_PALETTE
+#endif  // !CONFIG_PVQ
 
 static void encode_frame_internal(AV1_COMP *cpi) {
   ThreadData *const td = &cpi->td;
@@ -5057,18 +5212,21 @@ static void encode_frame_internal(AV1_COMP *cpi) {
 
   x->min_partition_size = AOMMIN(x->min_partition_size, cm->sb_size);
   x->max_partition_size = AOMMIN(x->max_partition_size, cm->sb_size);
+#if CONFIG_DIST_8X8
+  x->using_dist_8x8 = cpi->oxcf.using_dist_8x8;
+  x->tune_metric = cpi->oxcf.tuning;
+#endif
   cm->setup_mi(cm);
 
   xd->mi = cm->mi_grid_visible;
   xd->mi[0] = cm->mi;
 
   av1_zero(*td->counts);
-  av1_zero(rdc->coef_counts);
   av1_zero(rdc->comp_pred_diff);
 
-#if CONFIG_PALETTE || CONFIG_INTRABC
   if (frame_is_intra_only(cm)) {
-#if CONFIG_PALETTE
+// TODO(anybody) : remove this flag when PVQ supports palette coding tool
+#if !CONFIG_PVQ
     cm->allow_screen_content_tools =
         cpi->oxcf.content == AOM_CONTENT_SCREEN ||
         is_screen_content(cpi->source->y_buffer,
@@ -5078,10 +5236,80 @@ static void encode_frame_internal(AV1_COMP *cpi) {
                           cpi->source->y_stride, cpi->source->y_width,
                           cpi->source->y_height);
 #else
-    cm->allow_screen_content_tools = cpi->oxcf.content == AOM_CONTENT_SCREEN;
-#endif  // CONFIG_PALETTE
+    cm->allow_screen_content_tools = 0;
+#endif  // !CONFIG_PVQ
+  }
+
+#if CONFIG_HASH_ME
+  if (cpi->oxcf.pass != 1 && cpi->common.allow_screen_content_tools) {
+    // add to hash table
+    const int pic_width = cpi->source->y_crop_width;
+    const int pic_height = cpi->source->y_crop_height;
+    uint32_t *block_hash_values[2][2];
+    int8_t *is_block_same[2][3];
+    int k, j;
+
+    for (k = 0; k < 2; k++) {
+      for (j = 0; j < 2; j++) {
+        CHECK_MEM_ERROR(cm, block_hash_values[k][j],
+                        aom_malloc(sizeof(uint32_t) * pic_width * pic_height));
+      }
+
+      for (j = 0; j < 3; j++) {
+        CHECK_MEM_ERROR(cm, is_block_same[k][j],
+                        aom_malloc(sizeof(int8_t) * pic_width * pic_height));
+      }
+    }
+
+    av1_hash_table_create(&cm->cur_frame->hash_table);
+    av1_generate_block_2x2_hash_value(cpi->source, block_hash_values[0],
+                                      is_block_same[0]);
+    av1_generate_block_hash_value(cpi->source, 4, block_hash_values[0],
+                                  block_hash_values[1], is_block_same[0],
+                                  is_block_same[1]);
+    av1_add_to_hash_map_by_row_with_precal_data(
+        &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
+        pic_width, pic_height, 4);
+    av1_generate_block_hash_value(cpi->source, 8, block_hash_values[1],
+                                  block_hash_values[0], is_block_same[1],
+                                  is_block_same[0]);
+    av1_add_to_hash_map_by_row_with_precal_data(
+        &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
+        pic_width, pic_height, 8);
+    av1_generate_block_hash_value(cpi->source, 16, block_hash_values[0],
+                                  block_hash_values[1], is_block_same[0],
+                                  is_block_same[1]);
+    av1_add_to_hash_map_by_row_with_precal_data(
+        &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
+        pic_width, pic_height, 16);
+    av1_generate_block_hash_value(cpi->source, 32, block_hash_values[1],
+                                  block_hash_values[0], is_block_same[1],
+                                  is_block_same[0]);
+    av1_add_to_hash_map_by_row_with_precal_data(
+        &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
+        pic_width, pic_height, 32);
+    av1_generate_block_hash_value(cpi->source, 64, block_hash_values[0],
+                                  block_hash_values[1], is_block_same[0],
+                                  is_block_same[1]);
+    av1_add_to_hash_map_by_row_with_precal_data(
+        &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
+        pic_width, pic_height, 64);
+
+    for (k = 0; k < 2; k++) {
+      for (j = 0; j < 2; j++) {
+        aom_free(block_hash_values[k][j]);
+      }
+
+      for (j = 0; j < 3; j++) {
+        aom_free(is_block_same[k][j]);
+      }
+    }
   }
-#endif  // CONFIG_PALETTE || CONFIG_INTRABC
+#endif
+
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+  alloc_ncobmc_pred_buffer(xd);
+#endif
 
 #if CONFIG_GLOBAL_MOTION
   av1_zero(rdc->global_motion_used);
@@ -5102,6 +5330,10 @@ static void encode_frame_internal(AV1_COMP *cpi) {
     for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
       ref_buf[frame] = get_ref_frame_buffer(cpi, frame);
       int pframe;
+      cm->global_motion[frame] = default_warp_params;
+      const WarpedMotionParams *ref_params =
+          cm->error_resilient_mode ? &default_warp_params
+                                   : &cm->prev_frame->global_motion[frame];
       // check for duplicate buffer
       for (pframe = LAST_FRAME; pframe < frame; ++pframe) {
         if (ref_buf[frame] == ref_buf[pframe]) break;
@@ -5168,7 +5400,7 @@ static void encode_frame_internal(AV1_COMP *cpi) {
           }
           if (cm->global_motion[frame].wmtype <= AFFINE)
             if (!get_shear_params(&cm->global_motion[frame]))
-              set_default_warp_params(&cm->global_motion[frame]);
+              cm->global_motion[frame] = default_warp_params;
 
           if (cm->global_motion[frame].wmtype == TRANSLATION) {
             cm->global_motion[frame].wmmat[0] =
@@ -5185,10 +5417,9 @@ static void encode_frame_internal(AV1_COMP *cpi) {
           // this motion type, revert to IDENTITY.
           if (!is_enough_erroradvantage(
                   (double)best_warp_error / ref_frame_error,
-                  gm_get_params_cost(&cm->global_motion[frame],
-                                     &cm->prev_frame->global_motion[frame],
+                  gm_get_params_cost(&cm->global_motion[frame], ref_params,
                                      cm->allow_high_precision_mv))) {
-            set_default_warp_params(&cm->global_motion[frame]);
+            cm->global_motion[frame] = default_warp_params;
           }
           if (cm->global_motion[frame].wmtype != IDENTITY) break;
         }
@@ -5196,8 +5427,7 @@ static void encode_frame_internal(AV1_COMP *cpi) {
       }
       if (cm->global_motion[frame].wmtype != IDENTITY) num_refs_using_gm++;
       cpi->gmparams_cost[frame] =
-          gm_get_params_cost(&cm->global_motion[frame],
-                             &cm->prev_frame->global_motion[frame],
+          gm_get_params_cost(&cm->global_motion[frame], ref_params,
                              cm->allow_high_precision_mv) +
           cpi->gmtype_cost[cm->global_motion[frame].wmtype] -
           cpi->gmtype_cost[IDENTITY];
@@ -5221,7 +5451,6 @@ static void encode_frame_internal(AV1_COMP *cpi) {
 
   cm->tx_mode = select_tx_mode(cpi);
 
-#if CONFIG_DELTA_Q
   // Fix delta q resolution for the moment
   cm->delta_q_res = DEFAULT_DELTA_Q_RES;
 // Set delta_q_present_flag before it is used for the first time
@@ -5234,7 +5463,6 @@ static void encode_frame_internal(AV1_COMP *cpi) {
   cm->delta_q_present_flag =
       cpi->oxcf.aq_mode == DELTA_AQ && cm->base_qindex > 0;
 #endif  // CONFIG_EXT_DELTA_Q
-#endif
 
   av1_frame_init_quantizer(cpi);
 
@@ -5262,19 +5490,7 @@ static void encode_frame_internal(AV1_COMP *cpi) {
 #endif  // CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING
 
 #if CONFIG_TEMPMV_SIGNALING
-  if (cm->prev_frame) {
-    cm->use_prev_frame_mvs &=
-        !cm->error_resilient_mode &&
-#if CONFIG_FRAME_SUPERRES
-        cm->width == cm->last_width && cm->height == cm->last_height &&
-#else
-        cm->width == cm->prev_frame->buf.y_crop_width &&
-        cm->height == cm->prev_frame->buf.y_crop_height &&
-#endif  // CONFIG_FRAME_SUPERRES
-        !cm->intra_only && !cm->prev_frame->intra_only && cm->last_show_frame;
-  } else {
-    cm->use_prev_frame_mvs = 0;
-  }
+  cm->use_prev_frame_mvs &= frame_can_use_prev_frame_mvs(cm);
 #else
   if (cm->prev_frame) {
     cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
@@ -5301,6 +5517,10 @@ static void encode_frame_internal(AV1_COMP *cpi) {
   av1_zero(x->blk_skip_drl);
 #endif
 
+#if CONFIG_MFMV
+  av1_setup_motion_field(cm);
+#endif  // CONFIG_MFMV
+
   {
     struct aom_usec_timer emr_timer;
     aom_usec_timer_start(&emr_timer);
@@ -5326,6 +5546,9 @@ static void encode_frame_internal(AV1_COMP *cpi) {
     aom_usec_timer_mark(&emr_timer);
     cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer);
   }
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+  free_ncobmc_pred_buffer(xd);
+#endif
 
 #if 0
   // Keep record of the total distortion this time around for future use
@@ -5333,7 +5556,6 @@ static void encode_frame_internal(AV1_COMP *cpi) {
 #endif
 }
 
-#if CONFIG_EXT_INTER
 static void make_consistent_compound_tools(AV1_COMMON *cm) {
   (void)cm;
 #if CONFIG_INTERINTRA
@@ -5349,7 +5571,6 @@ static void make_consistent_compound_tools(AV1_COMMON *cm) {
     cm->allow_masked_compound = 0;
 #endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
 }
-#endif  // CONFIG_EXT_INTER
 
 void av1_encode_frame(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
@@ -5358,6 +5579,32 @@ void av1_encode_frame(AV1_COMP *cpi) {
   // rather than the potential full set of 16 transforms
   cm->reduced_tx_set_used = 0;
 #endif  // CONFIG_EXT_TX
+#if CONFIG_ADAPT_SCAN
+  cm->use_adapt_scan = 1;
+  // TODO(angiebird): call av1_init_scan_order only when use_adapt_scan
+  // switches from 1 to 0
+  if (cm->use_adapt_scan == 0) av1_init_scan_order(cm);
+#endif
+
+#if CONFIG_FRAME_MARKER
+  if (cm->show_frame == 0) {
+    int arf_offset = AOMMIN(
+        (MAX_GF_INTERVAL - 1),
+        cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]);
+#if CONFIG_EXT_REFS
+    int brf_offset =
+        cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index];
+    arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset);
+#endif  // CONFIG_EXT_REFS
+    cm->frame_offset = cm->current_video_frame + arf_offset;
+  } else {
+    cm->frame_offset = cm->current_video_frame;
+  }
+  av1_setup_frame_buf_refs(cm);
+#if CONFIG_FRAME_SIGN_BIAS
+  av1_setup_frame_sign_bias(cm);
+#endif  // CONFIG_FRAME_SIGN_BIAS
+#endif  // CONFIG_FRAME_MARKER
 
   // In the longer term the encoder should be generalized to match the
   // decoder such that we allow compound where one of the 3 buffers has a
@@ -5366,14 +5613,14 @@ void av1_encode_frame(AV1_COMP *cpi) {
   // side behavior is where the ALT ref buffer has opposite sign bias to
   // the other two.
   if (!frame_is_intra_only(cm)) {
-#if !(CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS)
+#if !CONFIG_ONE_SIDED_COMPOUND
     if ((cm->ref_frame_sign_bias[ALTREF_FRAME] ==
          cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
         (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
          cm->ref_frame_sign_bias[LAST_FRAME])) {
       cpi->allow_comp_inter_inter = 0;
     } else {
-#endif  // !(CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS)
+#endif  // !CONFIG_ONE_SIDED_COMPOUND
       cpi->allow_comp_inter_inter = 1;
 #if CONFIG_EXT_REFS
       cm->comp_fwd_ref[0] = LAST_FRAME;
@@ -5381,16 +5628,16 @@ void av1_encode_frame(AV1_COMP *cpi) {
       cm->comp_fwd_ref[2] = LAST3_FRAME;
       cm->comp_fwd_ref[3] = GOLDEN_FRAME;
       cm->comp_bwd_ref[0] = BWDREF_FRAME;
-      cm->comp_bwd_ref[1] = ALTREF_FRAME;
-#else
+      cm->comp_bwd_ref[1] = ALTREF2_FRAME;
+      cm->comp_bwd_ref[2] = ALTREF_FRAME;
+#else                           // !CONFIG_EXT_REFS
     cm->comp_fixed_ref = ALTREF_FRAME;
     cm->comp_var_ref[0] = LAST_FRAME;
     cm->comp_var_ref[1] = GOLDEN_FRAME;
-#endif  // CONFIG_EXT_REFS
-#if !(CONFIG_ONE_SIDED_COMPOUND || \
-      CONFIG_EXT_COMP_REFS)  // Normative in encoder
+#endif                          // CONFIG_EXT_REFS
+#if !CONFIG_ONE_SIDED_COMPOUND  // Normative in encoder
     }
-#endif  // !(CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS)
+#endif  // !CONFIG_ONE_SIDED_COMPOUND
   } else {
     cpi->allow_comp_inter_inter = 0;
   }
@@ -5444,9 +5691,7 @@ void av1_encode_frame(AV1_COMP *cpi) {
     cm->interp_filter = SWITCHABLE;
 #endif
 
-#if CONFIG_EXT_INTER
     make_consistent_compound_tools(cm);
-#endif  // CONFIG_EXT_INTER
 
     rdc->single_ref_used_flag = 0;
     rdc->compound_ref_used_flag = 0;
@@ -5469,9 +5714,7 @@ void av1_encode_frame(AV1_COMP *cpi) {
 #endif  // !CONFIG_REF_ADAPT
       }
     }
-#if CONFIG_EXT_INTER
     make_consistent_compound_tools(cm);
-#endif  // CONFIG_EXT_INTER
 
 #if CONFIG_VAR_TX
 #if CONFIG_RECT_TX_EXT
@@ -5483,10 +5726,11 @@ void av1_encode_frame(AV1_COMP *cpi) {
       cm->tx_mode = ALLOW_32X32 + CONFIG_TX64X64;
 #else
 #if CONFIG_RECT_TX_EXT && CONFIG_EXT_TX
-    if (cm->tx_mode == TX_MODE_SELECT && counts->quarter_tx_size[1] == 0) {
+    if (cm->tx_mode == TX_MODE_SELECT && counts->quarter_tx_size[1] == 0)
 #else
-    if (cm->tx_mode == TX_MODE_SELECT) {
+    if (cm->tx_mode == TX_MODE_SELECT)
 #endif
+    {
 #if CONFIG_TX64X64
       int count4x4 = 0;
       int count8x8_8x8p = 0, count8x8_lp = 0;
@@ -5653,9 +5897,7 @@ void av1_encode_frame(AV1_COMP *cpi) {
     }
 #endif
   } else {
-#if CONFIG_EXT_INTER
     make_consistent_compound_tools(cm);
-#endif  // CONFIG_EXT_INTER
     encode_frame_internal(cpi);
   }
 }
@@ -5664,21 +5906,15 @@ static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd,
                             const MODE_INFO *mi, const MODE_INFO *above_mi,
                             const MODE_INFO *left_mi, const int intraonly,
                             const int mi_row, const int mi_col) {
+  FRAME_CONTEXT *fc = xd->tile_ctx;
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
-#if CONFIG_ENTROPY_STATS
   const PREDICTION_MODE y_mode = mbmi->mode;
   const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
-#else   // CONFIG_ENTROPY_STATS
   (void)counts;
-  (void)above_mi;
-  (void)left_mi;
-  (void)intraonly;
-#endif  // CONFIG_ENTROPY_STATS
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int unify_bsize = CONFIG_CB4X4;
 
   if (bsize < BLOCK_8X8 && !unify_bsize) {
-#if CONFIG_ENTROPY_STATS
     int idx, idy;
     const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
     const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
@@ -5687,30 +5923,38 @@ static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd,
         const int bidx = idy * 2 + idx;
         const PREDICTION_MODE bmode = mi->bmi[bidx].as_mode;
         if (intraonly) {
+#if CONFIG_ENTROPY_STATS
           const PREDICTION_MODE a = av1_above_block_mode(mi, above_mi, bidx);
           const PREDICTION_MODE l = av1_left_block_mode(mi, left_mi, bidx);
           ++counts->kf_y_mode[a][l][bmode];
+#endif  // CONFIG_ENTROPY_STATS
+          update_cdf(get_y_mode_cdf(fc, mi, above_mi, left_mi, bidx), bmode,
+                     INTRA_MODES);
         } else {
+#if CONFIG_ENTROPY_STATS
           ++counts->y_mode[0][bmode];
+#endif  // CONFIG_ENTROPY_STATS
+          update_cdf(fc->y_mode_cdf[0], bmode, INTRA_MODES);
         }
       }
-#endif  // CONFIG_ENTROPY_STATS
   } else {
-#if CONFIG_ENTROPY_STATS
     if (intraonly) {
+#if CONFIG_ENTROPY_STATS
       const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, 0);
       const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, 0);
       ++counts->kf_y_mode[above][left][y_mode];
+#endif  // CONFIG_ENTROPY_STATS
+      update_cdf(get_y_mode_cdf(fc, mi, above_mi, left_mi, 0), y_mode,
+                 INTRA_MODES);
     } else {
+#if CONFIG_ENTROPY_STATS
       ++counts->y_mode[size_group_lookup[bsize]][y_mode];
-    }
 #endif  // CONFIG_ENTROPY_STATS
+      update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES);
+    }
+
 #if CONFIG_FILTER_INTRA
-    if (mbmi->mode == DC_PRED
-#if CONFIG_PALETTE
-        && mbmi->palette_mode_info.palette_size[0] == 0
-#endif  // CONFIG_PALETTE
-        ) {
+    if (mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0) {
       const int use_filter_intra_mode =
           mbmi->filter_intra_mode_info.use_filter_intra_mode[0];
       ++counts->filter_intra[0][use_filter_intra_mode];
@@ -5721,10 +5965,7 @@ static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd,
         is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
                             xd->plane[1].subsampling_y)
 #endif
-#if CONFIG_PALETTE
-        && mbmi->palette_mode_info.palette_size[1] == 0
-#endif  // CONFIG_PALETTE
-        ) {
+        && mbmi->palette_mode_info.palette_size[1] == 0) {
       const int use_filter_intra_mode =
           mbmi->filter_intra_mode_info.use_filter_intra_mode[1];
       ++counts->filter_intra[1][use_filter_intra_mode];
@@ -5753,6 +5994,7 @@ static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd,
 #if CONFIG_ENTROPY_STATS
   ++counts->uv_mode[y_mode][uv_mode];
 #endif  // CONFIG_ENTROPY_STATS
+  update_cdf(fc->uv_mode_cdf[y_mode], uv_mode, UV_INTRA_MODES);
 }
 
 #if CONFIG_VAR_TX
@@ -5770,13 +6012,26 @@ static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
   const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+  assert(tx_size > TX_4X4);
+
+  if (depth == MAX_VARTX_DEPTH) {
+// Don't add to counts in this case
+#if CONFIG_RECT_TX_EXT
+    if (tx_size == plane_tx_size)
+#endif
+      mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+    return;
+  }
 
 #if CONFIG_RECT_TX_EXT
   if (tx_size == plane_tx_size ||
-      mbmi->tx_size == quarter_txsize_lookup[mbmi->sb_type]) {
+      mbmi->tx_size == quarter_txsize_lookup[mbmi->sb_type])
 #else
-  if (tx_size == plane_tx_size) {
+  if (tx_size == plane_tx_size)
 #endif
+  {
     ++counts->txfm_partition[ctx][0];
 #if CONFIG_RECT_TX_EXT
     if (tx_size == plane_tx_size)
@@ -5792,7 +6047,7 @@ static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
     ++counts->txfm_partition[ctx][1];
     ++x->txb_split_count;
 
-    if (tx_size == TX_8X8) {
+    if (sub_txs == TX_4X4) {
       mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4;
       mbmi->tx_size = TX_4X4;
       txfm_partition_update(xd->above_txfm_context + blk_col,
@@ -5815,10 +6070,22 @@ static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
   MACROBLOCKD *xd = &x->e_mbd;
   const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
   const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
-  TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize);
+  TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize, 0);
   const int bh = tx_size_high_unit[max_tx_size];
   const int bw = tx_size_wide_unit[max_tx_size];
   int idx, idy;
+  int init_depth =
+      (mi_height != mi_width) ? RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT;
+
+#if CONFIG_INTRABC
+  // Intrabc doesn't support var-tx yet. So no need to update tx partition
+  // info., except for the split count (otherwise common->tx_mode may be
+  // modified, causing mismatch).
+  if (is_intrabc_block(&x->e_mbd.mi[0]->mbmi)) {
+    if (x->e_mbd.mi[0]->mbmi.tx_size != max_tx_size) ++x->txb_split_count;
+    return;
+  }
+#endif  // CONFIG_INTRABC
 
   xd->above_txfm_context =
       cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2);
@@ -5827,8 +6094,7 @@ static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
 
   for (idy = 0; idy < mi_height; idy += bh)
     for (idx = 0; idx < mi_width; idx += bw)
-      update_txfm_count(x, xd, td_counts, max_tx_size, mi_width != mi_height,
-                        idy, idx);
+      update_txfm_count(x, xd, td_counts, max_tx_size, init_depth, idy, idx);
 }
 
 static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
@@ -5874,7 +6140,7 @@ static void tx_partition_set_contexts(const AV1_COMMON *const cm,
                                       int mi_row, int mi_col) {
   const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
   const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
-  TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize);
+  TX_SIZE max_tx_size = get_vartx_max_txsize(&xd->mi[0]->mbmi, plane_bsize, 0);
   const int bh = tx_size_high_unit[max_tx_size];
   const int bw = tx_size_wide_unit[max_tx_size];
   int idx, idy;
@@ -5898,6 +6164,10 @@ void av1_update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd,
                               FRAME_COUNTS *counts) {
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   int is_inter = is_inter_block(mbmi);
+  FRAME_CONTEXT *fc = xd->tile_ctx;
+#if !CONFIG_ENTROPY_STATS
+  (void)counts;
+#endif  // !CONFIG_ENTROPY_STATS
 
 #if !CONFIG_TXK_SEL
   TX_TYPE tx_type = mbmi->tx_type;
@@ -5916,12 +6186,64 @@ void av1_update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd,
     const int eset =
         get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
     if (eset > 0) {
+#if !CONFIG_LGT_FROM_PRED
+      const TxSetType tx_set_type = get_ext_tx_set_type(
+          tx_size, bsize, is_inter, cm->reduced_tx_set_used);
       if (is_inter) {
+        update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]],
+                   av1_ext_tx_ind[tx_set_type][tx_type],
+                   av1_num_ext_tx_set[tx_set_type]);
+#if CONFIG_ENTROPY_STATS
         ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]][tx_type];
+#endif  // CONFIG_ENTROPY_STATS
       } else {
+#if CONFIG_ENTROPY_STATS
         ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][mbmi->mode]
                               [tx_type];
+#endif  // CONFIG_ENTROPY_STATS
+        update_cdf(
+            fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][mbmi->mode],
+            av1_ext_tx_ind[tx_set_type][tx_type],
+            av1_num_ext_tx_set[tx_set_type]);
+      }
+#else
+      (void)tx_type;
+      (void)fc;
+      if (is_inter) {
+        if (LGT_FROM_PRED_INTER) {
+          if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used)
+            ++counts->inter_lgt[txsize_sqr_map[tx_size]][mbmi->use_lgt];
+#if CONFIG_ENTROPY_STATS
+          if (!mbmi->use_lgt)
+            ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]][tx_type];
+          else
+#endif  // CONFIG_ENTROPY_STATS
+            mbmi->tx_type = DCT_DCT;
+        } else {
+#if CONFIG_ENTROPY_STATS
+          ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]][tx_type];
+#endif  // CONFIG_ENTROPY_STATS
+        }
+      } else {
+        if (LGT_FROM_PRED_INTRA) {
+          if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used)
+            ++counts->intra_lgt[txsize_sqr_map[tx_size]][mbmi->mode]
+                               [mbmi->use_lgt];
+#if CONFIG_ENTROPY_STATS
+          if (!mbmi->use_lgt)
+            ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][mbmi->mode]
+                                  [tx_type];
+          else
+#endif  // CONFIG_ENTROPY_STATS
+            mbmi->tx_type = DCT_DCT;
+        } else {
+#if CONFIG_ENTROPY_STATS
+          ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][mbmi->mode]
+                                [tx_type];
+#endif  // CONFIG_ENTROPY_STATS
+        }
       }
+#endif  // CONFIG_LGT_FROM_PRED
     }
   }
 #else
@@ -5932,10 +6254,20 @@ void av1_update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd,
       !mbmi->skip &&
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     if (is_inter) {
+#if CONFIG_ENTROPY_STATS
       ++counts->inter_ext_tx[tx_size][tx_type];
+#endif  // CONFIG_ENTROPY_STATS
+      update_cdf(fc->inter_ext_tx_cdf[tx_size], av1_ext_tx_ind[tx_type],
+                 TX_TYPES);
     } else {
+#if CONFIG_ENTROPY_STATS
       ++counts->intra_ext_tx[tx_size][intra_mode_to_tx_type_context[mbmi->mode]]
                             [tx_type];
+#endif  // CONFIG_ENTROPY_STATS
+      update_cdf(
+          fc->intra_ext_tx_cdf[tx_size]
+                              [intra_mode_to_tx_type_context[mbmi->mode]],
+          av1_ext_tx_ind[tx_type], TX_TYPES);
     }
   }
 #endif  // CONFIG_EXT_TX
@@ -5966,29 +6298,48 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
   x->pvq_speed = 0;
   x->pvq_coded = (dry_run == OUTPUT_ENABLED) ? 1 : 0;
 #endif
-#if CONFIG_CFL
-  x->cfl_store_y = 1;
-#endif
 
   if (!is_inter) {
+#if CONFIG_CFL
+    xd->cfl->store_y = 1;
+#endif  // CONFIG_CFL
     int plane;
     mbmi->skip = 1;
     for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
       av1_encode_intra_block_plane((AV1_COMMON *)cm, x, block_size, plane, 1,
                                    mi_row, mi_col);
     }
+#if CONFIG_CFL
+    xd->cfl->store_y = 0;
+#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+    if (is_chroma_reference(mi_row, mi_col, bsize, xd->cfl->subsampling_x,
+                            xd->cfl->subsampling_y) &&
+        !xd->cfl->are_parameters_computed) {
+      cfl_clear_sub8x8_val(xd->cfl);
+    }
+#endif  // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+#endif  // CONFIG_CFL
     if (!dry_run) {
       sum_intra_stats(td->counts, xd, mi, xd->above_mi, xd->left_mi,
                       frame_is_intra_only(cm), mi_row, mi_col);
     }
-#if CONFIG_PALETTE
-    if (bsize >= BLOCK_8X8 && !dry_run) {
+
+// TODO(anybody) : remove this flag when PVQ supports palette coding tool
+#if !CONFIG_PVQ
+    if (bsize >= BLOCK_8X8) {
       for (plane = 0; plane <= 1; ++plane) {
-        if (mbmi->palette_mode_info.palette_size[plane] > 0)
-          av1_tokenize_palette_sb(cpi, td, plane, t, dry_run, bsize, rate);
+        if (mbmi->palette_mode_info.palette_size[plane] > 0) {
+          if (!dry_run)
+            av1_tokenize_color_map(x, plane, 0, t, bsize, mbmi->tx_size,
+                                   PALETTE_MAP);
+          else if (dry_run == DRY_RUN_COSTCOEFFS)
+            rate += av1_cost_color_map(x, plane, 0, bsize, mbmi->tx_size,
+                                       PALETTE_MAP);
+        }
       }
     }
-#endif  // CONFIG_PALETTE
+#endif  // !CONFIG_PVQ
+
 #if CONFIG_VAR_TX
     mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
 #endif
@@ -6012,7 +6363,7 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
       av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
                            &xd->block_refs[ref]->sf);
     }
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
     // Single ref compound mode
     if (!is_compound && is_inter_singleref_comp_mode(mbmi->mode)) {
       xd->block_refs[1] = xd->block_refs[0];
@@ -6024,9 +6375,11 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
 #endif  // !CONFIG_INTRABC
       av1_setup_pre_planes(xd, 1, cfg, mi_row, mi_col, &xd->block_refs[1]->sf);
     }
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
 
     av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, block_size);
+
+#if !CONFIG_NCOBMC_ADAPT_WEIGHT
 #if CONFIG_MOTION_VAR
     if (mbmi->motion_mode == OBMC_CAUSAL) {
 #if CONFIG_NCOBMC
@@ -6037,6 +6390,17 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
         av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
     }
 #endif  // CONFIG_MOTION_VAR
+#else
+    if (mbmi->motion_mode == OBMC_CAUSAL) {
+      av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+    } else if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT &&
+               dry_run == OUTPUT_ENABLED) {
+      int p;
+      for (p = 0; p < MAX_MB_PLANE; ++p) {
+        get_pred_from_intrpl_buf(xd, mi_row, mi_col, block_size, p);
+      }
+    }
+#endif
 
     av1_encode_sb((AV1_COMMON *)cm, x, block_size, mi_row, mi_col);
 #if CONFIG_VAR_TX
@@ -6053,7 +6417,7 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
   }
 
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  if (bsize < BLOCK_8X8) {
+  if (x->using_dist_8x8 && bsize < BLOCK_8X8) {
     dist_8x8_set_sub8x8_dst(x, (uint8_t *)x->decoded_8x8, bsize,
                             block_size_wide[bsize], block_size_high[bsize],
                             mi_row, mi_col);
@@ -6079,8 +6443,8 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
         tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts);
       } else {
         const int tx_size_ctx = get_tx_size_context(xd);
-        const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
-                                         : intra_tx_size_cat_lookup[bsize];
+        const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
+                                             : intra_tx_size_cat_lookup[bsize];
         const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
         const int depth = tx_size_to_depth(coded_tx_size);
         ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth];
@@ -6088,8 +6452,8 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
       }
 #else
       const int tx_size_ctx = get_tx_size_context(xd);
-      const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
-                                       : intra_tx_size_cat_lookup[bsize];
+      const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
+                                           : intra_tx_size_cat_lookup[bsize];
       const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
       const int depth = tx_size_to_depth(coded_tx_size);
 
@@ -6141,9 +6505,6 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
 #endif
     }
 
-    ++td->counts->tx_size_totals[txsize_sqr_map[tx_size]];
-    ++td->counts->tx_size_totals[txsize_sqr_map[av1_get_uv_tx_size(
-        mbmi, &xd->plane[1])]];
 #if !CONFIG_TXK_SEL
     av1_update_tx_type_count(cm, xd, bsize, tx_size, td->counts);
 #endif
@@ -6156,27 +6517,46 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
 #else
       mbmi->sb_type >= BLOCK_8X8 &&
 #endif
-      is_inter && !(mbmi->skip || seg_skip)) {
+      is_inter && !(mbmi->skip || seg_skip) &&
+      !xd->lossless[mbmi->segment_id]) {
     if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
   } else {
     TX_SIZE tx_size = mbmi->tx_size;
     // The new intra coding scheme requires no change of transform size
-    if (is_inter)
-      tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, is_inter);
-    else
+    if (is_inter) {
+      if (xd->lossless[mbmi->segment_id]) {
+        tx_size = TX_4X4;
+      } else {
+        tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, is_inter);
+      }
+    } else {
       tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
+    }
     mbmi->tx_size = tx_size;
     set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h, (mbmi->skip || seg_skip), xd);
   }
 #endif  // CONFIG_VAR_TX
+#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8
+  CFL_CTX *const cfl = xd->cfl;
+#if CONFIG_DEBUG
+  if (is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x,
+                          cfl->subsampling_y) &&
+      !cfl->are_parameters_computed) {
+    cfl_clear_sub8x8_val(cfl);
+  }
+#endif  // CONFIG_DEBUG
+  if (is_inter_block(mbmi) &&
+      !is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x,
+                           cfl->subsampling_y)) {
+    cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
+  }
+#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8
 }
 
 #if CONFIG_SUPERTX
 static int check_intra_b(PICK_MODE_CONTEXT *ctx) {
   if (!is_inter_mode((&ctx->mic)->mbmi.mode)) return 1;
-#if CONFIG_EXT_INTER
   if (ctx->mic.mbmi.ref_frame[1] == INTRA_FRAME) return 1;
-#endif  // CONFIG_EXT_INTER
   return 0;
 }
 
@@ -6235,6 +6615,9 @@ static int check_intra_sb(const AV1_COMP *const cpi, const TileInfo *const tile,
       }
       break;
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION_TYPES_AB
+#error HORZ/VERT_A/B partitions not yet updated in superres code
+#endif
     case PARTITION_HORZ_A:
       for (i = 0; i < 3; i++) {
         if (check_intra_b(&pc_tree->horizontala[i])) return 1;
@@ -6289,6 +6672,9 @@ static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
       else
         return check_supertx_sb(subsize, supertx_size, pc_tree->split[0]);
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION_TYPES_AB
+#error HORZ/VERT_A/B partitions not yet updated in superres code
+#endif
     case PARTITION_HORZ_A:
       return check_supertx_b(supertx_size, &pc_tree->horizontala[0]);
     case PARTITION_HORZ_B:
@@ -6303,10 +6689,8 @@ static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
 }
 
 static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td,
-#if CONFIG_EXT_INTER
-                               int mi_row_ori, int mi_col_ori,
-#endif  // CONFIG_EXT_INTER
-                               int mi_row_pred, int mi_col_pred, int plane,
+                               int mi_row_ori, int mi_col_ori, int mi_row_pred,
+                               int mi_col_pred, int plane,
                                BLOCK_SIZE bsize_pred, int b_sub8x8, int block) {
   // Used in supertx
   // (mi_row_ori, mi_col_ori): location for mv
@@ -6328,7 +6712,7 @@ static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td,
                          &xd->block_refs[ref]->sf);
   }
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
   // Single ref compound mode
   if (!is_compound && is_inter_singleref_comp_mode(mbmi->mode)) {
     xd->block_refs[1] = xd->block_refs[0];
@@ -6336,20 +6720,14 @@ static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td,
     av1_setup_pre_planes(xd, 1, cfg, mi_row_pred, mi_col_pred,
                          &xd->block_refs[1]->sf);
   }
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
 
   if (!b_sub8x8)
-    av1_build_inter_predictor_sb_extend(cm, xd,
-#if CONFIG_EXT_INTER
-                                        mi_row_ori, mi_col_ori,
-#endif  // CONFIG_EXT_INTER
+    av1_build_inter_predictor_sb_extend(cm, xd, mi_row_ori, mi_col_ori,
                                         mi_row_pred, mi_col_pred, plane,
                                         bsize_pred);
   else
-    av1_build_inter_predictor_sb_sub8x8_extend(cm, xd,
-#if CONFIG_EXT_INTER
-                                               mi_row_ori, mi_col_ori,
-#endif  // CONFIG_EXT_INTER
+    av1_build_inter_predictor_sb_sub8x8_extend(cm, xd, mi_row_ori, mi_col_ori,
                                                mi_row_pred, mi_col_pred, plane,
                                                bsize_pred, block);
 }
@@ -6390,12 +6768,8 @@ static void predict_b_extend(const AV1_COMP *const cpi, ThreadData *td,
       dst_buf + (r >> xd->plane[plane].subsampling_y) * dst_stride +
       (c >> xd->plane[plane].subsampling_x);
 
-  predict_superblock(cpi, td,
-#if CONFIG_EXT_INTER
-                     mi_row_ori, mi_col_ori,
-#endif  // CONFIG_EXT_INTER
-                     mi_row_pred, mi_col_pred, plane, bsize_pred, b_sub8x8,
-                     block);
+  predict_superblock(cpi, td, mi_row_ori, mi_col_ori, mi_row_pred, mi_col_pred,
+                     plane, bsize_pred, b_sub8x8, block);
 
   if (!dry_run && (plane == 0) && (block == 0 || !b_sub8x8))
     update_stats(&cpi->common, td, mi_row_pred, mi_col_pred, 1);
@@ -6940,6 +7314,9 @@ static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td,
       }
       break;
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION_TYPES_AB
+#error HORZ/VERT_A/B partitions not yet updated in superres code
+#endif
     case PARTITION_HORZ_A:
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                        mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
@@ -7130,9 +7507,6 @@ static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td,
   TX_SIZE tx_size;
   MB_MODE_INFO *mbmi;
   TX_TYPE tx_type, best_tx_nostx;
-#if CONFIG_EXT_TX
-  int ext_tx_set;
-#endif  // CONFIG_EXT_TX
   int tmp_rate_tx = 0, skip_tx = 0;
   int64_t tmp_dist_tx = 0, rd_tx, bestrd_tx = INT64_MAX;
 
@@ -7202,7 +7576,9 @@ static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td,
   tx_size = max_txsize_lookup[bsize];
   av1_subtract_plane(x, bsize, 0);
 #if CONFIG_EXT_TX
-  ext_tx_set = get_ext_tx_set(tx_size, bsize, 1, cm->reduced_tx_set_used);
+  int ext_tx_set = get_ext_tx_set(tx_size, bsize, 1, cm->reduced_tx_set_used);
+  const TxSetType tx_set_type =
+      get_ext_tx_set_type(tx_size, bsize, 1, cm->reduced_tx_set_used);
 #endif  // CONFIG_EXT_TX
   for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
 #if CONFIG_VAR_TX
@@ -7213,7 +7589,7 @@ static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td,
 #endif  // CONFIG_VAR_TX
 
 #if CONFIG_EXT_TX
-    if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
+    if (!av1_ext_tx_used[tx_set_type][tx_type]) continue;
 #else
     if (tx_size >= TX_32X32 && tx_type != DCT_DCT) continue;
 #endif  // CONFIG_EXT_TX
@@ -7239,12 +7615,12 @@ static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td,
         !xd->lossless[xd->mi[0]->mbmi.segment_id] && this_rate != INT_MAX) {
       if (ext_tx_set > 0)
         this_rate +=
-            cpi->inter_tx_type_costs[ext_tx_set][mbmi->tx_size][mbmi->tx_type];
+            x->inter_tx_type_costs[ext_tx_set][mbmi->tx_size][mbmi->tx_type];
     }
 #else
     if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
         this_rate != INT_MAX) {
-      this_rate += cpi->inter_tx_type_costs[tx_size][mbmi->tx_type];
+      this_rate += x->inter_tx_type_costs[tx_size][mbmi->tx_type];
     }
 #endif  // CONFIG_EXT_TX
     *tmp_rate = rate_uv + this_rate;
diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h
index 569ec9f72..b54e54d25 100644
--- a/third_party/aom/av1/encoder/encodeframe.h
+++ b/third_party/aom/av1/encoder/encodeframe.h
@@ -41,7 +41,6 @@ void av1_update_tx_type_count(const struct AV1Common *cm, MACROBLOCKD *xd,
 #endif
                               BLOCK_SIZE bsize, TX_SIZE tx_size,
                               FRAME_COUNTS *counts);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c
index e7f4d313d..f35ce8a4f 100644
--- a/third_party/aom/av1/encoder/encodemb.c
+++ b/third_party/aom/av1/encoder/encodemb.c
@@ -110,42 +110,46 @@ void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
                  pd->dst.buf, pd->dst.stride);
 }
 
+// Right-shifting a negative value is implementation-defined in C99, and
+// could mislead the optimizer, which might assume the shifted value is
+// positive. This also avoids ubsan warnings.
+// In practice, this gets inlined by the optimizer to a single instruction.
+static INLINE int signed_shift_right(int x, int shift) {
+  if (x >= 0)
+    return x >> shift;
+  else
+    return -((-x) >> shift);
+}
+
+#if !CONFIG_LV_MAP
 // These numbers are empirically obtained.
 static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
   { 10, 7 }, { 8, 5 },
 };
 
-static INLINE unsigned int get_token_bit_costs(
-    unsigned int token_costs[2][COEFF_CONTEXTS][ENTROPY_TOKENS], int skip_eob,
-    int ctx, int token) {
-  (void)skip_eob;
-  return token_costs[token == ZERO_TOKEN || token == EOB_TOKEN][ctx][token];
-}
-
-#if !CONFIG_LV_MAP
-
 static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane,
                              int blk_row, int blk_col, int block,
                              TX_SIZE tx_size, int ctx) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   struct macroblock_plane *const p = &mb->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
+  const PLANE_TYPE plane_type = pd->plane_type;
+  const int eob = p->eobs[block];
+  assert(mb->qindex > 0);
+  assert((!plane_type && !plane) || (plane_type && plane));
+  assert(eob <= tx_size_2d[tx_size]);
   const int ref = is_inter_block(&xd->mi[0]->mbmi);
-  uint8_t token_cache[MAX_TX_SQUARE];
   const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const int eob = p->eobs[block];
-  const PLANE_TYPE plane_type = pd->plane_type;
   const int16_t *const dequant_ptr = pd->dequant;
   const uint8_t *const band_translate = get_band_translate(tx_size);
-  TX_TYPE tx_type =
+  const TX_TYPE tx_type =
       av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
   const SCAN_ORDER *const scan_order =
       get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
   const int16_t *const scan = scan_order->scan;
   const int16_t *const nb = scan_order->neighbors;
-  int dqv;
   const int shift = av1_get_tx_scale(tx_size);
 #if CONFIG_AOM_QM
   int seg_id = xd->mi[0]->mbmi.segment_id;
@@ -159,66 +163,52 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane,
   int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type);
   const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq];
 #endif  // CONFIG_NEW_QUANT
-  int sz = 0;
   int64_t rd_cost0, rd_cost1;
   int16_t t0, t1;
-  int i, final_eob;
+  int i, final_eob = 0;
   const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
-  unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
-      mb->token_costs[txsize_sqr_map[tx_size]][plane_type][ref];
-  const int default_eob = tx_size_2d[tx_size];
-
-  assert(mb->qindex > 0);
-
-  assert((!plane_type && !plane) || (plane_type && plane));
-  assert(eob <= default_eob);
-
-  int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1;
-
+  int(*head_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] =
+      mb->token_head_costs[txsize_sqr_map[tx_size]][plane_type][ref];
+  int(*tail_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] =
+      mb->token_tail_costs[txsize_sqr_map[tx_size]][plane_type][ref];
+  const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1;
   int64_t rate0, rate1;
+  int64_t eob_cost0, eob_cost1;
+  tran_low_t before_best_eob_qc = 0;
+  tran_low_t before_best_eob_dqc = 0;
+
+  uint8_t token_cache[MAX_TX_SQUARE];
   for (i = 0; i < eob; i++) {
     const int rc = scan[i];
     token_cache[rc] = av1_pt_energy_class[av1_get_token(qcoeff[rc])];
   }
 
-  unsigned int(*token_costs_ptr)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
-      token_costs;
-
-  final_eob = 0;
-
-  int64_t eob_cost0, eob_cost1;
-  tran_low_t before_best_eob_qc = 0;
-  tran_low_t before_best_eob_dqc = 0;
-
-  const int ctx0 = ctx;
   /* Record the r-d cost */
   int64_t accu_rate = 0;
   // Initialized to the worst possible error for the largest transform size.
   // This ensures that it never goes negative.
   int64_t accu_error = ((int64_t)1) << 50;
-
-  rate0 = get_token_bit_costs(*(token_costs_ptr + band_translate[0]), 0, ctx0,
-                              EOB_TOKEN);
+  rate0 = head_token_costs[0][ctx][0];
   int64_t best_block_rd_cost = RDCOST(rdmult, rate0, accu_error);
 
   // int64_t best_block_rd_cost_all0 = best_block_rd_cost;
-  int x_prev = 1;
+  const int seg_eob =
+      av1_get_tx_eob(&cm->seg, xd->mi[0]->mbmi.segment_id, tx_size);
   for (i = 0; i < eob; i++) {
     const int rc = scan[i];
-    int x = qcoeff[rc];
-    sz = -(x < 0);
-
-    int band_cur = band_translate[i];
-    int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i);
-    int token_tree_sel_cur = (x_prev == 0);
+    const int x = qcoeff[rc];
+    const int sz = -(x < 0);
+    const int band_cur = band_translate[i];
+    const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i);
+    const int eob_val =
+        (i + 1 == eob) ? (i + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
+    const int is_first = (i == 0);
 
     if (x == 0) {
       // no need to search when x == 0
-      int token = av1_get_token(x);
-      rate0 = get_token_bit_costs(*(token_costs_ptr + band_cur),
-                                  token_tree_sel_cur, ctx_cur, token);
-      accu_rate += rate0;
-      x_prev = 0;
+      accu_rate += av1_get_coeff_token_cost(
+          ZERO_TOKEN, eob_val, is_first, head_token_costs[band_cur][ctx_cur],
+          tail_token_costs[band_cur][ctx_cur]);
       // accu_error does not change when x==0
     } else {
       /*  Computing distortion
@@ -226,136 +216,109 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane,
       // compute the distortion for the first candidate
       // and the distortion for quantizing to 0.
       int dx0 = abs(coeff[rc]) * (1 << shift);
-#if CONFIG_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        dx0 >>= xd->bd - 8;
-      }
-#endif
-      int64_t d0 = (int64_t)dx0 * dx0;
-
-      int x_a = x - 2 * sz - 1;
-      int64_t d2, d2_a;
-
-      int dx;
+      dx0 >>= xd->bd - 8;
 
+      const int64_t d0 = (int64_t)dx0 * dx0;
+      const int x_a = x - 2 * sz - 1;
+      int dqv;
 #if CONFIG_AOM_QM
-      int iwt = iqmatrix[rc];
+      int iwt;
       dqv = dequant_ptr[rc != 0];
-      dqv = ((iwt * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+      if (iqmatrix != NULL) {
+        iwt = iqmatrix[rc];
+        dqv = ((iwt * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+      }
 #else
       dqv = dequant_ptr[rc != 0];
 #endif
 
-      dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
-#if CONFIG_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        int dx_sign = dx < 0 ? 1 : 0;
-        dx = abs(dx) >> (xd->bd - 8);
-        if (dx_sign) dx = -dx;
-      }
-#endif  // CONFIG_HIGHBITDEPTH
-      d2 = (int64_t)dx * dx;
+      int dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
+      dx = signed_shift_right(dx, xd->bd - 8);
+      const int64_t d2 = (int64_t)dx * dx;
 
       /* compute the distortion for the second candidate
        * x_a = x - 2 * sz + 1;
        */
+      int64_t d2_a;
       if (x_a != 0) {
 #if CONFIG_NEW_QUANT
         dx = av1_dequant_coeff_nuq(x, dqv, dequant_val[band_translate[i]]) -
-             (coeff[rc] << shift);
-#if CONFIG_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          dx >>= xd->bd - 8;
-        }
-#endif  // CONFIG_HIGHBITDEPTH
+             (coeff[rc] * (1 << shift));
+        dx >>= xd->bd - 8;
 #else   // CONFIG_NEW_QUANT
-#if CONFIG_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz;
-        } else {
-          dx -= (dqv + sz) ^ sz;
-        }
-#else
-        dx -= (dqv + sz) ^ sz;
-#endif  // CONFIG_HIGHBITDEPTH
+        dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz;
 #endif  // CONFIG_NEW_QUANT
         d2_a = (int64_t)dx * dx;
       } else {
         d2_a = d0;
       }
-      /*  Computing rates and r-d cost
-       */
-
-      int best_x, best_eob_x;
-      int64_t base_bits, next_bits0, next_bits1;
-      int64_t next_eob_bits0, next_eob_bits1;
-
+      // Computing RD cost
+      int64_t base_bits;
       // rate cost of x
       base_bits = av1_get_token_cost(x, &t0, cat6_bits);
-      rate0 = base_bits + get_token_bit_costs(*(token_costs_ptr + band_cur),
-                                              token_tree_sel_cur, ctx_cur, t0);
-
+      rate0 = base_bits +
+              av1_get_coeff_token_cost(t0, eob_val, is_first,
+                                       head_token_costs[band_cur][ctx_cur],
+                                       tail_token_costs[band_cur][ctx_cur]);
+      // rate cost of x_a
       base_bits = av1_get_token_cost(x_a, &t1, cat6_bits);
-      rate1 = base_bits + get_token_bit_costs(*(token_costs_ptr + band_cur),
-                                              token_tree_sel_cur, ctx_cur, t1);
-
-      next_bits0 = 0;
-      next_bits1 = 0;
-      next_eob_bits0 = 0;
-      next_eob_bits1 = 0;
+      if (t1 == ZERO_TOKEN && eob_val) {
+        rate1 = base_bits;
+      } else {
+        rate1 = base_bits +
+                av1_get_coeff_token_cost(t1, eob_val, is_first,
+                                         head_token_costs[band_cur][ctx_cur],
+                                         tail_token_costs[band_cur][ctx_cur]);
+      }
 
-      if (i < default_eob - 1) {
-        int ctx_next, token_tree_sel_next;
-        int band_next = band_translate[i + 1];
-        int token_next =
-            i + 1 != eob ? av1_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN;
+      int64_t next_bits0 = 0, next_bits1 = 0;
+      if (i < eob - 1) {
+        int ctx_next;
+        const int band_next = band_translate[i + 1];
+        const int token_next = av1_get_token(qcoeff[scan[i + 1]]);
+        const int eob_val_next =
+            (i + 2 == eob) ? (i + 2 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
 
         token_cache[rc] = av1_pt_energy_class[t0];
         ctx_next = get_coef_context(nb, token_cache, i + 1);
-        token_tree_sel_next = (x == 0);
-
-        next_bits0 =
-            get_token_bit_costs(*(token_costs_ptr + band_next),
-                                token_tree_sel_next, ctx_next, token_next);
-        next_eob_bits0 =
-            get_token_bit_costs(*(token_costs_ptr + band_next),
-                                token_tree_sel_next, ctx_next, EOB_TOKEN);
+        next_bits0 = av1_get_coeff_token_cost(
+            token_next, eob_val_next, 0, head_token_costs[band_next][ctx_next],
+            tail_token_costs[band_next][ctx_next]);
 
         token_cache[rc] = av1_pt_energy_class[t1];
         ctx_next = get_coef_context(nb, token_cache, i + 1);
-        token_tree_sel_next = (x_a == 0);
-
-        next_bits1 =
-            get_token_bit_costs(*(token_costs_ptr + band_next),
-                                token_tree_sel_next, ctx_next, token_next);
-
-        if (x_a != 0) {
-          next_eob_bits1 =
-              get_token_bit_costs(*(token_costs_ptr + band_next),
-                                  token_tree_sel_next, ctx_next, EOB_TOKEN);
-        }
+        next_bits1 = av1_get_coeff_token_cost(
+            token_next, eob_val_next, 0, head_token_costs[band_next][ctx_next],
+            tail_token_costs[band_next][ctx_next]);
       }
 
       rd_cost0 = RDCOST(rdmult, (rate0 + next_bits0), d2);
       rd_cost1 = RDCOST(rdmult, (rate1 + next_bits1), d2_a);
+      const int best_x = (rd_cost1 < rd_cost0);
 
-      best_x = (rd_cost1 < rd_cost0);
-
-      eob_cost0 = RDCOST(rdmult, (accu_rate + rate0 + next_eob_bits0),
-                         (accu_error + d2 - d0));
+      const int eob_v = (i + 1 == seg_eob) ? LAST_EOB : EARLY_EOB;
+      int64_t next_eob_bits0, next_eob_bits1;
+      int best_eob_x;
+      next_eob_bits0 = av1_get_coeff_token_cost(
+          t0, eob_v, is_first, head_token_costs[band_cur][ctx_cur],
+          tail_token_costs[band_cur][ctx_cur]);
+      eob_cost0 =
+          RDCOST(rdmult, (accu_rate + next_eob_bits0), (accu_error + d2 - d0));
       eob_cost1 = eob_cost0;
       if (x_a != 0) {
-        eob_cost1 = RDCOST(rdmult, (accu_rate + rate1 + next_eob_bits1),
+        next_eob_bits1 = av1_get_coeff_token_cost(
+            t1, eob_v, is_first, head_token_costs[band_cur][ctx_cur],
+            tail_token_costs[band_cur][ctx_cur]);
+        eob_cost1 = RDCOST(rdmult, (accu_rate + next_eob_bits1),
                            (accu_error + d2_a - d0));
         best_eob_x = (eob_cost1 < eob_cost0);
       } else {
         best_eob_x = 0;
       }
 
-      int dqc, dqc_a = 0;
-
-      dqc = dqcoeff[rc];
-      if (best_x + best_eob_x) {
+      const int dqc = dqcoeff[rc];
+      int dqc_a = 0;
+      if (best_x || best_eob_x) {
         if (x_a != 0) {
 #if CONFIG_NEW_QUANT
           dqc_a = av1_dequant_abscoeff_nuq(abs(x_a), dqv,
@@ -375,29 +338,23 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane,
 
       // record the better quantized value
       if (best_x) {
+        assert(d2_a <= d0);
         qcoeff[rc] = x_a;
         dqcoeff[rc] = dqc_a;
-
         accu_rate += rate1;
         accu_error += d2_a - d0;
-        assert(d2_a <= d0);
-
         token_cache[rc] = av1_pt_energy_class[t1];
       } else {
+        assert(d2 <= d0);
         accu_rate += rate0;
         accu_error += d2 - d0;
-        assert(d2 <= d0);
-
         token_cache[rc] = av1_pt_energy_class[t0];
       }
       assert(accu_error >= 0);
 
-      x_prev = qcoeff[rc];
-
       // determine whether to move the eob position to i+1
-      int use_a = (x_a != 0) && (best_eob_x);
-      int64_t best_eob_cost_i = use_a ? eob_cost1 : eob_cost0;
-
+      const int use_a = (x_a != 0) && (best_eob_x);
+      const int64_t best_eob_cost_i = use_a ? eob_cost1 : eob_cost0;
       if (best_eob_cost_i < best_block_rd_cost) {
         best_block_rd_cost = best_eob_cost_i;
         final_eob = i + 1;
@@ -427,7 +384,7 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane,
     dqcoeff[rc] = 0;
   }
 
-  mb->plane[plane].eobs[block] = final_eob;
+  p->eobs[block] = final_eob;
   return final_eob;
 }
 #endif  // !CONFIG_LV_MAP
@@ -435,7 +392,7 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane,
 int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int blk_row,
                    int blk_col, int block, BLOCK_SIZE plane_bsize,
                    TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
-                   const ENTROPY_CONTEXT *l) {
+                   const ENTROPY_CONTEXT *l, int fast_mode) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   struct macroblock_plane *const p = &mb->plane[plane];
   const int eob = p->eobs[block];
@@ -455,6 +412,7 @@ int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int blk_row,
   (void)plane_bsize;
   (void)blk_row;
   (void)blk_col;
+  (void)fast_mode;
 #if CONFIG_VAR_TX
   int ctx = get_entropy_context(tx_size, a, l);
 #else
@@ -466,7 +424,7 @@ int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int blk_row,
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
   return av1_optimize_txb(cm, mb, plane, blk_row, blk_col, block, tx_size,
-                          &txb_ctx);
+                          &txb_ctx, fast_mode);
 #endif  // !CONFIG_LV_MAP
 }
 
@@ -492,10 +450,12 @@ static AV1_QUANT_FACADE
     };
 #endif  // !CONFIG_PVQ
 
+#if !CONFIG_TXMG && !CONFIG_PVQ
 typedef void (*fwdTxfmFunc)(const int16_t *diff, tran_low_t *coeff, int stride,
                             TxfmParam *txfm_param);
 static const fwdTxfmFunc fwd_txfm_func[2] = { av1_fwd_txfm,
                                               av1_highbd_fwd_txfm };
+#endif
 
 void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
                      int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
@@ -514,7 +474,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
   TX_TYPE tx_type =
       av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
 
-#if CONFIG_AOM_QM || CONFIG_NEW_QUANT
+#if (CONFIG_AOM_QM || CONFIG_NEW_QUANT) && !CONFIG_PVQ
   const int is_inter = is_inter_block(mbmi);
 #endif
 
@@ -524,7 +484,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = block_size_wide[plane_bsize];
-#if CONFIG_AOM_QM
+#if CONFIG_AOM_QM && !CONFIG_PVQ
   int seg_id = mbmi->segment_id;
   // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
   const qm_val_t *qmatrix =
@@ -538,7 +498,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
 
   TxfmParam txfm_param;
 
-#if CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT || CONFIG_MRC_TX
+#if CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT_FROM_PRED || CONFIG_MRC_TX
   uint8_t *dst;
   const int dst_stride = pd->dst.stride;
 #if CONFIG_PVQ || CONFIG_DIST_8X8
@@ -601,29 +561,37 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
 #endif  // CONFIG_HIGHBITDEPTH
 #endif
 
-#if CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT || CONFIG_MRC_TX
+#if CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT_FROM_PRED || CONFIG_MRC_TX
   dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+#endif  // CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT_FROM_PRED ||
+        // CONFIG_MRC_TX
+
 #if CONFIG_PVQ || CONFIG_DIST_8X8
-  pred = &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+  if (CONFIG_PVQ
+#if CONFIG_DIST_8X8
+      || x->using_dist_8x8
+#endif  // CONFIG_DIST_8X8
+      ) {
+    pred = &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
 
 // copy uint8 orig and predicted block to int16 buffer
 // in order to use existing VP10 transform functions
 #if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    for (j = 0; j < txh; j++)
-      for (i = 0; i < txw; i++)
-        pred[diff_stride * j + i] =
-            CONVERT_TO_SHORTPTR(dst)[dst_stride * j + i];
-  } else {
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      for (j = 0; j < txh; j++)
+        for (i = 0; i < txw; i++)
+          pred[diff_stride * j + i] =
+              CONVERT_TO_SHORTPTR(dst)[dst_stride * j + i];
+    } else {
 #endif  // CONFIG_HIGHBITDEPTH
-    for (j = 0; j < txh; j++)
-      for (i = 0; i < txw; i++)
-        pred[diff_stride * j + i] = dst[dst_stride * j + i];
+      for (j = 0; j < txh; j++)
+        for (i = 0; i < txw; i++)
+          pred[diff_stride * j + i] = dst[dst_stride * j + i];
 #if CONFIG_HIGHBITDEPTH
-  }
+    }
 #endif  // CONFIG_HIGHBITDEPTH
+  }
 #endif  // CONFIG_PVQ || CONFIG_DIST_8X8
-#endif  // CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT || CONFIG_MRC_TX
 
   (void)ctx;
 
@@ -631,18 +599,32 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
   txfm_param.tx_size = tx_size;
   txfm_param.lossless = xd->lossless[mbmi->segment_id];
 #if CONFIG_MRC_TX || CONFIG_LGT
-  txfm_param.dst = dst;
-  txfm_param.stride = dst_stride;
-#endif  // CONFIG_MRC_TX || CONFIG_LGT
-#if CONFIG_LGT
   txfm_param.is_inter = is_inter_block(mbmi);
-  txfm_param.mode = get_prediction_mode(xd->mi[0], plane, tx_size, block);
 #endif
+#if CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED
+  txfm_param.dst = dst;
+  txfm_param.stride = dst_stride;
+#if CONFIG_MRC_TX
+  txfm_param.valid_mask = &mbmi->valid_mrc_mask;
+#if SIGNAL_ANY_MRC_MASK
+  txfm_param.mask = BLOCK_OFFSET(xd->mrc_mask, block);
+#endif  // SIGNAL_ANY_MRC_MASK
+#endif  // CONFIG_MRC_TX
+#if CONFIG_LGT_FROM_PRED
+  txfm_param.mode = mbmi->mode;
+  txfm_param.use_lgt = mbmi->use_lgt;
+#endif  // CONFIG_LGT_FROM_PRED
+#endif  // CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED
 
 #if !CONFIG_PVQ
   txfm_param.bd = xd->bd;
   const int is_hbd = get_bitdepth_data_path_index(xd);
+
+#if CONFIG_TXMG
+  av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, &txfm_param);
+#else   // CONFIG_TXMG
   fwd_txfm_func[is_hbd](src_diff, coeff, diff_stride, &txfm_param);
+#endif  // CONFIG_TXMG
 
   if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
     if (LIKELY(!x->skip_block)) {
@@ -705,6 +687,9 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+  uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block);
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
   uint8_t *dst;
 #if !CONFIG_PVQ
   ENTROPY_CONTEXT *a, *l;
@@ -731,10 +716,9 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
   // Assert not magic number (uninitialized).
   assert(x->blk_skip[plane][blk_row * bw + blk_col] != 234);
 
-  if (x->blk_skip[plane][blk_row * bw + blk_col] == 0) {
-#else
-  {
+  if (x->blk_skip[plane][blk_row * bw + blk_col] == 0)
 #endif
+  {
     av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                     ctx, AV1_XFORM_QUANT_FP);
   }
@@ -746,29 +730,35 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
 
 #if !CONFIG_PVQ
   av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, a,
-                 l);
+                 l, 0);
 
   av1_set_txb_context(x, plane, block, tx_size, a, l);
 
   if (p->eobs[block]) *(args->skip) = 0;
 
-  if (p->eobs[block] == 0) return;
+  if (p->eobs[block] != 0)
 #else
   (void)ctx;
   if (!x->pvq_skip[plane]) *(args->skip) = 0;
 
-  if (x->pvq_skip[plane]) return;
+  if (!x->pvq_skip[plane])
 #endif
-  TX_TYPE tx_type =
-      av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col, block, tx_size);
-#if CONFIG_LGT
-  PREDICTION_MODE mode = get_prediction_mode(xd->mi[0], plane, tx_size, block);
-  av1_inverse_transform_block(xd, dqcoeff, mode, tx_type, tx_size, dst,
-                              pd->dst.stride, p->eobs[block]);
-#else
-  av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, dst,
-                              pd->dst.stride, p->eobs[block]);
+  {
+#if CONFIG_LGT_FROM_PRED
+    PREDICTION_MODE mode = xd->mi[0]->mbmi.mode;
+#endif  // CONFIG_LGT_FROM_PRED
+    TX_TYPE tx_type =
+        av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col, block, tx_size);
+    av1_inverse_transform_block(xd, dqcoeff,
+#if CONFIG_LGT_FROM_PRED
+                                mode,
 #endif
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                                mrc_mask,
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                                tx_type, tx_size, dst, pd->dst.stride,
+                                p->eobs[block]);
+  }
 }
 
 #if CONFIG_VAR_TX
@@ -803,7 +793,8 @@ static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
     if (is_qttx) assert(blk_row == 0 && blk_col == 0 && block == 0);
 #else
     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
-    assert(sub_txs < tx_size);
+    assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
+    assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size));
 #endif
     // This is the square transform block partition entry point.
     int bsl = tx_size_wide_unit[sub_txs];
@@ -858,34 +849,36 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
 
   av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                   ctx, AV1_XFORM_QUANT_B);
-#if !CONFIG_PVQ
-  if (p->eobs[block] > 0) {
-#else
+#if CONFIG_PVQ
   if (!x->pvq_skip[plane]) {
-    {
-      int tx_blk_size;
-      int i, j;
-      // transform block size in pixels
-      tx_blk_size = tx_size_wide[tx_size];
+    int tx_blk_size;
+    int i, j;
+    // transform block size in pixels
+    tx_blk_size = tx_size_wide[tx_size];
 
 // Since av1 does not have separate function which does inverse transform
 // but av1_inv_txfm_add_*x*() also does addition of predicted image to
 // inverse transformed image,
 // pass blank dummy image to av1_inv_txfm_add_*x*(), i.e. set dst as zeros
 #if CONFIG_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        for (j = 0; j < tx_blk_size; j++)
-          for (i = 0; i < tx_blk_size; i++)
-            CONVERT_TO_SHORTPTR(dst)[j * pd->dst.stride + i] = 0;
-      } else {
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      for (j = 0; j < tx_blk_size; j++)
+        for (i = 0; i < tx_blk_size; i++)
+          CONVERT_TO_SHORTPTR(dst)[j * pd->dst.stride + i] = 0;
+    } else {
 #endif  // CONFIG_HIGHBITDEPTH
-        for (j = 0; j < tx_blk_size; j++)
-          for (i = 0; i < tx_blk_size; i++) dst[j * pd->dst.stride + i] = 0;
+      for (j = 0; j < tx_blk_size; j++)
+        for (i = 0; i < tx_blk_size; i++) dst[j * pd->dst.stride + i] = 0;
 #if CONFIG_HIGHBITDEPTH
-      }
-#endif  // CONFIG_HIGHBITDEPTH
     }
-#endif  // !CONFIG_PVQ
+#endif  // CONFIG_HIGHBITDEPTH
+  }
+#endif  // CONFIG_PVQ
+
+#if !CONFIG_PVQ
+  if (p->eobs[block] > 0)
+#endif
+  {
     txfm_param.bd = xd->bd;
     txfm_param.tx_type = DCT_DCT;
     txfm_param.eob = p->eobs[block];
@@ -944,7 +937,8 @@ void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
     const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
     const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
     const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
-    const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize);
+    const TX_SIZE max_tx_size = get_vartx_max_txsize(
+        mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y);
     const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
     const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
     const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];
@@ -1059,320 +1053,6 @@ static void encode_block_intra_and_set_context(int plane, int block,
 #endif
 }
 
-#if CONFIG_DPCM_INTRA
-static int get_eob(const tran_low_t *qcoeff, intptr_t n_coeffs,
-                   const int16_t *scan) {
-  int eob = -1;
-  for (int i = (int)n_coeffs - 1; i >= 0; i--) {
-    const int rc = scan[i];
-    if (qcoeff[rc]) {
-      eob = i;
-      break;
-    }
-  }
-  return eob + 1;
-}
-
-static void quantize_scaler(int coeff, int16_t zbin, int16_t round_value,
-                            int16_t quant, int16_t quant_shift, int16_t dequant,
-                            int log_scale, tran_low_t *const qcoeff,
-                            tran_low_t *const dqcoeff) {
-  zbin = ROUND_POWER_OF_TWO(zbin, log_scale);
-  round_value = ROUND_POWER_OF_TWO(round_value, log_scale);
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  if (abs_coeff >= zbin) {
-    int tmp = clamp(abs_coeff + round_value, INT16_MIN, INT16_MAX);
-    tmp = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> (16 - log_scale);
-    *qcoeff = (tmp ^ coeff_sign) - coeff_sign;
-    *dqcoeff = (*qcoeff * dequant) / (1 << log_scale);
-  }
-}
-
-#if CONFIG_HIGHBITDEPTH
-typedef void (*hbd_dpcm_fwd_tx_func)(const int16_t *input, int stride,
-                                     TX_TYPE_1D tx_type, tran_low_t *output,
-                                     int dir);
-
-static hbd_dpcm_fwd_tx_func get_hbd_dpcm_fwd_tx_func(int tx_length) {
-  switch (tx_length) {
-    case 4: return av1_hbd_dpcm_ft4_c;
-    case 8: return av1_hbd_dpcm_ft8_c;
-    case 16: return av1_hbd_dpcm_ft16_c;
-    case 32:
-      return av1_hbd_dpcm_ft32_c;
-    // TODO(huisu): add support for TX_64X64.
-    default: assert(0); return NULL;
-  }
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-typedef void (*dpcm_fwd_tx_func)(const int16_t *input, int stride,
-                                 TX_TYPE_1D tx_type, tran_low_t *output);
-
-static dpcm_fwd_tx_func get_dpcm_fwd_tx_func(int tx_length) {
-  switch (tx_length) {
-    case 4: return av1_dpcm_ft4_c;
-    case 8: return av1_dpcm_ft8_c;
-    case 16: return av1_dpcm_ft16_c;
-    case 32:
-      return av1_dpcm_ft32_c;
-    // TODO(huisu): add support for TX_64X64.
-    default: assert(0); return NULL;
-  }
-}
-
-static void process_block_dpcm_vert(TX_SIZE tx_size, TX_TYPE_1D tx_type_1d,
-                                    struct macroblockd_plane *const pd,
-                                    struct macroblock_plane *const p,
-                                    uint8_t *src, int src_stride, uint8_t *dst,
-                                    int dst_stride, int16_t *src_diff,
-                                    int diff_stride, tran_low_t *coeff,
-                                    tran_low_t *qcoeff, tran_low_t *dqcoeff) {
-  const int tx1d_width = tx_size_wide[tx_size];
-  dpcm_fwd_tx_func forward_tx = get_dpcm_fwd_tx_func(tx1d_width);
-  dpcm_inv_txfm_add_func inverse_tx =
-      av1_get_dpcm_inv_txfm_add_func(tx1d_width);
-  const int tx1d_height = tx_size_high[tx_size];
-  const int log_scale = av1_get_tx_scale(tx_size);
-  int q_idx = 0;
-  for (int r = 0; r < tx1d_height; ++r) {
-    // Update prediction.
-    if (r > 0) memcpy(dst, dst - dst_stride, tx1d_width * sizeof(dst[0]));
-    // Subtraction.
-    for (int c = 0; c < tx1d_width; ++c) src_diff[c] = src[c] - dst[c];
-    // Forward transform.
-    forward_tx(src_diff, 1, tx_type_1d, coeff);
-    // Quantization.
-    for (int c = 0; c < tx1d_width; ++c) {
-      quantize_scaler(coeff[c], p->zbin[q_idx], p->round[q_idx],
-                      p->quant[q_idx], p->quant_shift[q_idx],
-                      pd->dequant[q_idx], log_scale, &qcoeff[c], &dqcoeff[c]);
-      q_idx = 1;
-    }
-    // Inverse transform.
-    inverse_tx(dqcoeff, 1, tx_type_1d, dst);
-    // Move to the next row.
-    coeff += tx1d_width;
-    qcoeff += tx1d_width;
-    dqcoeff += tx1d_width;
-    src_diff += diff_stride;
-    dst += dst_stride;
-    src += src_stride;
-  }
-}
-
-static void process_block_dpcm_horz(TX_SIZE tx_size, TX_TYPE_1D tx_type_1d,
-                                    struct macroblockd_plane *const pd,
-                                    struct macroblock_plane *const p,
-                                    uint8_t *src, int src_stride, uint8_t *dst,
-                                    int dst_stride, int16_t *src_diff,
-                                    int diff_stride, tran_low_t *coeff,
-                                    tran_low_t *qcoeff, tran_low_t *dqcoeff) {
-  const int tx1d_height = tx_size_high[tx_size];
-  dpcm_fwd_tx_func forward_tx = get_dpcm_fwd_tx_func(tx1d_height);
-  dpcm_inv_txfm_add_func inverse_tx =
-      av1_get_dpcm_inv_txfm_add_func(tx1d_height);
-  const int tx1d_width = tx_size_wide[tx_size];
-  const int log_scale = av1_get_tx_scale(tx_size);
-  int q_idx = 0;
-  for (int c = 0; c < tx1d_width; ++c) {
-    for (int r = 0; r < tx1d_height; ++r) {
-      // Update prediction.
-      if (c > 0) dst[r * dst_stride] = dst[r * dst_stride - 1];
-      // Subtraction.
-      src_diff[r * diff_stride] = src[r * src_stride] - dst[r * dst_stride];
-    }
-    // Forward transform.
-    tran_low_t tx_buff[64];
-    forward_tx(src_diff, diff_stride, tx_type_1d, tx_buff);
-    for (int r = 0; r < tx1d_height; ++r) coeff[r * tx1d_width] = tx_buff[r];
-    // Quantization.
-    for (int r = 0; r < tx1d_height; ++r) {
-      quantize_scaler(coeff[r * tx1d_width], p->zbin[q_idx], p->round[q_idx],
-                      p->quant[q_idx], p->quant_shift[q_idx],
-                      pd->dequant[q_idx], log_scale, &qcoeff[r * tx1d_width],
-                      &dqcoeff[r * tx1d_width]);
-      q_idx = 1;
-    }
-    // Inverse transform.
-    for (int r = 0; r < tx1d_height; ++r) tx_buff[r] = dqcoeff[r * tx1d_width];
-    inverse_tx(tx_buff, dst_stride, tx_type_1d, dst);
-    // Move to the next column.
-    ++coeff, ++qcoeff, ++dqcoeff, ++src_diff, ++dst, ++src;
-  }
-}
-
-#if CONFIG_HIGHBITDEPTH
-static void hbd_process_block_dpcm_vert(
-    TX_SIZE tx_size, TX_TYPE_1D tx_type_1d, int bd,
-    struct macroblockd_plane *const pd, struct macroblock_plane *const p,
-    uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride,
-    int16_t *src_diff, int diff_stride, tran_low_t *coeff, tran_low_t *qcoeff,
-    tran_low_t *dqcoeff) {
-  const int tx1d_width = tx_size_wide[tx_size];
-  hbd_dpcm_fwd_tx_func forward_tx = get_hbd_dpcm_fwd_tx_func(tx1d_width);
-  hbd_dpcm_inv_txfm_add_func inverse_tx =
-      av1_get_hbd_dpcm_inv_txfm_add_func(tx1d_width);
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  const int tx1d_height = tx_size_high[tx_size];
-  const int log_scale = av1_get_tx_scale(tx_size);
-  int q_idx = 0;
-  for (int r = 0; r < tx1d_height; ++r) {
-    // Update prediction.
-    if (r > 0) memcpy(dst, dst - dst_stride, tx1d_width * sizeof(dst[0]));
-    // Subtraction.
-    for (int c = 0; c < tx1d_width; ++c) src_diff[c] = src[c] - dst[c];
-    // Forward transform.
-    forward_tx(src_diff, 1, tx_type_1d, coeff, 1);
-    // Quantization.
-    for (int c = 0; c < tx1d_width; ++c) {
-      quantize_scaler(coeff[c], p->zbin[q_idx], p->round[q_idx],
-                      p->quant[q_idx], p->quant_shift[q_idx],
-                      pd->dequant[q_idx], log_scale, &qcoeff[c], &dqcoeff[c]);
-      q_idx = 1;
-    }
-    // Inverse transform.
-    inverse_tx(dqcoeff, 1, tx_type_1d, bd, dst, 1);
-    // Move to the next row.
-    coeff += tx1d_width;
-    qcoeff += tx1d_width;
-    dqcoeff += tx1d_width;
-    src_diff += diff_stride;
-    dst += dst_stride;
-    src += src_stride;
-  }
-}
-
-static void hbd_process_block_dpcm_horz(
-    TX_SIZE tx_size, TX_TYPE_1D tx_type_1d, int bd,
-    struct macroblockd_plane *const pd, struct macroblock_plane *const p,
-    uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride,
-    int16_t *src_diff, int diff_stride, tran_low_t *coeff, tran_low_t *qcoeff,
-    tran_low_t *dqcoeff) {
-  const int tx1d_height = tx_size_high[tx_size];
-  hbd_dpcm_fwd_tx_func forward_tx = get_hbd_dpcm_fwd_tx_func(tx1d_height);
-  hbd_dpcm_inv_txfm_add_func inverse_tx =
-      av1_get_hbd_dpcm_inv_txfm_add_func(tx1d_height);
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  const int tx1d_width = tx_size_wide[tx_size];
-  const int log_scale = av1_get_tx_scale(tx_size);
-  int q_idx = 0;
-  for (int c = 0; c < tx1d_width; ++c) {
-    for (int r = 0; r < tx1d_height; ++r) {
-      // Update prediction.
-      if (c > 0) dst[r * dst_stride] = dst[r * dst_stride - 1];
-      // Subtraction.
-      src_diff[r * diff_stride] = src[r * src_stride] - dst[r * dst_stride];
-    }
-    // Forward transform.
-    tran_low_t tx_buff[64];
-    forward_tx(src_diff, diff_stride, tx_type_1d, tx_buff, 0);
-    for (int r = 0; r < tx1d_height; ++r) coeff[r * tx1d_width] = tx_buff[r];
-    // Quantization.
-    for (int r = 0; r < tx1d_height; ++r) {
-      quantize_scaler(coeff[r * tx1d_width], p->zbin[q_idx], p->round[q_idx],
-                      p->quant[q_idx], p->quant_shift[q_idx],
-                      pd->dequant[q_idx], log_scale, &qcoeff[r * tx1d_width],
-                      &dqcoeff[r * tx1d_width]);
-      q_idx = 1;
-    }
-    // Inverse transform.
-    for (int r = 0; r < tx1d_height; ++r) tx_buff[r] = dqcoeff[r * tx1d_width];
-    inverse_tx(tx_buff, dst_stride, tx_type_1d, bd, dst, 0);
-    // Move to the next column.
-    ++coeff, ++qcoeff, ++dqcoeff, ++src_diff, ++dst, ++src;
-  }
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-void av1_encode_block_intra_dpcm(const AV1_COMMON *cm, MACROBLOCK *x,
-                                 PREDICTION_MODE mode, int plane, int block,
-                                 int blk_row, int blk_col,
-                                 BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                                 TX_TYPE tx_type, ENTROPY_CONTEXT *ta,
-                                 ENTROPY_CONTEXT *tl, int8_t *skip) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *const p = &x->plane[plane];
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const int diff_stride = block_size_wide[plane_bsize];
-  const int src_stride = p->src.stride;
-  const int dst_stride = pd->dst.stride;
-  const int tx1d_width = tx_size_wide[tx_size];
-  const int tx1d_height = tx_size_high[tx_size];
-  const SCAN_ORDER *const scan_order =
-      get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi);
-  tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
-  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  uint8_t *dst =
-      &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-  uint8_t *src =
-      &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
-  int16_t *src_diff =
-      &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
-  uint16_t *eob = &p->eobs[block];
-  *eob = 0;
-  memset(qcoeff, 0, tx1d_height * tx1d_width * sizeof(*qcoeff));
-  memset(dqcoeff, 0, tx1d_height * tx1d_width * sizeof(*dqcoeff));
-
-  if (LIKELY(!x->skip_block)) {
-    TX_TYPE_1D tx_type_1d = DCT_1D;
-    switch (tx_type) {
-      case IDTX: tx_type_1d = IDTX_1D; break;
-      case V_DCT:
-        assert(mode == H_PRED);
-        tx_type_1d = DCT_1D;
-        break;
-      case H_DCT:
-        assert(mode == V_PRED);
-        tx_type_1d = DCT_1D;
-        break;
-      default: assert(0);
-    }
-    switch (mode) {
-      case V_PRED:
-#if CONFIG_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          hbd_process_block_dpcm_vert(tx_size, tx_type_1d, xd->bd, pd, p, src,
-                                      src_stride, dst, dst_stride, src_diff,
-                                      diff_stride, coeff, qcoeff, dqcoeff);
-        } else {
-#endif  // CONFIG_HIGHBITDEPTH
-          process_block_dpcm_vert(tx_size, tx_type_1d, pd, p, src, src_stride,
-                                  dst, dst_stride, src_diff, diff_stride, coeff,
-                                  qcoeff, dqcoeff);
-#if CONFIG_HIGHBITDEPTH
-        }
-#endif  // CONFIG_HIGHBITDEPTH
-        break;
-      case H_PRED:
-#if CONFIG_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          hbd_process_block_dpcm_horz(tx_size, tx_type_1d, xd->bd, pd, p, src,
-                                      src_stride, dst, dst_stride, src_diff,
-                                      diff_stride, coeff, qcoeff, dqcoeff);
-        } else {
-#endif  // CONFIG_HIGHBITDEPTH
-          process_block_dpcm_horz(tx_size, tx_type_1d, pd, p, src, src_stride,
-                                  dst, dst_stride, src_diff, diff_stride, coeff,
-                                  qcoeff, dqcoeff);
-#if CONFIG_HIGHBITDEPTH
-        }
-#endif  // CONFIG_HIGHBITDEPTH
-        break;
-      default: assert(0);
-    }
-    *eob = get_eob(qcoeff, tx1d_height * tx1d_width, scan_order->scan);
-  }
-
-  ta[blk_col] = tl[blk_row] = *eob > 0;
-  if (*eob) *skip = 0;
-}
-#endif  // CONFIG_DPCM_INTRA
-
 void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                             void *arg) {
@@ -1383,6 +1063,9 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+  uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block);
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
   PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_TYPE tx_type =
       av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
@@ -1391,21 +1074,8 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
   uint8_t *dst =
       &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
 
-  av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size);
-
-#if CONFIG_DPCM_INTRA || CONFIG_LGT
-  const PREDICTION_MODE mode =
-      get_prediction_mode(xd->mi[0], plane, tx_size, block);
-#if CONFIG_DPCM_INTRA
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  if (av1_use_dpcm_intra(plane, mode, tx_type, mbmi)) {
-    av1_encode_block_intra_dpcm(cm, x, mode, plane, block, blk_row, blk_col,
-                                plane_bsize, tx_size, tx_type, args->ta,
-                                args->tl, args->skip);
-    return;
-  }
-#endif  // CONFIG_DPCM_INTRA
-#endif  // CONFIG_DPCM_INTRA || CONFIG_LGT
+  av1_predict_intra_block_facade(cm, xd, plane, block, blk_col, blk_row,
+                                 tx_size);
 
   av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
 
@@ -1416,7 +1086,7 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
     av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                     ctx, AV1_XFORM_QUANT_FP);
     av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
-                   a, l);
+                   a, l, 0);
   } else {
     av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                     ctx, AV1_XFORM_QUANT_B);
@@ -1429,9 +1099,12 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
   if (x->pvq_skip[plane]) return;
 #endif  // CONFIG_PVQ
   av1_inverse_transform_block(xd, dqcoeff,
-#if CONFIG_LGT
-                              mode,
+#if CONFIG_LGT_FROM_PRED
+                              xd->mi[0]->mbmi.mode,
 #endif
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                              mrc_mask,
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
                               tx_type, tx_size, dst, dst_stride, *eob);
 #if !CONFIG_PVQ
   if (*eob) *(args->skip) = 0;
@@ -1439,12 +1112,10 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
 // Note : *(args->skip) == mbmi->skip
 #endif
 #if CONFIG_CFL
-  if (plane == AOM_PLANE_Y && x->cfl_store_y) {
-    // TODO (ltrudeau) Store sub-8x8 inter blocks when bottom right block is
-    // intra predicted.
-    cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size, plane_bsize);
+  if (plane == AOM_PLANE_Y && xd->cfl->store_y) {
+    cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
   }
-#endif
+#endif  // CONFIG_CFL
 }
 
 void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x,
@@ -1483,7 +1154,7 @@ PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff,
                                     tran_low_t *ref_coeff,
                                     tran_low_t *const dqcoeff, uint16_t *eob,
                                     const int16_t *quant, int plane,
-                                    int tx_size, TX_TYPE tx_type, int *rate,
+                                    TX_SIZE tx_size, TX_TYPE tx_type, int *rate,
                                     int speed, PVQ_INFO *pvq_info) {
   const int tx_blk_size = tx_size_wide[tx_size];
   daala_enc_ctx *daala_enc = &x->daala_enc;
@@ -1512,10 +1183,11 @@ PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff,
   // DC quantizer for PVQ
   if (use_activity_masking)
     pvq_dc_quant =
-        OD_MAXI(1, (quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift) *
-                           daala_enc->state
-                               .pvq_qm_q4[plane][od_qm_get_index(tx_size, 0)] >>
-                       4);
+        OD_MAXI(1,
+                (quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift) *
+                        daala_enc->state
+                            .pvq_qm_q4[plane][od_qm_get_index(tx_size, 0)] >>
+                    4);
   else
     pvq_dc_quant =
         OD_MAXI(1, quant[0] << (OD_COEFF_SHIFT - 3) >> hbd_downshift);
@@ -1549,18 +1221,19 @@ PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff,
     out_int32[0] = OD_DIV_R0(in_int32[0] - ref_int32[0], pvq_dc_quant);
   }
 
-  ac_dc_coded =
-      od_pvq_encode(daala_enc, ref_int32, in_int32, out_int32,
-                    OD_MAXI(1, quant[0] << (OD_COEFF_SHIFT - 3) >>
-                                   hbd_downshift),  // scale/quantizer
-                    OD_MAXI(1, quant[1] << (OD_COEFF_SHIFT - 3) >>
-                                   hbd_downshift),  // scale/quantizer
-                    plane,
-                    tx_size, OD_PVQ_BETA[use_activity_masking][plane][tx_size],
-                    0,  // is_keyframe,
-                    daala_enc->state.qm + off, daala_enc->state.qm_inv + off,
-                    speed,  // speed
-                    pvq_info);
+  ac_dc_coded = od_pvq_encode(
+      daala_enc, ref_int32, in_int32, out_int32,
+      OD_MAXI(1,
+              quant[0] << (OD_COEFF_SHIFT - 3) >>
+                  hbd_downshift),  // scale/quantizer
+      OD_MAXI(1,
+              quant[1] << (OD_COEFF_SHIFT - 3) >>
+                  hbd_downshift),  // scale/quantizer
+      plane, tx_size, OD_PVQ_BETA[use_activity_masking][plane][tx_size],
+      0,  // is_keyframe,
+      daala_enc->state.qm + off, daala_enc->state.qm_inv + off,
+      speed,  // speed
+      pvq_info);
 
   // Encode residue of DC coeff, if required.
   if (!has_dc_skip || out_int32[0]) {
diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h
index 65476bcae..c817a94f0 100644
--- a/third_party/aom/av1/encoder/encodemb.h
+++ b/third_party/aom/av1/encoder/encodemb.h
@@ -56,15 +56,17 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
 int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int blk_row,
                    int blk_col, int block, BLOCK_SIZE plane_bsize,
                    TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
-                   const ENTROPY_CONTEXT *l);
+                   const ENTROPY_CONTEXT *l, int fast_mode);
 
 void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
                       int blk_col, int blk_row, TX_SIZE tx_size);
 
 void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
+#if !CONFIG_PVQ
 void av1_set_txb_context(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
                          ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l);
+#endif
 
 void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
@@ -79,7 +81,7 @@ PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff,
                                     tran_low_t *ref_coeff,
                                     tran_low_t *const dqcoeff, uint16_t *eob,
                                     const int16_t *quant, int plane,
-                                    int tx_size, TX_TYPE tx_type, int *rate,
+                                    TX_SIZE tx_size, TX_TYPE tx_type, int *rate,
                                     int speed, PVQ_INFO *pvq_info);
 
 void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k,
@@ -87,15 +89,6 @@ void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k,
                             int *size, int skip_rest, int skip_dir, int bs);
 #endif
 
-#if CONFIG_DPCM_INTRA
-void av1_encode_block_intra_dpcm(const AV1_COMMON *cm, MACROBLOCK *x,
-                                 PREDICTION_MODE mode, int plane, int block,
-                                 int blk_row, int blk_col,
-                                 BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                                 TX_TYPE tx_type, ENTROPY_CONTEXT *ta,
-                                 ENTROPY_CONTEXT *tl, int8_t *skip);
-#endif  // CONFIG_DPCM_INTRA
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c
index fd61fe6b2..f8a546999 100644
--- a/third_party/aom/av1/encoder/encodemv.c
+++ b/third_party/aom/av1/encoder/encodemv.c
@@ -62,17 +62,22 @@ static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
   } else {
     int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
+#if CONFIG_NEW_MULTISYMBOL
+    for (i = 0; i < n; ++i)
+      aom_write_symbol(w, (d >> i) & 1, mvcomp->bits_cdf[(i + 1) / 2], 2);
+#else
     for (i = 0; i < n; ++i) aom_write(w, (d >> i) & 1, mvcomp->bits[i]);
+#endif
   }
-
 // Fractional bits
-#if CONFIG_INTRABC
+#if CONFIG_INTRABC || CONFIG_AMVR
   if (precision > MV_SUBPEL_NONE)
-#endif  // CONFIG_INTRABC
+#endif  // CONFIG_INTRABC || CONFIG_AMVR
   {
-    aom_write_symbol(w, fr, mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d]
-                                                   : mvcomp->fp_cdf,
-                     MV_FP_SIZE);
+    aom_write_symbol(
+        w, fr,
+        mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
+        MV_FP_SIZE);
   }
 
   // High precision bit
@@ -129,9 +134,9 @@ static void build_nmv_component_cost_table(int *mvcost,
       const int b = c + CLASS0_BITS - 1; /* number of bits */
       for (i = 0; i < b; ++i) cost += bits_cost[i][((d >> i) & 1)];
     }
-#if CONFIG_INTRABC
+#if CONFIG_INTRABC || CONFIG_AMVR
     if (precision > MV_SUBPEL_NONE)
-#endif  // CONFIG_INTRABC
+#endif  // CONFIG_INTRABC || CONFIG_AMVR
     {
       if (c == MV_CLASS_0) {
         cost += class0_fp_cost[d][f];
@@ -165,6 +170,11 @@ void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w,
                          nmv_context_counts *const nmv_counts) {
   int i;
   int nmv_ctx = 0;
+#if CONFIG_AMVR
+  if (cm->cur_frame_mv_precision_level) {
+    return;
+  }
+#endif
   for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) {
     nmv_context *const mvc = &cm->fc->nmvc[nmv_ctx];
     nmv_context_counts *const counts = &nmv_counts[nmv_ctx];
@@ -184,6 +194,11 @@ void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
                    nmv_context *mvctx, int usehp) {
   const MV diff = { mv->row - ref->row, mv->col - ref->col };
   const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
+#if CONFIG_AMVR
+  if (cpi->common.cur_frame_mv_precision_level) {
+    usehp = MV_SUBPEL_NONE;
+  }
+#endif
   aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS);
   if (mv_joint_vertical(j))
     encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
@@ -222,10 +237,14 @@ void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
   build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], precision);
 }
 
-#if CONFIG_EXT_INTER
 static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
                     const int_mv mvs[2], const int_mv pred_mvs[2],
-                    nmv_context_counts *nmv_counts) {
+                    nmv_context_counts *nmv_counts
+#if CONFIG_AMVR
+                    ,
+                    MvSubpelPrecision precision
+#endif
+                    ) {
   int i;
   PREDICTION_MODE mode = mbmi->mode;
 
@@ -240,7 +259,11 @@ static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
                       mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
       nmv_context_counts *counts = &nmv_counts[nmv_ctx];
       (void)pred_mvs;
+#if CONFIG_AMVR
+      av1_inc_mv(&diff, counts, precision);
+#else
       av1_inc_mv(&diff, counts, 1);
+#endif
     }
   } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
     const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv;
@@ -251,7 +274,11 @@ static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
         av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
                     mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
     nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#if CONFIG_AMVR
+    av1_inc_mv(&diff, counts, precision);
+#else
     av1_inc_mv(&diff, counts, 1);
+#endif
   } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
     const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv;
     const MV diff = { mvs[0].as_mv.row - ref->row,
@@ -261,7 +288,11 @@ static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
         av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
                     mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
     nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#if CONFIG_AMVR
+    av1_inc_mv(&diff, counts, precision);
+#else
     av1_inc_mv(&diff, counts, 1);
+#endif
 #if CONFIG_COMPOUND_SINGLEREF
   } else {
     assert(  // mode == SR_NEAREST_NEWMV ||
@@ -288,7 +319,12 @@ static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
 
 static void inc_mvs_sub8x8(const MODE_INFO *mi, int block, const int_mv mvs[2],
                            const MB_MODE_INFO_EXT *mbmi_ext,
-                           nmv_context_counts *nmv_counts) {
+                           nmv_context_counts *nmv_counts
+#if CONFIG_AMVR
+                           ,
+                           MvSubpelPrecision precision
+#endif
+                           ) {
   int i;
   PREDICTION_MODE mode = mi->bmi[block].as_mode;
   const MB_MODE_INFO *mbmi = &mi->mbmi;
@@ -303,7 +339,11 @@ static void inc_mvs_sub8x8(const MODE_INFO *mi, int block, const int_mv mvs[2],
           av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
                       mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
       nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#if CONFIG_AMVR
+      av1_inc_mv(&diff, counts, precision);
+#else
       av1_inc_mv(&diff, counts, 1);
+#endif
     }
   } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
     const MV *ref = &mi->bmi[block].ref_mv[1].as_mv;
@@ -314,7 +354,11 @@ static void inc_mvs_sub8x8(const MODE_INFO *mi, int block, const int_mv mvs[2],
         av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
                     mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
     nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#if CONFIG_AMVR
+    av1_inc_mv(&diff, counts, precision);
+#else
     av1_inc_mv(&diff, counts, 1);
+#endif
   } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
     const MV *ref = &mi->bmi[block].ref_mv[0].as_mv;
     const MV diff = { mvs[0].as_mv.row - ref->row,
@@ -324,28 +368,13 @@ static void inc_mvs_sub8x8(const MODE_INFO *mi, int block, const int_mv mvs[2],
         av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
                     mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
     nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#if CONFIG_AMVR
+    av1_inc_mv(&diff, counts, precision);
+#else
     av1_inc_mv(&diff, counts, 1);
+#endif
   }
 }
-#else   // !CONFIG_EXT_INTER
-static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
-                    const int_mv mvs[2], const int_mv pred_mvs[2],
-                    nmv_context_counts *nmv_counts) {
-  int i;
-
-  for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
-    int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
-    int nmv_ctx =
-        av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
-                    mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
-    nmv_context_counts *counts = &nmv_counts[nmv_ctx];
-    const MV *ref = &pred_mvs[i].as_mv;
-    const MV diff = { mvs[i].as_mv.row - ref->row,
-                      mvs[i].as_mv.col - ref->col };
-    av1_inc_mv(&diff, counts, 1);
-  }
-}
-#endif  // CONFIG_EXT_INTER
 
 void av1_update_mv_count(ThreadData *td) {
   const MACROBLOCKD *xd = &td->mb.e_mbd;
@@ -357,6 +386,12 @@ void av1_update_mv_count(ThreadData *td) {
 #else
   const int unify_bsize = 0;
 #endif
+#if CONFIG_AMVR
+  MvSubpelPrecision precision = 1;
+  if (xd->cur_frame_mv_precision_level) {
+    precision = MV_SUBPEL_NONE;
+  }
+#endif
 
   if (mbmi->sb_type < BLOCK_8X8 && !unify_bsize) {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type];
@@ -367,22 +402,24 @@ void av1_update_mv_count(ThreadData *td) {
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         const int i = idy * 2 + idx;
 
-#if CONFIG_EXT_INTER
         if (have_newmv_in_inter_mode(mi->bmi[i].as_mode))
-          inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv, mbmi_ext, td->counts->mv);
+
+#if CONFIG_AMVR
+          inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv, mbmi_ext, td->counts->mv,
+                         precision);
 #else
-        if (mi->bmi[i].as_mode == NEWMV)
-          inc_mvs(mbmi, mbmi_ext, mi->bmi[i].as_mv, mi->bmi[i].pred_mv,
-                  td->counts->mv);
-#endif  // CONFIG_EXT_INTER
+          inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv, mbmi_ext, td->counts->mv);
+#endif
       }
     }
   } else {
-#if CONFIG_EXT_INTER
     if (have_newmv_in_inter_mode(mbmi->mode))
+
+#if CONFIG_AMVR
+      inc_mvs(mbmi, mbmi_ext, mbmi->mv, mbmi->pred_mv, td->counts->mv,
+              precision);
 #else
-    if (mbmi->mode == NEWMV)
-#endif  // CONFIG_EXT_INTER
       inc_mvs(mbmi, mbmi_ext, mbmi->mv, mbmi->pred_mv, td->counts->mv);
+#endif
   }
 }
diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
index 943e2c6a0..e9ab3c87f 100644
--- a/third_party/aom/av1/encoder/encoder.c
+++ b/third_party/aom/av1/encoder/encoder.c
@@ -45,11 +45,18 @@
 #endif
 #include "av1/encoder/ethread.h"
 #include "av1/encoder/firstpass.h"
+#if CONFIG_HASH_ME
+#include "av1/encoder/hash_motion.h"
+#endif
 #include "av1/encoder/mbgraph.h"
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+#include "av1/common/ncobmc_kernels.h"
+#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
 #include "av1/encoder/picklpf.h"
 #if CONFIG_LOOP_RESTORATION
 #include "av1/encoder/pickrst.h"
 #endif  // CONFIG_LOOP_RESTORATION
+#include "av1/encoder/random.h"
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/segmentation.h"
@@ -90,6 +97,7 @@ FRAME_COUNTS aggregate_fc_per_type[FRAME_CONTEXTS];
                                        // mv. Choose a very high value for
                                        // now so that HIGH_PRECISION is always
                                        // chosen.
+
 // #define OUTPUT_YUV_REC
 #ifdef OUTPUT_YUV_DENOISED
 FILE *yuv_denoised_file = NULL;
@@ -172,14 +180,37 @@ static void apply_active_map(AV1_COMP *cpi) {
         if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
       av1_enable_segmentation(seg);
       av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+#if CONFIG_LOOPFILTER_LEVEL
+      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
+      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
+      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
+      av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
+
+      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H,
+                      -MAX_LOOP_FILTER);
+      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V,
+                      -MAX_LOOP_FILTER);
+      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U,
+                      -MAX_LOOP_FILTER);
+      av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V,
+                      -MAX_LOOP_FILTER);
+#else
       av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
       // Setting the data to -MAX_LOOP_FILTER will result in the computed loop
       // filter level being zero regardless of the value of seg->abs_delta.
       av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF,
                       -MAX_LOOP_FILTER);
+#endif  // CONFIG_LOOPFILTER_LEVEL
     } else {
       av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+#if CONFIG_LOOPFILTER_LEVEL
+      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
+      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
+      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
+      av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
+#else
       av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+#endif  // CONFIG_LOOPFILTER_LEVEL
       if (seg->enabled) {
         seg->update_data = 1;
         seg->update_map = 1;
@@ -246,11 +277,21 @@ int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
   }
 }
 
-void av1_set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv) {
+static void set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv
+#if CONFIG_AMVR
+                                  ,
+                                  int cur_frame_mv_precision_level
+#endif
+                                  ) {
   MACROBLOCK *const mb = &cpi->td.mb;
   cpi->common.allow_high_precision_mv = allow_high_precision_mv;
 
+#if CONFIG_AMVR
+  if (cpi->common.allow_high_precision_mv &&
+      cur_frame_mv_precision_level == 0) {
+#else
   if (cpi->common.allow_high_precision_mv) {
+#endif
     int i;
     for (i = 0; i < NMV_CONTEXTS; ++i) {
       mb->mv_cost_stack[i] = mb->nmvcost_hp[i];
@@ -296,13 +337,17 @@ static void setup_frame(AV1_COMP *cpi) {
   if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
     av1_setup_past_independence(cm);
   } else {
+#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
+// Just use frame context from first signaled reference frame.
+// This will always be LAST_FRAME for now.
+#else
 #if CONFIG_EXT_REFS
     const GF_GROUP *gf_group = &cpi->twopass.gf_group;
-    if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+    if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE)
       cm->frame_context_idx = EXT_ARF_FRAME;
     else if (cpi->refresh_alt_ref_frame)
       cm->frame_context_idx = ARF_FRAME;
-#else
+#else   // !CONFIG_EXT_REFS
     if (cpi->refresh_alt_ref_frame) cm->frame_context_idx = ARF_FRAME;
 #endif  // CONFIG_EXT_REFS
     else if (cpi->rc.is_src_frame_alt_ref)
@@ -315,32 +360,56 @@ static void setup_frame(AV1_COMP *cpi) {
 #endif  // CONFIG_EXT_REFS
     else
       cm->frame_context_idx = REGULAR_FRAME;
+#endif  // CONFIG_NO_FRAME_CONTEXT_SIGNALING
   }
 
   if (cm->frame_type == KEY_FRAME) {
     cpi->refresh_golden_frame = 1;
     cpi->refresh_alt_ref_frame = 1;
     av1_zero(cpi->interp_filter_selected);
+    set_sb_size(cm, select_sb_size(cpi));
+#if CONFIG_REFERENCE_BUFFER
+    set_use_reference_buffer(cm, 0);
+#endif  // CONFIG_REFERENCE_BUFFER
   } else {
+#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
+    if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+        cm->frame_refs[0].idx < 0) {
+      *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS];
+    } else {
+      *cm->fc = cm->frame_contexts[cm->frame_refs[0].idx];
+    }
+#else
     *cm->fc = cm->frame_contexts[cm->frame_context_idx];
+#endif  // CONFIG_NO_FRAME_CONTEXT_SIGNALING
     av1_zero(cpi->interp_filter_selected[0]);
   }
 #if CONFIG_EXT_REFS
-#if CONFIG_ONE_SIDED_COMPOUND  // No change to bitstream
+#if CONFIG_ONE_SIDED_COMPOUND && \
+    !CONFIG_EXT_COMP_REFS  // No change to bitstream
   if (cpi->sf.recode_loop == DISALLOW_RECODE) {
     cpi->refresh_bwd_ref_frame = cpi->refresh_last_frame;
     cpi->rc.is_bipred_frame = 1;
   }
-#endif
-#endif
+#endif  // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS
+#endif  // CONFIG_EXT_REFS
+#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+      cm->frame_refs[0].idx < 0) {
+    // use default frame context values
+    cm->pre_fc = &cm->frame_contexts[FRAME_CONTEXT_DEFAULTS];
+  } else {
+    *cm->fc = cm->frame_contexts[cm->frame_refs[0].idx];
+    cm->pre_fc = &cm->frame_contexts[cm->frame_refs[0].idx];
+  }
+#else
   cm->pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+#endif  // CONFIG_NO_FRAME_CONTEXT_SIGNALING
 
   cpi->vaq_refresh = 0;
-
-  set_sb_size(cm, select_sb_size(cpi));
 }
 
-static void av1_enc_setup_mi(AV1_COMMON *cm) {
+static void enc_setup_mi(AV1_COMMON *cm) {
   int i;
   cm->mi = cm->mip + cm->mi_stride + 1;
   memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
@@ -350,7 +419,6 @@ static void av1_enc_setup_mi(AV1_COMMON *cm) {
   // Clear left border column
   for (i = 1; i < cm->mi_rows + 1; ++i)
     memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip));
-
   cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
   cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
 
@@ -358,7 +426,7 @@ static void av1_enc_setup_mi(AV1_COMMON *cm) {
          cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
 }
 
-static int av1_enc_alloc_mi(AV1_COMMON *cm, int mi_size) {
+static int enc_alloc_mi(AV1_COMMON *cm, int mi_size) {
   cm->mip = aom_calloc(mi_size, sizeof(*cm->mip));
   if (!cm->mip) return 1;
   cm->prev_mip = aom_calloc(mi_size, sizeof(*cm->prev_mip));
@@ -374,7 +442,7 @@ static int av1_enc_alloc_mi(AV1_COMMON *cm, int mi_size) {
   return 0;
 }
 
-static void av1_enc_free_mi(AV1_COMMON *cm) {
+static void enc_free_mi(AV1_COMMON *cm) {
   aom_free(cm->mip);
   cm->mip = NULL;
   aom_free(cm->prev_mip);
@@ -383,9 +451,10 @@ static void av1_enc_free_mi(AV1_COMMON *cm) {
   cm->mi_grid_base = NULL;
   aom_free(cm->prev_mi_grid_base);
   cm->prev_mi_grid_base = NULL;
+  cm->mi_alloc_size = 0;
 }
 
-static void av1_swap_mi_and_prev_mi(AV1_COMMON *cm) {
+static void swap_mi_and_prev_mi(AV1_COMMON *cm) {
   // Current mip will be the prev_mip for the next frame.
   MODE_INFO **temp_base = cm->prev_mi_grid_base;
   MODE_INFO *temp = cm->prev_mip;
@@ -416,18 +485,31 @@ void av1_initialize_enc(void) {
 #endif
     av1_entropy_mv_init();
     av1_encode_token_init();
-#if CONFIG_EXT_INTER
     av1_init_wedge_masks();
-#endif
     init_done = 1;
   }
 }
 
+static void dealloc_context_buffers_ext(AV1_COMP *cpi) {
+  if (cpi->mbmi_ext_base) {  // Only release a buffer that is actually held.
+    aom_free(cpi->mbmi_ext_base);
+    cpi->mbmi_ext_base = NULL;  // Makes a repeated call skip the free.
+  }
+}
+
+static void alloc_context_buffers_ext(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  int mi_size = cm->mi_cols * cm->mi_rows;  // Element count of the new buffer.
+  // Reallocate mbmi_ext_base: drop any old buffer before requesting a new one.
+  dealloc_context_buffers_ext(cpi);
+  CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base,
+                  aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)));
+}
+
 static void dealloc_compressor_data(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
 
-  aom_free(cpi->mbmi_ext_base);
-  cpi->mbmi_ext_base = NULL;
+  dealloc_context_buffers_ext(cpi);
 
 #if CONFIG_PVQ
   if (cpi->oxcf.pass != 1) {
@@ -498,9 +580,7 @@ static void dealloc_compressor_data(AV1_COMP *cpi) {
 
   av1_free_pc_tree(&cpi->td);
 
-#if CONFIG_PALETTE
   aom_free(cpi->td.mb.palette_buffer);
-#endif  // CONFIG_PALETTE
 
 #if CONFIG_ANS
   aom_buf_ans_free(&cpi->buf_ans);
@@ -593,10 +673,22 @@ static void configure_static_seg_features(AV1_COMP *cpi) {
       qi_delta =
           av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, cm->bit_depth);
       av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
+#if CONFIG_LOOPFILTER_LEVEL
+      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
+      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
+      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
+      av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2);
+
+      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H);
+      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
+      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
+      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
+#else
       av1_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
+      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
+#endif  // CONFIG_LOOPFILTER_LEVEL
 
       av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
-      av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
 
       // Where relevant assume segment data is delta data
       seg->abs_delta = SEGMENT_DELTADATA;
@@ -617,8 +709,20 @@ static void configure_static_seg_features(AV1_COMP *cpi) {
         av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
         av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
 
+#if CONFIG_LOOPFILTER_LEVEL
+        av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
+        av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
+        av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
+        av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2);
+
+        av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H);
+        av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
+        av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
+        av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
+#else
         av1_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
         av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
+#endif  // CONFIG_LOOPFILTER_LEVEL
 
         // Segment coding disabled for compred testing
         if (high_q || (cpi->static_mb_pct == 100)) {
@@ -781,15 +885,7 @@ static void alloc_util_frame_buffers(AV1_COMP *cpi) {
                        "Failed to allocate scaled last source buffer");
 }
 
-static void alloc_context_buffers_ext(AV1_COMP *cpi) {
-  AV1_COMMON *cm = &cpi->common;
-  int mi_size = cm->mi_cols * cm->mi_rows;
-
-  CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base,
-                  aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)));
-}
-
-void av1_alloc_compressor_data(AV1_COMP *cpi) {
+static void alloc_compressor_data(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
 
   av1_alloc_context_buffers(cm, cm->width, cm->height);
@@ -806,9 +902,6 @@ void av1_alloc_compressor_data(AV1_COMP *cpi) {
     unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
     CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
                     aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
-#if CONFIG_ANS && !ANS_MAX_SYMBOLS
-    aom_buf_ans_alloc(&cpi->buf_ans, &cm->error, (int)tokens);
-#endif  // CONFIG_ANS
   }
 
   av1_setup_pc_tree(&cpi->common, &cpi->td);
@@ -821,10 +914,61 @@ void av1_new_framerate(AV1_COMP *cpi, double framerate) {
   cpi->od_rc.framerate = cpi->framerate;
   od_enc_rc_resize(&cpi->od_rc);
 #else
-  av1_rc_update_framerate(cpi);
+  av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height);
 #endif
 }
 
+#if CONFIG_MAX_TILE
+// Sets the tile column/row layout on cm from cpi->oxcf (uniform or explicit).
+static void set_tile_info_max_tile(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  int i, start_sb;
+  // NOTE(review): presumably fills the cm tile limits read below -- confirm.
+  av1_get_tile_limits(cm);
+  // Configure tile columns. With either explicit size list empty, clamp the
+  // requested log2 column count; otherwise lay tiles out from the width list.
+  if (cpi->oxcf.tile_width_count == 0 || cpi->oxcf.tile_height_count == 0) {
+    cm->uniform_tile_spacing_flag = 1;
+    cm->log2_tile_cols = AOMMAX(cpi->oxcf.tile_columns, cm->min_log2_tile_cols);
+    cm->log2_tile_cols = AOMMIN(cm->log2_tile_cols, cm->max_log2_tile_cols);
+  } else {
+    int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+    int sb_cols = mi_cols >> MAX_MIB_SIZE_LOG2;
+    int size_sb, j = 0;
+    cm->uniform_tile_spacing_flag = 0;
+    for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) {
+      cm->tile_col_start_sb[i] = start_sb;
+      size_sb = cpi->oxcf.tile_widths[j++];  // NOTE(review): 0 stalls start_sb.
+      if (j >= cpi->oxcf.tile_width_count) j = 0;  // Reuse the list cyclically.
+      start_sb += AOMMIN(size_sb, MAX_TILE_WIDTH_SB);  // Cap each tile width.
+    }
+    cm->tile_cols = i;
+    cm->tile_col_start_sb[i] = sb_cols;  // End marker after the last tile.
+  }
+  av1_calculate_tile_cols(cm);
+  // Configure tile rows, following the spacing mode chosen for the columns:
+  // clamp the requested log2 row count, or lay tiles out from the height list.
+  if (cm->uniform_tile_spacing_flag) {
+    cm->log2_tile_rows = AOMMAX(cpi->oxcf.tile_rows, cm->min_log2_tile_rows);
+    cm->log2_tile_rows = AOMMIN(cm->log2_tile_rows, cm->max_log2_tile_rows);
+  } else {
+    int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+    int sb_rows = mi_rows >> MAX_MIB_SIZE_LOG2;
+    int size_sb, j = 0;
+    for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) {
+      cm->tile_row_start_sb[i] = start_sb;
+      size_sb = cpi->oxcf.tile_heights[j++];  // NOTE(review): 0 stalls start_sb.
+      if (j >= cpi->oxcf.tile_height_count) j = 0;  // Reuse the list cyclically.
+      start_sb += AOMMIN(size_sb, cm->max_tile_height_sb);  // Cap tile height.
+    }
+    cm->tile_rows = i;
+    cm->tile_row_start_sb[i] = sb_rows;  // End marker after the last tile.
+  }
+  av1_calculate_tile_rows(cm);
+}
+
+#endif
+
 static void set_tile_info(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
 #if CONFIG_DEPENDENT_HORZTILES
@@ -866,24 +1010,22 @@ static void set_tile_info(AV1_COMP *cpi) {
     while (cm->tile_rows * cm->tile_height < cm->mi_rows) ++cm->tile_rows;
   } else {
 #endif  // CONFIG_EXT_TILE
-    int min_log2_tile_cols, max_log2_tile_cols;
-    av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
-
-    cm->log2_tile_cols =
-        clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
-    cm->log2_tile_rows = cpi->oxcf.tile_rows;
-
-    cm->tile_cols = 1 << cm->log2_tile_cols;
-    cm->tile_rows = 1 << cm->log2_tile_rows;
 
-    cm->tile_width = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
-    cm->tile_width >>= cm->log2_tile_cols;
-    cm->tile_height = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
-    cm->tile_height >>= cm->log2_tile_rows;
-
-    // round to integer multiples of max superblock size
-    cm->tile_width = ALIGN_POWER_OF_TWO(cm->tile_width, MAX_MIB_SIZE_LOG2);
-    cm->tile_height = ALIGN_POWER_OF_TWO(cm->tile_height, MAX_MIB_SIZE_LOG2);
+#if CONFIG_MAX_TILE
+    set_tile_info_max_tile(cpi);
+#else
+  int min_log2_tile_cols, max_log2_tile_cols;
+  av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+
+  cm->log2_tile_cols =
+      clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
+  cm->log2_tile_rows = cpi->oxcf.tile_rows;
+
+  cm->tile_width =
+      get_tile_size(cm->mi_cols, cm->log2_tile_cols, &cm->tile_cols);
+  cm->tile_height =
+      get_tile_size(cm->mi_rows, cm->log2_tile_rows, &cm->tile_rows);
+#endif  // CONFIG_MAX_TILE
 #if CONFIG_EXT_TILE
   }
 #endif  // CONFIG_EXT_TILE
@@ -952,7 +1094,6 @@ static void update_frame_size(AV1_COMP *cpi) {
                        NULL);
   memset(cpi->mbmi_ext_base, 0,
          cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
-
   set_tile_info(cpi);
 }
 
@@ -963,14 +1104,21 @@ static void init_buffer_indices(AV1_COMP *cpi) {
     cpi->lst_fb_idxes[fb_idx] = fb_idx;
   cpi->gld_fb_idx = LAST_REF_FRAMES;
   cpi->bwd_fb_idx = LAST_REF_FRAMES + 1;
-  cpi->alt_fb_idx = LAST_REF_FRAMES + 2;
+  cpi->alt2_fb_idx = LAST_REF_FRAMES + 2;
+  cpi->alt_fb_idx = LAST_REF_FRAMES + 3;
+  cpi->ext_fb_idx = LAST_REF_FRAMES + 4;
   for (fb_idx = 0; fb_idx < MAX_EXT_ARFS + 1; ++fb_idx)
     cpi->arf_map[fb_idx] = LAST_REF_FRAMES + 2 + fb_idx;
-#else
+#else   // !CONFIG_EXT_REFS
   cpi->lst_fb_idx = 0;
   cpi->gld_fb_idx = 1;
   cpi->alt_fb_idx = 2;
 #endif  // CONFIG_EXT_REFS
+#if CONFIG_AMVR
+  cpi->rate_index = 0;
+  cpi->rate_size = 0;
+  cpi->cur_poc = -1;
+#endif
 }
 
 static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
@@ -993,7 +1141,7 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
 
   cm->width = oxcf->width;
   cm->height = oxcf->height;
-  av1_alloc_compressor_data(cpi);
+  alloc_compressor_data(cpi);
 
   // Single thread case: use counts in common.
   cpi->td.counts = &cm->counts;
@@ -1004,6 +1152,10 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
   cpi->static_mb_pct = 0;
   cpi->ref_frame_flags = 0;
 
+  // Reset resize pending flags
+  cpi->resize_pending_width = 0;
+  cpi->resize_pending_height = 0;
+
   init_buffer_indices(cpi);
 }
 
@@ -1212,9 +1364,22 @@ MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d)
 MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8)
 MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg)
 MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d)
-#endif
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
+#if CONFIG_EXT_PARTITION
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x128x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x32x4d)
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
-#if CONFIG_EXT_INTER
 #define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
   cpi->fn_ptr[BT].msdf = MCSDF;       \
   cpi->fn_ptr[BT].msvf = MCSVF;
@@ -1268,8 +1433,13 @@ MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32)
 MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8)
-#endif
-#endif  // CONFIG_EXT_INTER
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16)
+#if CONFIG_EXT_PARTITION
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x128)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x32)
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
 #if CONFIG_MOTION_VAR
 #define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
@@ -1318,7 +1488,13 @@ MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32)
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8)
-#endif
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16)
+#if CONFIG_EXT_PARTITION
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x32)
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_EXT_PARTITION_TYPES
 #endif  // CONFIG_MOTION_VAR
 
 static void highbd_set_var_fns(AV1_COMP *const cpi) {
@@ -1327,6 +1503,32 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
     switch (cm->bit_depth) {
       case AOM_BITS_8:
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION
+        HIGHBD_BFP(BLOCK_128X32, aom_highbd_sad128x32_bits8,
+                   aom_highbd_sad128x32_avg_bits8, aom_highbd_8_variance128x32,
+                   aom_highbd_8_sub_pixel_variance128x32,
+                   aom_highbd_8_sub_pixel_avg_variance128x32, NULL, NULL,
+                   aom_highbd_sad128x32x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_32X128, aom_highbd_sad32x128_bits8,
+                   aom_highbd_sad32x128_avg_bits8, aom_highbd_8_variance32x128,
+                   aom_highbd_8_sub_pixel_variance32x128,
+                   aom_highbd_8_sub_pixel_avg_variance32x128, NULL, NULL,
+                   aom_highbd_sad32x128x4d_bits8)
+#endif  // CONFIG_EXT_PARTITION
+
+        HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits8,
+                   aom_highbd_sad64x16_avg_bits8, aom_highbd_8_variance64x16,
+                   aom_highbd_8_sub_pixel_variance64x16,
+                   aom_highbd_8_sub_pixel_avg_variance64x16, NULL, NULL,
+                   aom_highbd_sad64x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits8,
+                   aom_highbd_sad16x64_avg_bits8, aom_highbd_8_variance16x64,
+                   aom_highbd_8_sub_pixel_variance16x64,
+                   aom_highbd_8_sub_pixel_avg_variance16x64, NULL, NULL,
+                   aom_highbd_sad16x64x4d_bits8)
+
         HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits8,
                    aom_highbd_sad32x8_avg_bits8, aom_highbd_8_variance32x8,
                    aom_highbd_8_sub_pixel_variance32x8,
@@ -1464,7 +1666,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
                    aom_highbd_sad64x128x4d_bits8)
 #endif  // CONFIG_EXT_PARTITION
 
-#if CONFIG_EXT_INTER
 #if CONFIG_EXT_PARTITION
         HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8,
                     aom_highbd_8_masked_sub_pixel_variance128x128)
@@ -1500,6 +1701,20 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
         HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8,
                     aom_highbd_8_masked_sub_pixel_variance4x4)
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION
+        HIGHBD_MBFP(BLOCK_128X32, aom_highbd_masked_sad128x32_bits8,
+                    aom_highbd_8_masked_sub_pixel_variance128x32)
+
+        HIGHBD_MBFP(BLOCK_32X128, aom_highbd_masked_sad32x128_bits8,
+                    aom_highbd_8_masked_sub_pixel_variance32x128)
+#endif  // CONFIG_EXT_PARTITION
+
+        HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits8,
+                    aom_highbd_8_masked_sub_pixel_variance64x16)
+
+        HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits8,
+                    aom_highbd_8_masked_sub_pixel_variance16x64)
+
         HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits8,
                     aom_highbd_8_masked_sub_pixel_variance32x8)
 
@@ -1512,7 +1727,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
         HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits8,
                     aom_highbd_8_masked_sub_pixel_variance4x16)
 #endif
-#endif  // CONFIG_EXT_INTER
 #if CONFIG_MOTION_VAR
 #if CONFIG_EXT_PARTITION
         HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits8,
@@ -1565,6 +1779,24 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
                     aom_highbd_obmc_variance4x4,
                     aom_highbd_obmc_sub_pixel_variance4x4)
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION
+        HIGHBD_OBFP(BLOCK_128X32, aom_highbd_obmc_sad128x32_bits8,
+                    aom_highbd_obmc_variance128x32,
+                    aom_highbd_obmc_sub_pixel_variance128x32)
+
+        HIGHBD_OBFP(BLOCK_32X128, aom_highbd_obmc_sad32x128_bits8,
+                    aom_highbd_obmc_variance32x128,
+                    aom_highbd_obmc_sub_pixel_variance32x128)
+#endif  // CONFIG_EXT_PARTITION
+
+        HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits8,
+                    aom_highbd_obmc_variance64x16,
+                    aom_highbd_obmc_sub_pixel_variance64x16)
+
+        HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits8,
+                    aom_highbd_obmc_variance16x64,
+                    aom_highbd_obmc_sub_pixel_variance16x64)
+
         HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits8,
                     aom_highbd_obmc_variance32x8,
                     aom_highbd_obmc_sub_pixel_variance32x8)
@@ -1586,6 +1818,34 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
 
       case AOM_BITS_10:
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION
+        HIGHBD_BFP(BLOCK_128X32, aom_highbd_sad128x32_bits10,
+                   aom_highbd_sad128x32_avg_bits10,
+                   aom_highbd_10_variance128x32,
+                   aom_highbd_10_sub_pixel_variance128x32,
+                   aom_highbd_10_sub_pixel_avg_variance128x32, NULL, NULL,
+                   aom_highbd_sad128x32x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_32X128, aom_highbd_sad32x128_bits10,
+                   aom_highbd_sad32x128_avg_bits10,
+                   aom_highbd_10_variance32x128,
+                   aom_highbd_10_sub_pixel_variance32x128,
+                   aom_highbd_10_sub_pixel_avg_variance32x128, NULL, NULL,
+                   aom_highbd_sad32x128x4d_bits10)
+#endif  // CONFIG_EXT_PARTITION
+
+        HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits10,
+                   aom_highbd_sad64x16_avg_bits10, aom_highbd_10_variance64x16,
+                   aom_highbd_10_sub_pixel_variance64x16,
+                   aom_highbd_10_sub_pixel_avg_variance64x16, NULL, NULL,
+                   aom_highbd_sad64x16x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits10,
+                   aom_highbd_sad16x64_avg_bits10, aom_highbd_10_variance16x64,
+                   aom_highbd_10_sub_pixel_variance16x64,
+                   aom_highbd_10_sub_pixel_avg_variance16x64, NULL, NULL,
+                   aom_highbd_sad16x64x4d_bits10)
+
         HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits10,
                    aom_highbd_sad32x8_avg_bits10, aom_highbd_10_variance32x8,
                    aom_highbd_10_sub_pixel_variance32x8,
@@ -1727,7 +1987,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
                    aom_highbd_sad64x128x4d_bits10)
 #endif  // CONFIG_EXT_PARTITION
 
-#if CONFIG_EXT_INTER
 #if CONFIG_EXT_PARTITION
         HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10,
                     aom_highbd_10_masked_sub_pixel_variance128x128)
@@ -1763,6 +2022,20 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
         HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10,
                     aom_highbd_10_masked_sub_pixel_variance4x4)
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION
+        HIGHBD_MBFP(BLOCK_128X32, aom_highbd_masked_sad128x32_bits10,
+                    aom_highbd_10_masked_sub_pixel_variance128x32)
+
+        HIGHBD_MBFP(BLOCK_32X128, aom_highbd_masked_sad32x128_bits10,
+                    aom_highbd_10_masked_sub_pixel_variance32x128)
+#endif  // CONFIG_EXT_PARTITION
+
+        HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits10,
+                    aom_highbd_10_masked_sub_pixel_variance64x16)
+
+        HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits10,
+                    aom_highbd_10_masked_sub_pixel_variance16x64)
+
         HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits10,
                     aom_highbd_10_masked_sub_pixel_variance32x8)
 
@@ -1775,7 +2048,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
         HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits10,
                     aom_highbd_10_masked_sub_pixel_variance4x16)
 #endif
-#endif  // CONFIG_EXT_INTER
 #if CONFIG_MOTION_VAR
 #if CONFIG_EXT_PARTITION
         HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits10,
@@ -1828,6 +2100,24 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
                     aom_highbd_10_obmc_variance4x4,
                     aom_highbd_10_obmc_sub_pixel_variance4x4)
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION
+        HIGHBD_OBFP(BLOCK_128X32, aom_highbd_obmc_sad128x32_bits10,
+                    aom_highbd_10_obmc_variance128x32,
+                    aom_highbd_10_obmc_sub_pixel_variance128x32)
+
+        HIGHBD_OBFP(BLOCK_32X128, aom_highbd_obmc_sad32x128_bits10,
+                    aom_highbd_10_obmc_variance32x128,
+                    aom_highbd_10_obmc_sub_pixel_variance32x128)
+#endif  // CONFIG_EXT_PARTITION
+
+        HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits10,
+                    aom_highbd_10_obmc_variance64x16,
+                    aom_highbd_10_obmc_sub_pixel_variance64x16)
+
+        HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits10,
+                    aom_highbd_10_obmc_variance16x64,
+                    aom_highbd_10_obmc_sub_pixel_variance16x64)
+
         HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits10,
                     aom_highbd_10_obmc_variance32x8,
                     aom_highbd_10_obmc_sub_pixel_variance32x8)
@@ -1849,6 +2139,34 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
 
       case AOM_BITS_12:
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION
+        HIGHBD_BFP(BLOCK_128X32, aom_highbd_sad128x32_bits12,
+                   aom_highbd_sad128x32_avg_bits12,
+                   aom_highbd_12_variance128x32,
+                   aom_highbd_12_sub_pixel_variance128x32,
+                   aom_highbd_12_sub_pixel_avg_variance128x32, NULL, NULL,
+                   aom_highbd_sad128x32x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_32X128, aom_highbd_sad32x128_bits12,
+                   aom_highbd_sad32x128_avg_bits12,
+                   aom_highbd_12_variance32x128,
+                   aom_highbd_12_sub_pixel_variance32x128,
+                   aom_highbd_12_sub_pixel_avg_variance32x128, NULL, NULL,
+                   aom_highbd_sad32x128x4d_bits12)
+#endif  // CONFIG_EXT_PARTITION
+
+        HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits12,
+                   aom_highbd_sad64x16_avg_bits12, aom_highbd_12_variance64x16,
+                   aom_highbd_12_sub_pixel_variance64x16,
+                   aom_highbd_12_sub_pixel_avg_variance64x16, NULL, NULL,
+                   aom_highbd_sad64x16x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits12,
+                   aom_highbd_sad16x64_avg_bits12, aom_highbd_12_variance16x64,
+                   aom_highbd_12_sub_pixel_variance16x64,
+                   aom_highbd_12_sub_pixel_avg_variance16x64, NULL, NULL,
+                   aom_highbd_sad16x64x4d_bits12)
+
         HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits12,
                    aom_highbd_sad32x8_avg_bits12, aom_highbd_12_variance32x8,
                    aom_highbd_12_sub_pixel_variance32x8,
@@ -1990,7 +2308,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
                    aom_highbd_sad64x128x4d_bits12)
 #endif  // CONFIG_EXT_PARTITION
 
-#if CONFIG_EXT_INTER
 #if CONFIG_EXT_PARTITION
         HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12,
                     aom_highbd_12_masked_sub_pixel_variance128x128)
@@ -2026,6 +2343,20 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
         HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12,
                     aom_highbd_12_masked_sub_pixel_variance4x4)
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION
+        HIGHBD_MBFP(BLOCK_128X32, aom_highbd_masked_sad128x32_bits12,
+                    aom_highbd_12_masked_sub_pixel_variance128x32)
+
+        HIGHBD_MBFP(BLOCK_32X128, aom_highbd_masked_sad32x128_bits12,
+                    aom_highbd_12_masked_sub_pixel_variance32x128)
+#endif  // CONFIG_EXT_PARTITION
+
+        HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits12,
+                    aom_highbd_12_masked_sub_pixel_variance64x16)
+
+        HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits12,
+                    aom_highbd_12_masked_sub_pixel_variance16x64)
+
         HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits12,
                     aom_highbd_12_masked_sub_pixel_variance32x8)
 
@@ -2038,7 +2369,6 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
         HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits12,
                     aom_highbd_12_masked_sub_pixel_variance4x16)
 #endif
-#endif  // CONFIG_EXT_INTER
 
 #if CONFIG_MOTION_VAR
 #if CONFIG_EXT_PARTITION
@@ -2092,6 +2422,24 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
                     aom_highbd_12_obmc_variance4x4,
                     aom_highbd_12_obmc_sub_pixel_variance4x4)
 #if CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_PARTITION
+        HIGHBD_OBFP(BLOCK_128X32, aom_highbd_obmc_sad128x32_bits12,
+                    aom_highbd_12_obmc_variance128x32,
+                    aom_highbd_12_obmc_sub_pixel_variance128x32)
+
+        HIGHBD_OBFP(BLOCK_32X128, aom_highbd_obmc_sad32x128_bits12,
+                    aom_highbd_12_obmc_variance32x128,
+                    aom_highbd_12_obmc_sub_pixel_variance32x128)
+#endif  // CONFIG_EXT_PARTITION
+
+        HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits12,
+                    aom_highbd_12_obmc_variance64x16,
+                    aom_highbd_12_obmc_sub_pixel_variance64x16)
+
+        HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits12,
+                    aom_highbd_12_obmc_variance16x64,
+                    aom_highbd_12_obmc_sub_pixel_variance16x64)
+
         HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits12,
                     aom_highbd_12_obmc_variance32x8,
                     aom_highbd_12_obmc_sub_pixel_variance32x8)
@@ -2139,7 +2487,6 @@ static void realloc_segmentation_maps(AV1_COMP *cpi) {
                   aom_calloc(cm->mi_rows * cm->mi_cols, 1));
 }
 
-#if CONFIG_EXT_INTER
 void set_compound_tools(AV1_COMMON *cm) {
   (void)cm;
 #if CONFIG_INTERINTRA
@@ -2149,7 +2496,6 @@ void set_compound_tools(AV1_COMMON *cm) {
   cm->allow_masked_compound = 1;
 #endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
 }
-#endif  // CONFIG_EXT_INTER
 
 void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   AV1_COMMON *const cm = &cpi->common;
@@ -2186,25 +2532,28 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   cpi->refresh_golden_frame = 0;
 #if CONFIG_EXT_REFS
   cpi->refresh_bwd_ref_frame = 0;
+  cpi->refresh_alt2_ref_frame = 0;
 #endif  // CONFIG_EXT_REFS
 
   cm->refresh_frame_context =
       (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode)
           ? REFRESH_FRAME_CONTEXT_FORWARD
           : REFRESH_FRAME_CONTEXT_BACKWARD;
+#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
   cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
+#endif
 
-#if CONFIG_PALETTE
   if (x->palette_buffer == NULL) {
     CHECK_MEM_ERROR(cm, x->palette_buffer,
                     aom_memalign(16, sizeof(*x->palette_buffer)));
   }
-#endif  // CONFIG_PALETTE
-#if CONFIG_EXT_INTER
   set_compound_tools(cm);
-#endif  // CONFIG_EXT_INTER
   av1_reset_segment_features(cm);
-  av1_set_high_precision_mv(cpi, 0);
+#if CONFIG_AMVR
+  set_high_precision_mv(cpi, 0, 0);
+#else
+  set_high_precision_mv(cpi, 0);
+#endif
 
   set_rc_buffer_sizes(rc, &cpi->oxcf);
 
@@ -2235,7 +2584,8 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
   if (cpi->initial_width) {
     if (cm->width > cpi->initial_width || cm->height > cpi->initial_height) {
       av1_free_context_buffers(cm);
-      av1_alloc_compressor_data(cpi);
+      av1_free_pc_tree(&cpi->td);
+      alloc_compressor_data(cpi);
       realloc_segmentation_maps(cpi);
       cpi->initial_width = cpi->initial_height = 0;
     }
@@ -2265,15 +2615,12 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
 #if CONFIG_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
 #endif
-
 #if CONFIG_ANS && ANS_MAX_SYMBOLS
   cpi->common.ans_window_size_log2 = cpi->oxcf.ans_window_size_log2;
-  if (cpi->buf_ans.size != (1 << cpi->common.ans_window_size_log2)) {
-    aom_buf_ans_free(&cpi->buf_ans);
-    aom_buf_ans_alloc(&cpi->buf_ans, &cpi->common.error,
-                      1 << cpi->common.ans_window_size_log2);
-  }
 #endif  // CONFIG_ANS && ANS_MAX_SYMBOLS
+#if CONFIG_AMVR
+  cm->seq_mv_precision_level = 2;
+#endif
 }
 
 AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
@@ -2293,9 +2640,13 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
   }
 
   cm->error.setjmp = 1;
-  cm->alloc_mi = av1_enc_alloc_mi;
-  cm->free_mi = av1_enc_free_mi;
-  cm->setup_mi = av1_enc_setup_mi;
+  cm->alloc_mi = enc_alloc_mi;
+  cm->free_mi = enc_free_mi;
+  cm->setup_mi = enc_setup_mi;
+
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+  get_default_ncobmc_kernels(cm);
+#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
 
   CHECK_MEM_ERROR(cm, cm->fc,
                   (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
@@ -2467,12 +2818,14 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
 #endif
   CHECK_MEM_ERROR(
       cm, cpi->td.mb.above_pred_buf,
-      (uint8_t *)aom_memalign(16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
-                                      sizeof(*cpi->td.mb.above_pred_buf)));
+      (uint8_t *)aom_memalign(16,
+                              buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+                                  sizeof(*cpi->td.mb.above_pred_buf)));
   CHECK_MEM_ERROR(
       cm, cpi->td.mb.left_pred_buf,
-      (uint8_t *)aom_memalign(16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
-                                      sizeof(*cpi->td.mb.left_pred_buf)));
+      (uint8_t *)aom_memalign(16,
+                              buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+                                  sizeof(*cpi->td.mb.left_pred_buf)));
 
   CHECK_MEM_ERROR(cm, cpi->td.mb.wsrc_buf,
                   (int32_t *)aom_memalign(
@@ -2513,7 +2866,25 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
   BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8,
       aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8, NULL, NULL,
       aom_sad32x8x4d)
-#endif
+
+  BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64,
+      aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64, NULL, NULL,
+      aom_sad16x64x4d)
+
+  BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16,
+      aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16, NULL, NULL,
+      aom_sad64x16x4d)
+
+#if CONFIG_EXT_PARTITION
+  BFP(BLOCK_32X128, aom_sad32x128, aom_sad32x128_avg, aom_variance32x128,
+      aom_sub_pixel_variance32x128, aom_sub_pixel_avg_variance32x128, NULL,
+      NULL, aom_sad32x128x4d)
+
+  BFP(BLOCK_128X32, aom_sad128x32, aom_sad128x32_avg, aom_variance128x32,
+      aom_sub_pixel_variance128x32, aom_sub_pixel_avg_variance128x32, NULL,
+      NULL, aom_sad128x32x4d)
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
 #if CONFIG_EXT_PARTITION
   BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
@@ -2640,10 +3011,23 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
 
   OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8,
        aom_obmc_sub_pixel_variance32x8)
-#endif
+
+  OBFP(BLOCK_16X64, aom_obmc_sad16x64, aom_obmc_variance16x64,
+       aom_obmc_sub_pixel_variance16x64)
+
+  OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16,
+       aom_obmc_sub_pixel_variance64x16)
+
+#if CONFIG_EXT_PARTITION
+  OBFP(BLOCK_32X128, aom_obmc_sad32x128, aom_obmc_variance32x128,
+       aom_obmc_sub_pixel_variance32x128)
+
+  OBFP(BLOCK_128X32, aom_obmc_sad128x32, aom_obmc_variance128x32,
+       aom_obmc_sub_pixel_variance128x32)
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_EXT_PARTITION_TYPES
 #endif  // CONFIG_MOTION_VAR
 
-#if CONFIG_EXT_INTER
 #define MBFP(BT, MCSDF, MCSVF)  \
   cpi->fn_ptr[BT].msdf = MCSDF; \
   cpi->fn_ptr[BT].msvf = MCSVF;
@@ -2676,8 +3060,17 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
   MBFP(BLOCK_8X32, aom_masked_sad8x32, aom_masked_sub_pixel_variance8x32)
 
   MBFP(BLOCK_32X8, aom_masked_sad32x8, aom_masked_sub_pixel_variance32x8)
-#endif
-#endif  // CONFIG_EXT_INTER
+
+  MBFP(BLOCK_16X64, aom_masked_sad16x64, aom_masked_sub_pixel_variance16x64)
+
+  MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16)
+
+#if CONFIG_EXT_PARTITION
+  MBFP(BLOCK_32X128, aom_masked_sad32x128, aom_masked_sub_pixel_variance32x128)
+
+  MBFP(BLOCK_128X32, aom_masked_sad128x32, aom_masked_sub_pixel_variance128x32)
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
 #if CONFIG_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
@@ -2695,7 +3088,7 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
 
   av1_loop_filter_init(cm);
 #if CONFIG_FRAME_SUPERRES
-  cm->superres_scale_numerator = SCALE_DENOMINATOR;
+  cm->superres_scale_denominator = SCALE_NUMERATOR;
   cm->superres_upscaled_width = oxcf->width;
   cm->superres_upscaled_height = oxcf->height;
 #endif  // CONFIG_FRAME_SUPERRES
@@ -2815,9 +3208,7 @@ void av1_remove_compressor(AV1_COMP *cpi) {
 
     // Deallocate allocated thread data.
     if (t < cpi->num_workers - 1) {
-#if CONFIG_PALETTE
       aom_free(thread_data->td->palette_buffer);
-#endif  // CONFIG_PALETTE
 #if CONFIG_MOTION_VAR
       aom_free(thread_data->td->above_pred_buf);
       aom_free(thread_data->td->left_pred_buf);
@@ -2862,7 +3253,6 @@ void av1_remove_compressor(AV1_COMP *cpi) {
 #ifdef OUTPUT_YUV_REC
   fclose(yuv_rec_file);
 #endif
-
 #if 0
 
   if (keyfile)
@@ -2911,31 +3301,9 @@ void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags) {
   cpi->ext_refresh_frame_flags_pending = 1;
 }
 
-static YV12_BUFFER_CONFIG *get_av1_ref_frame_buffer(
-    AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag) {
-  MV_REFERENCE_FRAME ref_frame = NONE_FRAME;
-  if (ref_frame_flag == AOM_LAST_FLAG) ref_frame = LAST_FRAME;
-#if CONFIG_EXT_REFS
-  else if (ref_frame_flag == AOM_LAST2_FLAG)
-    ref_frame = LAST2_FRAME;
-  else if (ref_frame_flag == AOM_LAST3_FLAG)
-    ref_frame = LAST3_FRAME;
-#endif  // CONFIG_EXT_REFS
-  else if (ref_frame_flag == AOM_GOLD_FLAG)
-    ref_frame = GOLDEN_FRAME;
-#if CONFIG_EXT_REFS
-  else if (ref_frame_flag == AOM_BWD_FLAG)
-    ref_frame = BWDREF_FRAME;
-#endif  // CONFIG_EXT_REFS
-  else if (ref_frame_flag == AOM_ALT_FLAG)
-    ref_frame = ALTREF_FRAME;
-
-  return ref_frame == NONE_FRAME ? NULL : get_ref_frame_buffer(cpi, ref_frame);
-}
-
-int av1_copy_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag,
-                           YV12_BUFFER_CONFIG *sd) {
-  YV12_BUFFER_CONFIG *cfg = get_av1_ref_frame_buffer(cpi, ref_frame_flag);
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+  AV1_COMMON *const cm = &cpi->common;
+  YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx);
   if (cfg) {
     aom_yv12_copy_frame(cfg, sd);
     return 0;
@@ -2944,9 +3312,9 @@ int av1_copy_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag,
   }
 }
 
-int av1_set_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag,
-                          YV12_BUFFER_CONFIG *sd) {
-  YV12_BUFFER_CONFIG *cfg = get_av1_ref_frame_buffer(cpi, ref_frame_flag);
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+  AV1_COMMON *const cm = &cpi->common;
+  YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx);
   if (cfg) {
     aom_yv12_copy_frame(sd, cfg);
     return 0;
@@ -2994,7 +3362,53 @@ void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
 #endif
 
 #if CONFIG_EXT_REFS && !CONFIG_XIPHRC
+#if USE_GF16_MULTI_LAYER
+static void check_show_existing_frame_gf16(AV1_COMP *cpi) {
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  AV1_COMMON *const cm = &cpi->common;
+  const FRAME_UPDATE_TYPE next_frame_update_type =
+      gf_group->update_type[gf_group->index];
+
+  if (cm->show_existing_frame == 1) {
+    cm->show_existing_frame = 0;
+  } else if (cpi->rc.is_last_bipred_frame) {
+    cpi->rc.is_last_bipred_frame = 0;
+    cm->show_existing_frame = 1;
+    cpi->existing_fb_idx_to_show = cpi->bwd_fb_idx;
+  } else if (next_frame_update_type == OVERLAY_UPDATE ||
+             next_frame_update_type == INTNL_OVERLAY_UPDATE) {
+    // Check the temporal filtering status for the next OVERLAY frame
+    const int num_arfs_in_gf = cpi->num_extra_arfs + 1;
+    int which_arf = 0, arf_idx;
+    // Identify the index to the next overlay frame.
+    for (arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) {
+      if (gf_group->index == cpi->arf_pos_for_ovrly[arf_idx]) {
+        which_arf = arf_idx;
+        break;
+      }
+    }
+    assert(arf_idx < num_arfs_in_gf);
+    if (cpi->is_arf_filter_off[which_arf]) {
+      cm->show_existing_frame = 1;
+      cpi->rc.is_src_frame_alt_ref = 1;
+      cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE)
+                                         ? cpi->alt_fb_idx
+                                         : cpi->bwd_fb_idx;
+      cpi->is_arf_filter_off[which_arf] = 0;
+    }
+  }
+  cpi->rc.is_src_frame_ext_arf = 0;
+}
+#endif  // USE_GF16_MULTI_LAYER
+
 static void check_show_existing_frame(AV1_COMP *cpi) {
+#if USE_GF16_MULTI_LAYER
+  if (cpi->rc.baseline_gf_interval == 16) {
+    check_show_existing_frame_gf16(cpi);
+    return;
+  }
+#endif  // USE_GF16_MULTI_LAYER
+
   const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
   AV1_COMMON *const cm = &cpi->common;
   const FRAME_UPDATE_TYPE next_frame_update_type =
@@ -3004,9 +3418,9 @@ static void check_show_existing_frame(AV1_COMP *cpi) {
   if (cm->show_existing_frame == 1) {
     cm->show_existing_frame = 0;
   } else if (cpi->rc.is_last_bipred_frame) {
-    // NOTE(zoeliu): If the current frame is a last bi-predictive frame, it is
-    //               needed next to show the BWDREF_FRAME, which is pointed by
-    //               the last_fb_idxes[0] after reference frame buffer update
+    // NOTE: If the current frame is a last bi-predictive frame, it is
+    //       needed next to show the BWDREF_FRAME, which is pointed by
+    //       the last_fb_idxes[0] after reference frame buffer update
     cpi->rc.is_last_bipred_frame = 0;
     cm->show_existing_frame = 1;
     cpi->existing_fb_idx_to_show = cpi->lst_fb_idxes[0];
@@ -3017,7 +3431,9 @@ static void check_show_existing_frame(AV1_COMP *cpi) {
     // in av1_rc_get_second_pass_params(cpi)
     cm->show_existing_frame = 1;
     cpi->rc.is_src_frame_alt_ref = 1;
-    cpi->existing_fb_idx_to_show = cpi->alt_fb_idx;
+    cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE)
+                                       ? cpi->alt_fb_idx
+                                       : cpi->alt2_fb_idx;
     cpi->is_arf_filter_off[which_arf] = 0;
   }
   cpi->rc.is_src_frame_ext_arf = 0;
@@ -3028,7 +3444,7 @@ static void check_show_existing_frame(AV1_COMP *cpi) {
 void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
   uint8_t *src = s->y_buffer;
   int h = cm->height;
-
+  if (yuv_rec_file == NULL) return;
 #if CONFIG_HIGHBITDEPTH
   if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
     uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
@@ -3095,7 +3511,8 @@ static int recode_loop_test_global_motion(AV1_COMP *cpi) {
     if (cm->global_motion[i].wmtype != IDENTITY &&
         rdc->global_motion_used[i] * GM_RECODE_LOOP_NUM4X4_FACTOR <
             cpi->gmparams_cost[i]) {
-      set_default_warp_params(&cm->global_motion[i]);
+      cm->global_motion[i] = default_warp_params;
+      assert(cm->global_motion[i].wmtype == IDENTITY);
       cpi->gmparams_cost[i] = 0;
       recode = 1;
       recode |= (rdc->global_motion_used[i] > 0);
@@ -3242,14 +3659,69 @@ static void enc_check_valid_ref_frames(AV1_COMP *const cpi) {
 }
 #endif  // CONFIG_VAR_REFS
 
-void av1_update_reference_frames(AV1_COMP *cpi) {
+#if CONFIG_EXT_REFS
+#if USE_GF16_MULTI_LAYER
+static void update_reference_frames_gf16(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   BufferPool *const pool = cm->buffer_pool;
 
+  if (cm->frame_type == KEY_FRAME) {
+    for (int ref_frame = 0; ref_frame < LAST_REF_FRAMES; ++ref_frame) {
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->lst_fb_idxes[ref_frame]],
+                 cm->new_fb_idx);
+    }
+    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+               cm->new_fb_idx);
+    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx],
+               cm->new_fb_idx);
+    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt2_fb_idx],
+               cm->new_fb_idx);
+    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx],
+               cm->new_fb_idx);
+  } else {
+    if (cpi->refresh_last_frame || cpi->refresh_golden_frame ||
+        cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame ||
+        cpi->refresh_alt_ref_frame) {
+      assert(cpi->refresh_fb_idx >= 0 && cpi->refresh_fb_idx < REF_FRAMES);
+      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->refresh_fb_idx],
+                 cm->new_fb_idx);
+    }
+
+    // TODO(zoeliu): To handle cpi->interp_filter_selected[].
+
+    // For GF of 16, an additional ref frame index mapping needs to be handled
+    // if this is the last frame to encode in the current GF group.
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    if (gf_group->update_type[gf_group->index + 1] == OVERLAY_UPDATE)
+      av1_ref_frame_map_idx_updates(cpi, gf_group->index + 1);
+  }
+
+#if DUMP_REF_FRAME_IMAGES == 1
+  // Dump out all reference frame images.
+  dump_ref_frame_images(cpi);
+#endif  // DUMP_REF_FRAME_IMAGES
+}
+#endif  // USE_GF16_MULTI_LAYER
+#endif  // CONFIG_EXT_REFS
+
+static void update_reference_frames(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+
   // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
   //       for the purpose to verify no mismatch between encoder and decoder.
   if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx;
 
+#if CONFIG_EXT_REFS
+#if USE_GF16_MULTI_LAYER
+  if (cpi->rc.baseline_gf_interval == 16) {
+    update_reference_frames_gf16(cpi);
+    return;
+  }
+#endif  // USE_GF16_MULTI_LAYER
+#endif  // CONFIG_EXT_REFS
+
+  BufferPool *const pool = cm->buffer_pool;
   // At this point the new frame has been encoded.
   // If any buffer copy / swapping is signaled it should be done here.
   if (cm->frame_type == KEY_FRAME) {
@@ -3258,6 +3730,8 @@ void av1_update_reference_frames(AV1_COMP *cpi) {
 #if CONFIG_EXT_REFS
     ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx],
                cm->new_fb_idx);
+    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt2_fb_idx],
+               cm->new_fb_idx);
 #endif  // CONFIG_EXT_REFS
     ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx],
                cm->new_fb_idx);
@@ -3281,7 +3755,7 @@ void av1_update_reference_frames(AV1_COMP *cpi) {
 #if CONFIG_EXT_REFS
     // We need to modify the mapping accordingly
     cpi->arf_map[0] = cpi->alt_fb_idx;
-#endif
+#endif  // CONFIG_EXT_REFS
 // TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to
 // cpi->interp_filter_selected[GOLDEN_FRAME]?
 #if CONFIG_EXT_REFS
@@ -3290,36 +3764,32 @@ void av1_update_reference_frames(AV1_COMP *cpi) {
     // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME
     // by updating the virtual indices.
     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-    int which_arf = gf_group->arf_ref_idx[gf_group->index];
-    int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
+    const int which_arf = gf_group->arf_ref_idx[gf_group->index];
+    assert(gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE);
 
+    const int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
     shift_last_ref_frames(cpi);
-    cpi->lst_fb_idxes[0] = cpi->alt_fb_idx;
-    cpi->alt_fb_idx = tmp;
 
+    cpi->lst_fb_idxes[0] = cpi->alt2_fb_idx;
+    cpi->alt2_fb_idx = tmp;
     // We need to modify the mapping accordingly
-    cpi->arf_map[which_arf] = cpi->alt_fb_idx;
+    cpi->arf_map[which_arf] = cpi->alt2_fb_idx;
 
     memcpy(cpi->interp_filter_selected[LAST_FRAME],
-           cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
-           sizeof(cpi->interp_filter_selected[ALTREF_FRAME + which_arf]));
+           cpi->interp_filter_selected[ALTREF2_FRAME],
+           sizeof(cpi->interp_filter_selected[ALTREF2_FRAME]));
 #endif     // CONFIG_EXT_REFS
   } else { /* For non key/golden frames */
+    // === ALTREF_FRAME ===
     if (cpi->refresh_alt_ref_frame) {
       int arf_idx = cpi->alt_fb_idx;
       int which_arf = 0;
-#if CONFIG_EXT_REFS
-      if (cpi->oxcf.pass == 2) {
-        const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-        which_arf = gf_group->arf_update_idx[gf_group->index];
-        arf_idx = cpi->arf_map[which_arf];
-      }
-#else
+#if !CONFIG_EXT_REFS
       if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
         const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
         arf_idx = gf_group->arf_update_idx[gf_group->index];
       }
-#endif  // CONFIG_EXT_REFS
+#endif  // !CONFIG_EXT_REFS
       ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
 
       memcpy(cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
@@ -3327,6 +3797,7 @@ void av1_update_reference_frames(AV1_COMP *cpi) {
              sizeof(cpi->interp_filter_selected[0]));
     }
 
+    // === GOLDEN_FRAME ===
     if (cpi->refresh_golden_frame) {
       ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
                  cm->new_fb_idx);
@@ -3340,18 +3811,8 @@ void av1_update_reference_frames(AV1_COMP *cpi) {
     }
 
 #if CONFIG_EXT_REFS
+    // === BWDREF_FRAME ===
     if (cpi->refresh_bwd_ref_frame) {
-      if (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs) {
-        // We have swapped the virtual indices to allow bwd_ref_frame to use
-        // ALT0 as reference frame. We need to swap them back.
-        // NOTE: The ALT_REFs' are indexed reversely, and ALT0 refers to the
-        //       farthest ALT_REF from the first frame in the gf group.
-        int tmp = cpi->arf_map[0];
-        cpi->arf_map[0] = cpi->alt_fb_idx;
-        cpi->alt_fb_idx = cpi->bwd_fb_idx;
-        cpi->bwd_fb_idx = tmp;
-      }
-
       ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx],
                  cm->new_fb_idx);
 
@@ -3359,6 +3820,16 @@ void av1_update_reference_frames(AV1_COMP *cpi) {
              cpi->interp_filter_selected[0],
              sizeof(cpi->interp_filter_selected[0]));
     }
+
+    // === ALTREF2_FRAME ===
+    if (cpi->refresh_alt2_ref_frame) {
+      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt2_fb_idx],
+                 cm->new_fb_idx);
+
+      memcpy(cpi->interp_filter_selected[ALTREF2_FRAME],
+             cpi->interp_filter_selected[0],
+             sizeof(cpi->interp_filter_selected[0]));
+    }
 #endif  // CONFIG_EXT_REFS
   }
 
@@ -3396,15 +3867,6 @@ void av1_update_reference_frames(AV1_COMP *cpi) {
     // lst_fb_idxes[2], lst_fb_idxes[0], lst_fb_idxes[1]
     int ref_frame;
 
-    if (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs) {
-      // We have swapped the virtual indices to use ALT0 as BWD_REF
-      // and we need to swap them back.
-      int tmp = cpi->arf_map[0];
-      cpi->arf_map[0] = cpi->alt_fb_idx;
-      cpi->alt_fb_idx = cpi->bwd_fb_idx;
-      cpi->bwd_fb_idx = tmp;
-    }
-
     if (cm->frame_type == KEY_FRAME) {
       for (ref_frame = 0; ref_frame < LAST_REF_FRAMES; ++ref_frame) {
         ref_cnt_fb(pool->frame_bufs,
@@ -3448,7 +3910,7 @@ void av1_update_reference_frames(AV1_COMP *cpi) {
                sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
       }
     }
-#else
+#else   // !CONFIG_EXT_REFS
     ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
                cm->new_fb_idx);
     if (!cpi->rc.is_src_frame_alt_ref) {
@@ -3468,18 +3930,12 @@ void av1_update_reference_frames(AV1_COMP *cpi) {
 static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, int buffer_idx) {
   assert(buffer_idx != INVALID_IDX);
   RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
-  if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows ||
-      new_fb_ptr->mi_cols < cm->mi_cols) {
-    aom_free(new_fb_ptr->mvs);
-    CHECK_MEM_ERROR(cm, new_fb_ptr->mvs,
-                    (MV_REF *)aom_calloc(cm->mi_rows * cm->mi_cols,
-                                         sizeof(*new_fb_ptr->mvs)));
-    new_fb_ptr->mi_rows = cm->mi_rows;
-    new_fb_ptr->mi_cols = cm->mi_cols;
-  }
+  ensure_mv_buffer(new_fb_ptr, cm);
+  new_fb_ptr->width = cm->width;
+  new_fb_ptr->height = cm->height;
 }
 
-void av1_scale_references(AV1_COMP *cpi) {
+static void scale_references(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
   MV_REFERENCE_FRAME ref_frame;
   const AOM_REFFRAME ref_mask[INTER_REFS_PER_FRAME] = {
@@ -3491,6 +3947,7 @@ void av1_scale_references(AV1_COMP *cpi) {
     AOM_GOLD_FLAG,
 #if CONFIG_EXT_REFS
     AOM_BWD_FLAG,
+    AOM_ALT2_FLAG,
 #endif  // CONFIG_EXT_REFS
     AOM_ALT_FLAG
   };
@@ -3581,8 +4038,9 @@ static void release_scaled_references(AV1_COMP *cpi) {
     refresh[1] = refresh[2] = 0;
     refresh[3] = (cpi->refresh_golden_frame) ? 1 : 0;
     refresh[4] = (cpi->refresh_bwd_ref_frame) ? 1 : 0;
-    refresh[5] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
-#else
+    refresh[5] = (cpi->refresh_alt2_ref_frame) ? 1 : 0;
+    refresh[6] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
+#else   // !CONFIG_EXT_REFS
     refresh[1] = (cpi->refresh_golden_frame) ? 1 : 0;
     refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
 #endif  // CONFIG_EXT_REFS
@@ -3611,28 +4069,6 @@ static void release_scaled_references(AV1_COMP *cpi) {
   }
 }
 
-static void full_to_model_count(unsigned int *model_count,
-                                unsigned int *full_count) {
-  int n;
-  model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
-  model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
-  model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
-  for (n = THREE_TOKEN; n < EOB_TOKEN; ++n)
-    model_count[TWO_TOKEN] += full_count[n];
-  model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN];
-}
-
-void av1_full_to_model_counts(av1_coeff_count_model *model_count,
-                              av1_coeff_count *full_count) {
-  int i, j, k, l;
-
-  for (i = 0; i < PLANE_TYPES; ++i)
-    for (j = 0; j < REF_TYPES; ++j)
-      for (k = 0; k < COEF_BANDS; ++k)
-        for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l)
-          full_to_model_count(model_count[i][j][k][l], full_count[i][j][k][l]);
-}
-
 #if 0 && CONFIG_INTERNAL_STATS
 static void output_frame_level_debug_stats(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
@@ -3731,7 +4167,7 @@ static void set_size_independent_vars(AV1_COMP *cpi) {
 #if CONFIG_GLOBAL_MOTION
   int i;
   for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
-    set_default_warp_params(&cpi->common.global_motion[i]);
+    cpi->common.global_motion[i] = default_warp_params;
   }
   cpi->global_motion_search_done = 0;
 #endif  // CONFIG_GLOBAL_MOTION
@@ -3739,9 +4175,7 @@ static void set_size_independent_vars(AV1_COMP *cpi) {
   av1_set_rd_speed_thresholds(cpi);
   av1_set_rd_speed_thresholds_sub8x8(cpi);
   cpi->common.interp_filter = cpi->sf.default_interp_filter;
-#if CONFIG_EXT_INTER
   if (!frame_is_intra_only(&cpi->common)) set_compound_tools(&cpi->common);
-#endif  // CONFIG_EXT_INTER
 }
 
 static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
@@ -3759,11 +4193,17 @@ static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
       &cpi->od_rc, cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,
       frame_type, bottom_index, top_index);
 #else
-  *q = av1_rc_pick_q_and_bounds(cpi, bottom_index, top_index);
+  *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, bottom_index,
+                                top_index);
 #endif
 
   if (!frame_is_intra_only(cm)) {
-    av1_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH);
+#if CONFIG_AMVR
+    set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH,
+                          cpi->common.cur_frame_mv_precision_level);
+#else
+    set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH);
+#endif
   }
 
   // Configure experimental use of segmentation for enhanced coding of
@@ -3801,9 +4241,89 @@ static void set_restoration_tilesize(int width, int height, int sx, int sy,
   rst[0].restoration_tilesize = (RESTORATION_TILESIZE_MAX >> 1);
   rst[1].restoration_tilesize = rst[0].restoration_tilesize >> s;
   rst[2].restoration_tilesize = rst[1].restoration_tilesize;
+
+  rst[0].procunit_width = rst[0].procunit_height = RESTORATION_PROC_UNIT_SIZE;
+  rst[1].procunit_width = rst[2].procunit_width =
+      RESTORATION_PROC_UNIT_SIZE >> sx;
+  rst[1].procunit_height = rst[2].procunit_height =
+      RESTORATION_PROC_UNIT_SIZE >> sy;
 }
 #endif  // CONFIG_LOOP_RESTORATION
 
+static void init_ref_frame_bufs(AV1_COMMON *cm) {
+  int i;
+  BufferPool *const pool = cm->buffer_pool;
+  cm->new_fb_idx = INVALID_IDX;
+  for (i = 0; i < REF_FRAMES; ++i) {
+    cm->ref_frame_map[i] = INVALID_IDX;
+    pool->frame_bufs[i].ref_count = 0;
+  }
+#if CONFIG_HASH_ME
+  for (i = 0; i < FRAME_BUFFERS; ++i) {
+    av1_hash_table_init(&pool->frame_bufs[i].hash_table);
+  }
+#endif
+}
+
+static void check_initial_width(AV1_COMP *cpi,
+#if CONFIG_HIGHBITDEPTH
+                                int use_highbitdepth,
+#endif
+                                int subsampling_x, int subsampling_y) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  if (!cpi->initial_width ||
+#if CONFIG_HIGHBITDEPTH
+      cm->use_highbitdepth != use_highbitdepth ||
+#endif
+      cm->subsampling_x != subsampling_x ||
+      cm->subsampling_y != subsampling_y) {
+    cm->subsampling_x = subsampling_x;
+    cm->subsampling_y = subsampling_y;
+#if CONFIG_HIGHBITDEPTH
+    cm->use_highbitdepth = use_highbitdepth;
+#endif
+
+    alloc_raw_frame_buffers(cpi);
+    init_ref_frame_bufs(cm);
+    alloc_util_frame_buffers(cpi);
+
+    init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
+
+    cpi->initial_width = cm->width;
+    cpi->initial_height = cm->height;
+    cpi->initial_mbs = cm->MBs;
+  }
+}
+
+// Returns 1 if the assigned width or height was <= 0.
+static int set_size_literal(AV1_COMP *cpi, int width, int height) {
+  AV1_COMMON *cm = &cpi->common;
+#if CONFIG_HIGHBITDEPTH
+  check_initial_width(cpi, cm->use_highbitdepth, cm->subsampling_x,
+                      cm->subsampling_y);
+#else
+  check_initial_width(cpi, cm->subsampling_x, cm->subsampling_y);
+#endif  // CONFIG_HIGHBITDEPTH
+
+  if (width <= 0 || height <= 0) return 1;
+
+  cm->width = width;
+  cm->height = height;
+
+  if (cpi->initial_width && cpi->initial_height &&
+      (cm->width > cpi->initial_width || cm->height > cpi->initial_height)) {
+    av1_free_context_buffers(cm);
+    av1_free_pc_tree(&cpi->td);
+    alloc_compressor_data(cpi);
+    realloc_segmentation_maps(cpi);
+    cpi->initial_width = cpi->initial_height = 0;
+  }
+  update_frame_size(cpi);
+
+  return 0;
+}
+
 static void set_frame_size(AV1_COMP *cpi, int width, int height) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
@@ -3811,13 +4331,13 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) {
 
   if (width != cm->width || height != cm->height) {
     // There has been a change in the encoded frame size
-    av1_set_size_literal(cpi, width, height);
+    set_size_literal(cpi, width, height);
     set_mv_search_params(cpi);
   }
 
 #if !CONFIG_XIPHRC
   if (cpi->oxcf.pass == 2) {
-    av1_set_target_rate(cpi);
+    av1_set_target_rate(cpi, cm->width, cm->height);
   }
 #endif
 
@@ -3848,6 +4368,8 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) {
   for (int i = 0; i < MAX_MB_PLANE; ++i) {
     cpi->rst_search[i].restoration_tilesize =
         cm->rst_info[i].restoration_tilesize;
+    cpi->rst_search[i].procunit_width = cm->rst_info[i].procunit_width;
+    cpi->rst_search[i].procunit_height = cm->rst_info[i].procunit_height;
     av1_alloc_restoration_struct(cm, &cpi->rst_search[i],
 #if CONFIG_FRAME_SUPERRES
                                  cm->superres_upscaled_width,
@@ -3903,26 +4425,189 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) {
   set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
 }
 
-static void setup_frame_size(AV1_COMP *cpi) {
-  int encode_width = cpi->oxcf.width;
-  int encode_height = cpi->oxcf.height;
+static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
+  // Arbitrary initial seed for lcg_rand16(), used in RESIZE_RANDOM mode.
+  static unsigned int seed = 56789;
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  if (oxcf->pass == 1) return SCALE_NUMERATOR;
+  uint8_t new_denom = SCALE_NUMERATOR;
+
+  switch (oxcf->resize_mode) {
+    case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break;
+    case RESIZE_FIXED:
+      if (cpi->common.frame_type == KEY_FRAME)
+        new_denom = oxcf->resize_kf_scale_denominator;
+      else
+        new_denom = oxcf->resize_scale_denominator;
+      break;
+    case RESIZE_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
+    default: assert(0);
+  }
+  return new_denom;
+}
+
+#if CONFIG_FRAME_SUPERRES
+
+static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
+  // Arbitrary initial seed for lcg_rand16(), used in SUPERRES_RANDOM mode.
+  static unsigned int seed = 34567;
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  if (oxcf->pass == 1) return SCALE_NUMERATOR;
+  uint8_t new_denom = SCALE_NUMERATOR;
+  int bottom_index, top_index, q, qthresh;
+
+  switch (oxcf->superres_mode) {
+    case SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break;
+    case SUPERRES_FIXED:
+      if (cpi->common.frame_type == KEY_FRAME)
+        new_denom = oxcf->superres_kf_scale_denominator;
+      else
+        new_denom = oxcf->superres_scale_denominator;
+      break;
+    case SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
+    case SUPERRES_QTHRESH:
+      qthresh = (cpi->common.frame_type == KEY_FRAME ? oxcf->superres_kf_qthresh
+                                                     : oxcf->superres_qthresh);
+      av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height);
+      q = av1_rc_pick_q_and_bounds(cpi, cpi->oxcf.width, cpi->oxcf.height,
+                                   &bottom_index, &top_index);
+      if (q < qthresh) {
+        new_denom = SCALE_NUMERATOR;
+      } else {
+        new_denom = SCALE_NUMERATOR + 1 + ((q - qthresh) >> 3);
+        new_denom = AOMMIN(SCALE_NUMERATOR << 1, new_denom);
+        // Denominator starts one above SCALE_NUMERATOR and gains one per 8
+        // q levels over qthresh, capped at 2 * SCALE_NUMERATOR.
+      }
+      break;
+    default: assert(0);
+  }
+  return new_denom;
+}
+
+static int dimension_is_ok(int orig_dim, int resized_dim, int denom) {
+  return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2);
+}
+
+// TODO(now): Fix? (Height is not checked if CONFIG_HORZONLY_FRAME_SUPERRES.)
+static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) {
+  return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom) &&
+         (CONFIG_HORZONLY_FRAME_SUPERRES ||
+          dimension_is_ok(oheight, rsz->resize_height, rsz->superres_denom));
+}
+
+#define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y))
+
+static int validate_size_scales(RESIZE_MODE resize_mode,
+                                SUPERRES_MODE superres_mode, int owidth,
+                                int oheight, size_params_type *rsz) {
+  if (dimensions_are_ok(owidth, oheight, rsz)) {  // Nothing to do.
+    return 1;
+  }
+
+  // Calculate current resize scale.
+  int resize_denom =
+      AOMMAX(DIVIDE_AND_ROUND(owidth * SCALE_NUMERATOR, rsz->resize_width),
+             DIVIDE_AND_ROUND(oheight * SCALE_NUMERATOR, rsz->resize_height));
+
+  if (resize_mode != RESIZE_RANDOM && superres_mode == SUPERRES_RANDOM) {
+    // Alter superres scale as needed to enforce conformity.
+    rsz->superres_denom =
+        (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / resize_denom;
+    if (!dimensions_are_ok(owidth, oheight, rsz)) {
+      if (rsz->superres_denom > SCALE_NUMERATOR) --rsz->superres_denom;
+    }
+  } else if (resize_mode == RESIZE_RANDOM && superres_mode != SUPERRES_RANDOM) {
+    // Alter resize scale as needed to enforce conformity.
+    resize_denom =
+        (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / rsz->superres_denom;
+    rsz->resize_width = owidth;
+    rsz->resize_height = oheight;
+    av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+                              resize_denom);
+    if (!dimensions_are_ok(owidth, oheight, rsz)) {
+      if (resize_denom > SCALE_NUMERATOR) {
+        --resize_denom;
+        rsz->resize_width = owidth;
+        rsz->resize_height = oheight;
+        av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+                                  resize_denom);
+      }
+    }
+  } else if (resize_mode == RESIZE_RANDOM && superres_mode == SUPERRES_RANDOM) {
+    // Alter both resize and superres scales as needed to enforce conformity.
+    do {
+      if (resize_denom > rsz->superres_denom)
+        --resize_denom;
+      else
+        --rsz->superres_denom;
+      rsz->resize_width = owidth;
+      rsz->resize_height = oheight;
+      av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+                                resize_denom);
+    } while (!dimensions_are_ok(owidth, oheight, rsz) &&
+             (resize_denom > SCALE_NUMERATOR ||
+              rsz->superres_denom > SCALE_NUMERATOR));
+  } else {  // We are allowed to alter neither resize scale nor superres scale.
+    return 0;
+  }
+  return dimensions_are_ok(owidth, oheight, rsz);
+}
+#undef DIVIDE_AND_ROUND
+#endif  // CONFIG_FRAME_SUPERRES
+
+// Calculates next frame's resize/superres params; consumes any pending resize.
+size_params_type av1_calculate_next_size_params(AV1_COMP *cpi) {
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  size_params_type rsz = {
+    oxcf->width,
+    oxcf->height,
+#if CONFIG_FRAME_SUPERRES
+    SCALE_NUMERATOR
+#endif  // CONFIG_FRAME_SUPERRES
+  };
+  int resize_denom;
+  if (oxcf->pass == 1) return rsz;
+  if (cpi->resize_pending_width && cpi->resize_pending_height) {
+    rsz.resize_width = cpi->resize_pending_width;
+    rsz.resize_height = cpi->resize_pending_height;
+    cpi->resize_pending_width = cpi->resize_pending_height = 0;
+  } else {
+    resize_denom = calculate_next_resize_scale(cpi);
+    rsz.resize_width = cpi->oxcf.width;
+    rsz.resize_height = cpi->oxcf.height;
+    av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height,
+                              resize_denom);
+  }
+#if CONFIG_FRAME_SUPERRES
+  rsz.superres_denom = calculate_next_superres_scale(cpi);
+  if (!validate_size_scales(oxcf->resize_mode, oxcf->superres_mode, oxcf->width,
+                            oxcf->height, &rsz))
+    assert(0 && "Invalid scale parameters");
+#endif  // CONFIG_FRAME_SUPERRES
+  return rsz;
+}
 
-  uint8_t resize_num = av1_calculate_next_resize_scale(cpi);
-  av1_calculate_scaled_size(&encode_width, &encode_height, resize_num);
+static void setup_frame_size_from_params(AV1_COMP *cpi, size_params_type *rsz) {
+  int encode_width = rsz->resize_width;
+  int encode_height = rsz->resize_height;
 
 #if CONFIG_FRAME_SUPERRES
   AV1_COMMON *cm = &cpi->common;
   cm->superres_upscaled_width = encode_width;
   cm->superres_upscaled_height = encode_height;
-  cm->superres_scale_numerator =
-      av1_calculate_next_superres_scale(cpi, encode_width, encode_width);
-  av1_calculate_scaled_size(&encode_width, &encode_height,
-                            cm->superres_scale_numerator);
+  cm->superres_scale_denominator = rsz->superres_denom;
+  av1_calculate_scaled_superres_size(&encode_width, &encode_height,
+                                     rsz->superres_denom);
 #endif  // CONFIG_FRAME_SUPERRES
-
   set_frame_size(cpi, encode_width, encode_height);
 }
 
+static void setup_frame_size(AV1_COMP *cpi) {
+  size_params_type rsz = av1_calculate_next_size_params(cpi);
+  setup_frame_size_from_params(cpi, &rsz);
+}
+
 #if CONFIG_FRAME_SUPERRES
 static void superres_post_encode(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
@@ -3978,7 +4663,12 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
 #endif  // CONFIG_EXT_TILE
 
   if (no_loopfilter) {
+#if CONFIG_LOOPFILTER_LEVEL
+    lf->filter_level[0] = 0;
+    lf->filter_level[1] = 0;
+#else
     lf->filter_level = 0;
+#endif
   } else {
     struct aom_usec_timer timer;
 
@@ -3992,15 +4682,31 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
     cpi->time_pick_lpf += aom_usec_timer_elapsed(&timer);
   }
 
-  if (lf->filter_level > 0) {
+#if !CONFIG_LPF_SB
+#if CONFIG_LOOPFILTER_LEVEL
+  if (lf->filter_level[0] || lf->filter_level[1])
+#else
+  if (lf->filter_level > 0)
+#endif
+#endif  // CONFIG_LPF_SB
+  {
 #if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4
-#if CONFIG_UV_LVL
-    av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
-    av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level_u, 1, 0);
-    av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level_v, 2, 0);
+#if CONFIG_LPF_SB
+    av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0, 0,
+                          0);
+#else
+#if CONFIG_LOOPFILTER_LEVEL
+    av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level[0],
+                          lf->filter_level[1], 0, 0);
+    av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level_u,
+                          lf->filter_level_u, 1, 0);
+    av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level_v,
+                          lf->filter_level_v, 2, 0);
+
 #else
     av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
-#endif  // CONFIG_UV_LVL
+#endif  // CONFIG_LOOPFILTER_LEVEL
+#endif  // CONFIG_LPF_SB
 #else
     if (cpi->num_workers > 1)
       av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
@@ -4010,13 +4716,18 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
       av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
 #endif
   }
+
+#if CONFIG_STRIPED_LOOP_RESTORATION
+  av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm);
+#endif
+
 #if CONFIG_CDEF
   if (is_lossless_requested(&cpi->oxcf)) {
     cm->cdef_bits = 0;
     cm->cdef_strengths[0] = 0;
     cm->nb_cdef_strengths = 1;
   } else {
-    // Find cm->dering_level, cm->clpf_strength_u and cm->clpf_strength_v
+    // Find CDEF parameters
     av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd,
                     cpi->oxcf.speed > 0);
 
@@ -4030,6 +4741,7 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
 #endif  // CONFIG_FRAME_SUPERRES
 
 #if CONFIG_LOOP_RESTORATION
+  aom_extend_frame_borders(cm->frame_to_show);
   av1_pick_filter_restoration(cpi->source, cpi, cpi->sf.lpf_pick);
   if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
       cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
@@ -4049,7 +4761,9 @@ static void encode_without_recode_loop(AV1_COMP *cpi) {
   aom_clear_system_state();
 
   set_size_independent_vars(cpi);
+
   setup_frame_size(cpi);
+
   assert(cm->width == cpi->scaled_source.y_crop_width);
   assert(cm->height == cpi->scaled_source.y_crop_height);
 
@@ -4060,9 +4774,12 @@ static void encode_without_recode_loop(AV1_COMP *cpi) {
   if (cpi->unscaled_last_source != NULL)
     cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
                                              &cpi->scaled_last_source);
+#if CONFIG_HIGHBITDEPTH && CONFIG_GLOBAL_MOTION
+  cpi->source->buf_8bit_valid = 0;
+#endif
 
   if (frame_is_intra_only(cm) == 0) {
-    av1_scale_references(cpi);
+    scale_references(cpi);
   }
 
   av1_set_quantizer(cm, q);
@@ -4113,14 +4830,18 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
 
   set_size_independent_vars(cpi);
 
+#if CONFIG_HIGHBITDEPTH && CONFIG_GLOBAL_MOTION
+  cpi->source->buf_8bit_valid = 0;
+#endif
+
+  aom_clear_system_state();
+  setup_frame_size(cpi);
+  set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
   do {
     aom_clear_system_state();
 
-    setup_frame_size(cpi);
-
     if (loop_count == 0) {
-      set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
-
       // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
       set_mv_search_params(cpi);
 
@@ -4143,6 +4864,13 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
                                        &frame_over_shoot_limit);
     }
 
+#if CONFIG_GLOBAL_MOTION
+    // On recode, redo global motion search if source and frame sizes differ.
+    if (loop_count > 0 && cpi->source && cpi->global_motion_search_done)
+      if (cpi->source->y_crop_width != cm->width ||
+          cpi->source->y_crop_height != cm->height)
+        cpi->global_motion_search_done = 0;
+#endif  // CONFIG_GLOBAL_MOTION
     cpi->source =
         av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
     if (cpi->unscaled_last_source != NULL)
@@ -4153,9 +4881,8 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
       if (loop_count > 0) {
         release_scaled_references(cpi);
       }
-      av1_scale_references(cpi);
+      scale_references(cpi);
     }
-
     av1_set_quantizer(cm, q);
 
     if (loop_count == 0) setup_frame(cpi);
@@ -4170,7 +4897,13 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
           cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) {
         for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc;
       } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) {
+#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
+        if (cm->frame_refs[0].idx >= 0) {
+          cm->frame_contexts[cm->frame_refs[0].idx] = *cm->fc;
+        }
+#else
         cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+#endif
       }
     }
 #endif  // CONFIG_Q_ADAPT_PROBS
@@ -4184,6 +4917,7 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
     }
 
     // transform / motion compensation build reconstruction frame
+    save_coding_context(cpi);
     av1_encode_frame(cpi);
 
     // Update the skip mb flag probabilities based on the distribution
@@ -4196,8 +4930,7 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
     // accurate estimate of output frame size to determine if we need
     // to recode.
     if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
-      save_coding_context(cpi);
-
+      restore_coding_context(cpi);
       av1_pack_bitstream(cpi, dest, size);
 
       rc->projected_frame_size = (int)(*size) << 3;
@@ -4279,20 +5012,22 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
 
           if (undershoot_seen || loop_at_this_size > 1) {
             // Update rate_correction_factor unless
-            av1_rc_update_rate_correction_factors(cpi);
+            av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
 
             q = (q_high + q_low + 1) / 2;
           } else {
             // Update rate_correction_factor unless
-            av1_rc_update_rate_correction_factors(cpi);
+            av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
 
             q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                                  AOMMAX(q_high, top_index));
+                                  AOMMAX(q_high, top_index), cm->width,
+                                  cm->height);
 
             while (q < q_low && retries < 10) {
-              av1_rc_update_rate_correction_factors(cpi);
+              av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
               q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                                    AOMMAX(q_high, top_index));
+                                    AOMMAX(q_high, top_index), cm->width,
+                                    cm->height);
               retries++;
             }
           }
@@ -4303,12 +5038,12 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
           q_high = q > q_low ? q - 1 : q_low;
 
           if (overshoot_seen || loop_at_this_size > 1) {
-            av1_rc_update_rate_correction_factors(cpi);
+            av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
             q = (q_high + q_low) / 2;
           } else {
-            av1_rc_update_rate_correction_factors(cpi);
+            av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
             q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                                  top_index);
+                                  top_index, cm->width, cm->height);
             // Special case reset for qlow for constrained quality.
             // This should only trigger where there is very substantial
             // undershoot on a frame and the auto cq level is above
@@ -4318,9 +5053,9 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size,
             }
 
             while (q > q_high && retries < 10) {
-              av1_rc_update_rate_correction_factors(cpi);
+              av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
               q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                                    top_index);
+                                    top_index, cm->width, cm->height);
               retries++;
             }
           }
@@ -4369,13 +5104,13 @@ static int get_ref_frame_flags(const AV1_COMP *cpi) {
   const int last3_is_last =
       map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[0]];
   const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[0]];
-#if CONFIG_ONE_SIDED_COMPOUND
+#if CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS
   const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]];
   const int last3_is_last2 =
       map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]];
   const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]];
   const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]];
-#else  // !CONFIG_ONE_SIDED_COMPOUND
+#else   // !CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS
   const int bwd_is_last = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[0]];
   const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]];
 
@@ -4388,12 +5123,19 @@ static int get_ref_frame_flags(const AV1_COMP *cpi) {
   const int bwd_is_last3 = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[2]];
 
   const int bwd_is_gld = map[cpi->bwd_fb_idx] == map[cpi->gld_fb_idx];
+#endif  // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS
+
+  const int alt2_is_last = map[cpi->alt2_fb_idx] == map[cpi->lst_fb_idxes[0]];
+  const int alt2_is_last2 = map[cpi->alt2_fb_idx] == map[cpi->lst_fb_idxes[1]];
+  const int alt2_is_last3 = map[cpi->alt2_fb_idx] == map[cpi->lst_fb_idxes[2]];
+  const int alt2_is_gld = map[cpi->alt2_fb_idx] == map[cpi->gld_fb_idx];
+  const int alt2_is_bwd = map[cpi->alt2_fb_idx] == map[cpi->bwd_fb_idx];
 
-#endif  // CONFIG_ONE_SIDED_COMPOUND
   const int last2_is_alt = map[cpi->lst_fb_idxes[1]] == map[cpi->alt_fb_idx];
   const int last3_is_alt = map[cpi->lst_fb_idxes[2]] == map[cpi->alt_fb_idx];
   const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
   const int bwd_is_alt = map[cpi->bwd_fb_idx] == map[cpi->alt_fb_idx];
+  const int alt2_is_alt = map[cpi->alt2_fb_idx] == map[cpi->alt_fb_idx];
 #else   // !CONFIG_EXT_REFS
   const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
   const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
@@ -4402,13 +5144,6 @@ static int get_ref_frame_flags(const AV1_COMP *cpi) {
 
   int flags = AOM_REFFRAME_ALL;
 
-#if CONFIG_EXT_REFS
-  // Disable the use of BWDREF_FRAME for non-bipredictive frames.
-  if (!(cpi->rc.is_bipred_frame || cpi->rc.is_last_bipred_frame ||
-        (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs)))
-    flags &= ~AOM_BWD_FLAG;
-#endif  // CONFIG_EXT_REFS
-
   if (gld_is_last || gld_is_alt) flags &= ~AOM_GOLD_FLAG;
 
   if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG;
@@ -4422,15 +5157,21 @@ static int get_ref_frame_flags(const AV1_COMP *cpi) {
 
   if (gld_is_last2 || gld_is_last3) flags &= ~AOM_GOLD_FLAG;
 
-#if CONFIG_ONE_SIDED_COMPOUND  // Changes LL & HL bitstream
+#if CONFIG_ONE_SIDED_COMPOUND && \
+    !CONFIG_EXT_COMP_REFS  // Changes LL & HL bitstream
   /* Allow biprediction between two identical frames (e.g. bwd_is_last = 1) */
   if (bwd_is_alt && (flags & AOM_BWD_FLAG)) flags &= ~AOM_BWD_FLAG;
-#else
+#else   // !CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS
   if ((bwd_is_last || bwd_is_last2 || bwd_is_last3 || bwd_is_gld ||
        bwd_is_alt) &&
       (flags & AOM_BWD_FLAG))
     flags &= ~AOM_BWD_FLAG;
-#endif
+#endif  // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS
+
+  if ((alt2_is_last || alt2_is_last2 || alt2_is_last3 || alt2_is_gld ||
+       alt2_is_bwd || alt2_is_alt) &&
+      (flags & AOM_ALT2_FLAG))
+    flags &= ~AOM_ALT2_FLAG;
 #endif  // CONFIG_EXT_REFS
 
   return flags;
@@ -4453,6 +5194,7 @@ static void set_ext_overrides(AV1_COMP *cpi) {
   }
 }
 
+#if !CONFIG_FRAME_SIGN_BIAS
 static void set_arf_sign_bias(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   int arf_sign_bias;
@@ -4461,8 +5203,8 @@ static void set_arf_sign_bias(AV1_COMP *cpi) {
   // The arf_sign_bias will be one for internal ARFs'
   arf_sign_bias = cpi->rc.source_alt_ref_active &&
                   (!cpi->refresh_alt_ref_frame ||
-                   (gf_group->rf_level[gf_group->index] == GF_ARF_LOW));
-#else
+                   gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE);
+#else   // !CONFIG_EXT_REFS
   if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
     arf_sign_bias = cpi->rc.source_alt_ref_active &&
@@ -4477,8 +5219,11 @@ static void set_arf_sign_bias(AV1_COMP *cpi) {
   cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias;
 #if CONFIG_EXT_REFS
   cm->ref_frame_sign_bias[BWDREF_FRAME] = cm->ref_frame_sign_bias[ALTREF_FRAME];
+  cm->ref_frame_sign_bias[ALTREF2_FRAME] =
+      cm->ref_frame_sign_bias[ALTREF_FRAME];
 #endif  // CONFIG_EXT_REFS
 }
+#endif  // !CONFIG_FRAME_SIGN_BIAS
 
 static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
   InterpFilter ifilter;
@@ -4488,26 +5233,16 @@ static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
   int arf_idx = ALTREF_FRAME;
 
 #if CONFIG_EXT_REFS
-  // Get which arf used as ALTREF_FRAME
-  if (cpi->oxcf.pass == 2)
-    arf_idx += cpi->twopass.gf_group.arf_ref_idx[cpi->twopass.gf_group.index];
-#endif  // CONFIG_EXT_REFS
-
+  if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
+      cpi->refresh_alt2_ref_frame)
+#else   // !CONFIG_EXT_REFS
   if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame)
+#endif  // CONFIG_EXT_REFS
     return mask;
 
-#if CONFIG_EXT_REFS
-  for (ref = LAST_FRAME; ref < ALTREF_FRAME; ++ref)
-    for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter)
-      ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
-
-  for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter)
-    ref_total[ref] += cpi->interp_filter_selected[arf_idx][ifilter];
-#else
   for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
     for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter)
       ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
-#endif  // CONFIG_EXT_REFS
 
   for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) {
     if ((ref_total[LAST_FRAME] &&
@@ -4527,6 +5262,9 @@ static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
         (ref_total[BWDREF_FRAME] == 0 ||
          cpi->interp_filter_selected[BWDREF_FRAME][ifilter] * 50 <
              ref_total[BWDREF_FRAME]) &&
+        (ref_total[ALTREF2_FRAME] == 0 ||
+         cpi->interp_filter_selected[ALTREF2_FRAME][ifilter] * 50 <
+             ref_total[ALTREF2_FRAME]) &&
 #endif  // CONFIG_EXT_REFS
         (ref_total[ALTREF_FRAME] == 0 ||
          cpi->interp_filter_selected[arf_idx][ifilter] * 50 <
@@ -4574,6 +5312,19 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) {
       cpi->refresh_alt_ref_frame,
       cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index],
       recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height);
+#if 0
+  int ref_frame;
+  printf("get_ref_frame_map_idx: [");
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+    printf(" %d", get_ref_frame_map_idx(cpi, ref_frame));
+  printf(" ]\n");
+  printf("cm->new_fb_idx = %d\n", cm->new_fb_idx);
+  printf("cm->ref_frame_map = [");
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    printf(" %d", cm->ref_frame_map[ref_frame - LAST_FRAME]);
+  }
+  printf(" ]\n");
+#endif  // 0
 
   // --- Y ---
   for (h = 0; h < cm->height; ++h) {
@@ -4609,7 +5360,6 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
   AV1_COMMON *const cm = &cpi->common;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   struct segmentation *const seg = &cm->seg;
-  TX_SIZE t;
   FRAME_CONTEXT **tile_ctxs = aom_malloc(cm->tile_rows * cm->tile_cols *
                                          sizeof(&cpi->tile_data[0].tctx));
   aom_cdf_prob **cdf_ptrs =
@@ -4622,8 +5372,11 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
   set_ext_overrides(cpi);
   aom_clear_system_state();
 
+#if !CONFIG_FRAME_SIGN_BIAS
   // Set the arf sign bias for this frame.
   set_arf_sign_bias(cpi);
+#endif  // !CONFIG_FRAME_SIGN_BIAS
+
 #if CONFIG_TEMPMV_SIGNALING
   // frame type has been decided outside of this function call
   cm->cur_frame->intra_only = cm->frame_type == KEY_FRAME || cm->intra_only;
@@ -4654,12 +5407,14 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
     cpi->refresh_last_frame = 0;
     cpi->refresh_golden_frame = 0;
     cpi->refresh_bwd_ref_frame = 0;
+    cpi->refresh_alt2_ref_frame = 0;
     cpi->refresh_alt_ref_frame = 0;
 
     cpi->rc.is_bwd_ref_frame = 0;
     cpi->rc.is_last_bipred_frame = 0;
     cpi->rc.is_bipred_frame = 0;
 
+    restore_coding_context(cpi);
     // Build the bitstream
     av1_pack_bitstream(cpi, dest, size);
 
@@ -4672,7 +5427,16 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
 #endif  // DUMP_RECON_FRAMES
 
     // Update the LAST_FRAME in the reference frame buffer.
-    av1_update_reference_frames(cpi);
+    // NOTE:
+    // (1) For BWDREF_FRAME as the show_existing_frame, the reference frame
+    //     update has been done previously when handling the LAST_BIPRED_FRAME
+    //     right before BWDREF_FRAME (in the display order);
+    // (2) For INTNL_OVERLAY as the show_existing_frame, the reference frame
+    //     update will be done when the following is called, which will exchange
+    //     the virtual indexes between LAST_FRAME and ALTREF2_FRAME, so that
+    //     LAST3 will get retired, LAST2 becomes LAST3, LAST becomes LAST2, and
+    //     ALTREF2_FRAME will serve as the new LAST_FRAME.
+    update_reference_frames(cpi);
 
     // Update frame flags
     cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
@@ -4687,7 +5451,7 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
     // Since we allocate a spot for the OVERLAY frame in the gf group, we need
     // to do post-encoding update accordingly.
     if (cpi->rc.is_src_frame_alt_ref) {
-      av1_set_target_rate(cpi);
+      av1_set_target_rate(cpi, cm->width, cm->height);
 #if CONFIG_XIPHRC
       frame_type = cm->frame_type == INTER_FRAME ? OD_P_FRAME : OD_I_FRAME;
       drop_this_frame = od_enc_rc_update_state(
@@ -4728,6 +5492,7 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
 
     cm->error_resilient_mode = oxcf->error_resilient_mode;
 
+#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
     // By default, encoder assumes decoder can use prev_mi.
     if (cm->error_resilient_mode) {
       cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
@@ -4736,6 +5501,7 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
       // Only reset the current context.
       cm->reset_frame_context = RESET_FRAME_CONTEXT_CURRENT;
     }
+#endif
   }
   if (cpi->oxcf.mtu == 0) {
     cm->num_tg = cpi->oxcf.num_tile_groups;
@@ -4781,7 +5547,7 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
 #endif
 
 #if CONFIG_REFERENCE_BUFFER
-  {
+  if (cm->seq_params.frame_id_numbers_present_flag) {
     /* Non-normative definition of current_frame_id ("frame counter" with
     * wraparound) */
     const int frame_id_length = FRAME_ID_LENGTH_MINUS7 + 7;
@@ -4806,11 +5572,14 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
           (1 << frame_id_length);
     }
   }
-#endif
+#endif  // CONFIG_REFERENCE_BUFFER
 
 #if CONFIG_EXT_DELTA_Q
   cm->delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q;
   cm->delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF;
+#if CONFIG_LOOPFILTER_LEVEL
+  cm->delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
+#endif  // CONFIG_LOOPFILTER_LEVEL
 #endif
 
   if (cpi->sf.recode_loop == DISALLOW_RECODE) {
@@ -4819,6 +5588,9 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
     encode_with_recode_loop(cpi, size, dest);
   }
 
+  cm->last_tile_cols = cm->tile_cols;
+  cm->last_tile_rows = cm->tile_rows;
+
 #ifdef OUTPUT_YUV_SKINMAP
   if (cpi->common.current_video_frame > 1) {
     av1_compute_skin_map(cpi, yuv_skinmap_file);
@@ -4864,6 +5636,10 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
   // Pick the loop filter level for the frame.
   loopfilter_frame(cpi, cm);
 
+#ifdef OUTPUT_YUV_REC
+  aom_write_one_yuv_frame(cm, cm->frame_to_show);
+#endif
+
   // Build the bitstream
   av1_pack_bitstream(cpi, dest, size);
 
@@ -4874,7 +5650,7 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
   }
 
 #if CONFIG_REFERENCE_BUFFER
-  {
+  if (cm->seq_params.frame_id_numbers_present_flag) {
     int i;
     /* Update reference frame id values based on the value of refresh_mask */
     for (i = 0; i < REF_FRAMES; i++) {
@@ -4883,7 +5659,7 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
       }
     }
   }
-#endif
+#endif  // CONFIG_REFERENCE_BUFFER
 
 #if DUMP_RECON_FRAMES == 1
   // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
@@ -4896,11 +5672,8 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
     release_scaled_references(cpi);
   }
 
-  av1_update_reference_frames(cpi);
+  update_reference_frames(cpi);
 
-  for (t = 0; t < TX_SIZES; t++)
-    av1_full_to_model_counts(cpi->td.counts->coef[t],
-                             cpi->td.rd_counts.coef_counts[t]);
 #if CONFIG_ENTROPY_STATS
   av1_accumulate_frame_counts(&aggregate_fc, &cm->counts);
   assert(cm->frame_context_idx < FRAME_CONTEXTS);
@@ -4908,7 +5681,9 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
                               &cm->counts);
 #endif  // CONFIG_ENTROPY_STATS
   if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+#if CONFIG_LV_MAP
     av1_adapt_coef_probs(cm);
+#endif  // CONFIG_LV_MAP
     av1_adapt_intra_frame_probs(cm);
     make_update_tile_list_enc(cpi, cm->tile_rows, cm->tile_cols, tile_ctxs);
     av1_average_tile_coef_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs,
@@ -4997,7 +5772,7 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
 // TODO(zoeliu): We may only swamp mi and prev_mi for those frames that are
 // being used as reference.
 #endif  // CONFIG_EXT_REFS
-    av1_swap_mi_and_prev_mi(cm);
+    swap_mi_and_prev_mi(cm);
     // Don't increment frame counters if this was an altref buffer
     // update not a real frame
     ++cm->current_video_frame;
@@ -5017,6 +5792,7 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
 #if CONFIG_EXT_REFS
   }
 #endif  // CONFIG_EXT_REFS
+
   aom_free(tile_ctxs);
   aom_free(cdf_ptrs);
 }
@@ -5077,47 +5853,6 @@ static void Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
 }
 #endif
 
-static void init_ref_frame_bufs(AV1_COMMON *cm) {
-  int i;
-  BufferPool *const pool = cm->buffer_pool;
-  cm->new_fb_idx = INVALID_IDX;
-  for (i = 0; i < REF_FRAMES; ++i) {
-    cm->ref_frame_map[i] = INVALID_IDX;
-    pool->frame_bufs[i].ref_count = 0;
-  }
-}
-
-static void check_initial_width(AV1_COMP *cpi,
-#if CONFIG_HIGHBITDEPTH
-                                int use_highbitdepth,
-#endif
-                                int subsampling_x, int subsampling_y) {
-  AV1_COMMON *const cm = &cpi->common;
-
-  if (!cpi->initial_width ||
-#if CONFIG_HIGHBITDEPTH
-      cm->use_highbitdepth != use_highbitdepth ||
-#endif
-      cm->subsampling_x != subsampling_x ||
-      cm->subsampling_y != subsampling_y) {
-    cm->subsampling_x = subsampling_x;
-    cm->subsampling_y = subsampling_y;
-#if CONFIG_HIGHBITDEPTH
-    cm->use_highbitdepth = use_highbitdepth;
-#endif
-
-    alloc_raw_frame_buffers(cpi);
-    init_ref_frame_bufs(cm);
-    alloc_util_frame_buffers(cpi);
-
-    init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
-
-    cpi->initial_width = cm->width;
-    cpi->initial_height = cm->height;
-    cpi->initial_mbs = cm->MBs;
-  }
-}
-
 int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time) {
@@ -5169,7 +5904,7 @@ static int frame_is_reference(const AV1_COMP *cpi) {
   return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame ||
          cpi->refresh_golden_frame ||
 #if CONFIG_EXT_REFS
-         cpi->refresh_bwd_ref_frame ||
+         cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame ||
 #endif  // CONFIG_EXT_REFS
          cpi->refresh_alt_ref_frame || !cm->error_resilient_mode ||
          cm->lf.mode_ref_delta_update || cm->seg.update_map ||
@@ -5252,6 +5987,21 @@ static int get_brf_src_index(AV1_COMP *cpi) {
 
   return brf_src_index;
 }
+
+// Returns the arf_src_offset of the current GF group entry if it is an
+// INTNL_ARF_UPDATE (pass 2, altref on, num_extra_arfs != 0); 0 otherwise.
+static int get_arf2_src_index(AV1_COMP *cpi) {
+  int arf2_src_index = 0;
+  if (is_altref_enabled(cpi) && cpi->num_extra_arfs) {
+    if (cpi->oxcf.pass == 2) {
+      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+      if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
+        arf2_src_index = gf_group->arf_src_offset[gf_group->index];
+      }
+    }
+  }
+  return arf2_src_index;
+}
 #endif  // CONFIG_EXT_REFS
 
 static void check_src_altref(AV1_COMP *cpi,
@@ -5268,6 +6018,10 @@ static void check_src_altref(AV1_COMP *cpi,
         (gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE) ||
 #endif  // CONFIG_EXT_REFS
         (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
+#if CONFIG_EXT_REFS
+    rc->is_src_frame_ext_arf =
+        gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE;
+#endif  // CONFIG_EXT_REFS
   } else {
     rc->is_src_frame_alt_ref =
         cpi->alt_ref_source && (source == cpi->alt_ref_source);
@@ -5277,9 +6031,20 @@ static void check_src_altref(AV1_COMP *cpi,
     // Current frame is an ARF overlay frame.
     cpi->alt_ref_source = NULL;
 
-    // Don't refresh the last buffer for an ARF overlay frame. It will
-    // become the GF so preserve last as an alternative prediction option.
-    cpi->refresh_last_frame = 0;
+#if CONFIG_EXT_REFS
+    if (rc->is_src_frame_ext_arf && !cpi->common.show_existing_frame) {
+      // An INTNL_OVERLAY frame coded with show_existing_frame == 0 does need
+      // to refresh LAST_FRAME, i.e. LAST3 gets retired, LAST2 becomes LAST3,
+      // LAST becomes LAST2, and INTNL_OVERLAY becomes LAST.
+      cpi->refresh_last_frame = 1;
+    } else {
+#endif  // CONFIG_EXT_REFS
+      // Don't refresh the last buffer for an ARF overlay frame. It will
+      // become the GF so preserve last as an alternative prediction option.
+      cpi->refresh_last_frame = 0;
+#if CONFIG_EXT_REFS
+    }
+#endif  // CONFIG_EXT_REFS
   }
 }
 
@@ -5402,6 +6167,123 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
 }
 #endif  // CONFIG_INTERNAL_STATS
 
+#if CONFIG_AMVR
+static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture,
+                         const YV12_BUFFER_CONFIG *last_picture,
+                         hash_table *last_hash_table) {
+  aom_clear_system_state();
+  // Returns 1 or 0 based on 8x8 luma block-match statistics gathered below.
+  int k;
+  uint32_t hash_value_1;
+  uint32_t hash_value_2;
+
+  const int block_size = 8;
+  const double threshold_current = 0.8;
+  const double threshold_average = 0.95;
+  const int max_history_size = 32;  // length of cpi->csm/m_rate_array
+  int T = 0;  // total number of blocks visited
+  int C = 0;  // blocks identical to the collocated block in last_picture
+  int S = 0;  // smooth blocks that do not match the collocated block
+  int M = 0;  // remaining blocks whose hash is found in last_hash_table
+
+  const int pic_width = cur_picture->y_width;
+  const int pic_height = cur_picture->y_height;
+  for (int i = 0; i + block_size <= pic_height; i += block_size) {
+    for (int j = 0; j + block_size <= pic_width; j += block_size) {
+      const int x_pos = j;
+      const int y_pos = i;
+      int match = 1;
+      T++;
+
+      // Check whether the collocated block matches the current block.
+      uint8_t *p_cur = cur_picture->y_buffer;  // NOTE(review): read as bytes;
+      uint8_t *p_ref = last_picture->y_buffer;  // confirm for high bitdepth
+      int stride_cur = cur_picture->y_stride;
+      int stride_ref = last_picture->y_stride;
+      p_cur += (y_pos * stride_cur + x_pos);
+      p_ref += (y_pos * stride_ref + x_pos);
+
+      for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+        for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+          if (p_cur[tmpX] != p_ref[tmpX]) {
+            match = 0;
+          }
+        }
+        p_cur += stride_cur;
+        p_ref += stride_ref;
+      }
+
+      if (match) {
+        C++;
+        continue;
+      }
+
+      if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos,
+                                         y_pos) ||
+          av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) {
+        S++;
+        continue;
+      }
+
+      av1_get_block_hash_value(
+          cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur,
+          block_size, &hash_value_1, &hash_value_2);
+
+      if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) {
+        M++;
+      }
+    }
+  }
+
+  assert(T > 0);  // NOTE(review): T == 0 when width or height < block_size
+  double csm_rate = ((double)(C + S + M)) / ((double)(T));
+  double m_rate = ((double)(M)) / ((double)(T));
+
+  cpi->csm_rate_array[cpi->rate_index] = csm_rate;
+  cpi->m_rate_array[cpi->rate_index] = m_rate;
+
+  cpi->rate_index = (cpi->rate_index + 1) % max_history_size;  // ring buffer
+  cpi->rate_size++;
+  cpi->rate_size = AOMMIN(cpi->rate_size, max_history_size);
+
+  if (csm_rate < threshold_current) {
+    return 0;
+  }
+
+  if (C == T) {  // every block matched its collocated block
+    return 1;
+  }
+
+  double csm_average = 0.0;
+  double m_average = 0.0;
+
+  for (k = 0; k < cpi->rate_size; k++) {  // average over the stored history
+    csm_average += cpi->csm_rate_array[k];
+    m_average += cpi->m_rate_array[k];
+  }
+  csm_average /= cpi->rate_size;
+  m_average /= cpi->rate_size;
+
+  if (csm_average < threshold_average) {
+    return 0;
+  }
+
+  if (M > (T - C - S) / 3) {
+    return 1;
+  }
+
+  if (csm_rate > 0.99 && m_rate > 0.01) {
+    return 1;
+  }
+
+  if (csm_average + m_average > 1.01) {
+    return 1;
+  }
+
+  return 0;
+}
+#endif  // CONFIG_AMVR
+
 int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest, int64_t *time_stamp,
                             int64_t *time_end, int flush) {
@@ -5432,7 +6314,11 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
 
   aom_usec_timer_start(&cmptimer);
 
-  av1_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
+#if CONFIG_AMVR
+  set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV, 0);
+#else
+  set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
+#endif
 
   // Is multi-arf enabled.
   // Note that at the moment multi_arf is only configured for 2 pass VBR
@@ -5441,8 +6327,10 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
   else
     cpi->multi_arf_allowed = 0;
 
-  // Normal defaults
+// Normal defaults
+#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
   cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
+#endif
   cm->refresh_frame_context =
       (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode)
           ? REFRESH_FRAME_CONTEXT_FORWARD
@@ -5452,6 +6340,7 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
   cpi->refresh_golden_frame = 0;
 #if CONFIG_EXT_REFS
   cpi->refresh_bwd_ref_frame = 0;
+  cpi->refresh_alt2_ref_frame = 0;
 #endif  // CONFIG_EXT_REFS
   cpi->refresh_alt_ref_frame = 0;
 
@@ -5537,7 +6426,7 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
 #endif  // CONFIG_BGSPRITE
           av1_temporal_filter(cpi,
 #if CONFIG_BGSPRITE
-                              NULL,
+                              NULL, &cpi->alt_ref_buffer,
 #endif  // CONFIG_BGSPRITE
                               arf_src_index);
         aom_extend_frame_borders(&cpi->alt_ref_buffer);
@@ -5547,14 +6436,64 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
       cm->show_frame = 0;
       cm->intra_only = 0;
       cpi->refresh_alt_ref_frame = 1;
-      cpi->refresh_golden_frame = 0;
       cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+#if CONFIG_EXT_REFS
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
+#endif  // CONFIG_EXT_REFS
       rc->is_src_frame_alt_ref = 0;
     }
     rc->source_alt_ref_pending = 0;
   }
 
 #if CONFIG_EXT_REFS
+  // Should we encode an arf2 frame.
+  arf_src_index = get_arf2_src_index(cpi);
+  if (arf_src_index) {
+    for (i = 0; i <= arf_src_index; ++i) {
+      struct lookahead_entry *e = av1_lookahead_peek(cpi->lookahead, i);
+      // Avoid creating an alt-ref if there's a forced keyframe pending.
+      if (e == NULL) {
+        break;
+      } else if (e->flags == AOM_EFLAG_FORCE_KF) {
+        arf_src_index = 0;
+        flush = 1;
+        break;
+      }
+    }
+  }
+
+  if (arf_src_index) {
+    assert(arf_src_index <= rc->frames_to_key);
+
+    if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
+      cpi->alt_ref_source = source;
+
+      if (oxcf->arnr_max_frames > 0) {
+        // Produce the filtered ARF frame.
+        av1_temporal_filter(cpi,
+#if CONFIG_BGSPRITE
+                            NULL, NULL,
+#endif  // CONFIG_BGSPRITE
+                            arf_src_index);
+        aom_extend_frame_borders(&cpi->alt_ref_buffer);
+        force_src_buffer = &cpi->alt_ref_buffer;
+      }
+
+      cm->show_frame = 0;
+      cm->intra_only = 0;
+      cpi->refresh_alt2_ref_frame = 1;
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      rc->is_src_frame_alt_ref = 0;
+      rc->is_src_frame_ext_arf = 0;
+    }
+    rc->source_alt_ref_pending = 0;
+  }
+
   rc->is_bwd_ref_frame = 0;
   brf_src_index = get_brf_src_index(cpi);
   if (brf_src_index) {
@@ -5566,6 +6505,7 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
       cpi->refresh_bwd_ref_frame = 1;
       cpi->refresh_last_frame = 0;
       cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
       cpi->refresh_alt_ref_frame = 0;
 
       rc->is_bwd_ref_frame = 1;
@@ -5634,13 +6574,10 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
   if (cm->new_fb_idx == INVALID_IDX) return -1;
 
   cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
-
-#if CONFIG_EXT_REFS
-  if (oxcf->pass == 2) {
-    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-    cpi->alt_fb_idx = cpi->arf_map[gf_group->arf_ref_idx[gf_group->index]];
-  }
-#else
+#if CONFIG_HIGHBITDEPTH && CONFIG_GLOBAL_MOTION
+  cm->cur_frame->buf.buf_8bit_valid = 0;
+#endif
+#if !CONFIG_EXT_REFS
   if (cpi->multi_arf_allowed) {
     if (cm->frame_type == KEY_FRAME) {
       init_buffer_indices(cpi);
@@ -5649,7 +6586,7 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
       cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index];
     }
   }
-#endif  // CONFIG_EXT_REFS
+#endif  // !CONFIG_EXT_REFS
 
   // Start with a 0 size frame.
   *size = 0;
@@ -5679,8 +6616,26 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
 #endif
 
 #if CONFIG_REFERENCE_BUFFER
-  if (*time_stamp == 0) {
-    cpi->common.current_frame_id = -1;
+  if (cm->seq_params.frame_id_numbers_present_flag) {
+    if (*time_stamp == 0) {
+      cpi->common.current_frame_id = -1;
+    }
+  }
+#endif  // CONFIG_REFERENCE_BUFFER
+#if CONFIG_AMVR
+  cpi->cur_poc++;
+  if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) {
+    if (cpi->common.seq_mv_precision_level == 2) {
+      struct lookahead_entry *previous_entry =
+          cpi->lookahead->buf + cpi->previsous_index;
+      cpi->common.cur_frame_mv_precision_level = is_integer_mv(
+          cpi, cpi->source, &previous_entry->img, cpi->previsou_hash_table);
+    } else {
+      cpi->common.cur_frame_mv_precision_level =
+          cpi->common.seq_mv_precision_level;
+    }
+  } else {
+    cpi->common.cur_frame_mv_precision_level = 0;
   }
 #endif
 
@@ -5711,9 +6666,35 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
     Pass0Encode(cpi, size, dest, 0, frame_flags);
   }
 #endif
+#if CONFIG_HASH_ME
+  if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) {
+#if CONFIG_AMVR
+    cpi->previsou_hash_table = &cm->cur_frame->hash_table;
+    {
+      int l;
+      for (l = -MAX_PRE_FRAMES; l < cpi->lookahead->max_sz; l++) {
+        if ((cpi->lookahead->buf + l) == source) {
+          cpi->previsous_index = l;
+          break;
+        }
+      }
+
+      if (l == cpi->lookahead->max_sz) {
+        aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                           "Failed to find last frame original buffer");
+      }
+    }
+#endif
+  }
+
+#endif
 
+#if CONFIG_NO_FRAME_CONTEXT_SIGNALING
+  cm->frame_contexts[cm->new_fb_idx] = *cm->fc;
+#else
   if (!cm->error_resilient_mode)
     cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+#endif  // CONFIG_NO_FRAME_CONTEXT_SIGNALING
 
   // No frame encoded, or frame was dropped, release scaled references.
   if ((*size == 0) && (frame_is_intra_only(cm) == 0)) {
@@ -5776,7 +6757,6 @@ int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
 
 int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
                           AOM_SCALING vert_mode) {
-  AV1_COMMON *cm = &cpi->common;
   int hr = 0, hs = 0, vr = 0, vs = 0;
 
   if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1;
@@ -5785,43 +6765,8 @@ int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
   Scale2Ratio(vert_mode, &vr, &vs);
 
   // always go to the next whole number
-  cm->width = (hs - 1 + cpi->oxcf.width * hr) / hs;
-  cm->height = (vs - 1 + cpi->oxcf.height * vr) / vs;
-  assert(cm->width <= cpi->initial_width);
-  assert(cm->height <= cpi->initial_height);
-
-  update_frame_size(cpi);
-
-  return 0;
-}
-
-int av1_set_size_literal(AV1_COMP *cpi, int width, int height) {
-  AV1_COMMON *cm = &cpi->common;
-#if CONFIG_HIGHBITDEPTH
-  check_initial_width(cpi, cm->use_highbitdepth, cm->subsampling_x,
-                      cm->subsampling_y);
-#else
-  check_initial_width(cpi, cm->subsampling_x, cm->subsampling_y);
-#endif  // CONFIG_HIGHBITDEPTH
-
-  if (width <= 0 || height <= 0) return 1;
-
-  cm->width = width;
-  if (cm->width > cpi->initial_width) {
-    cm->width = cpi->initial_width;
-    printf("Warning: Desired width too large, changed to %d\n", cm->width);
-  }
-
-  cm->height = height;
-  if (cm->height > cpi->initial_height) {
-    cm->height = cpi->initial_height;
-    printf("Warning: Desired height too large, changed to %d\n", cm->height);
-  }
-
-  assert(cm->width <= cpi->initial_width);
-  assert(cm->height <= cpi->initial_height);
-
-  update_frame_size(cpi);
+  cpi->resize_pending_width = (hs - 1 + cpi->oxcf.width * hr) / hs;
+  cpi->resize_pending_height = (vs - 1 + cpi->oxcf.height * vr) / vs;
 
   return 0;
 }
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
index 9b98975b7..eb779a3cd 100644
--- a/third_party/aom/av1/encoder/encoder.h
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -53,23 +53,20 @@
 extern "C" {
 #endif
 
-#if CONFIG_SPEED_REFS
-#define MIN_SPEED_REFS_BLKSIZE BLOCK_16X16
-#endif  // CONFIG_SPEED_REFS
-
 typedef struct {
   int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS];
   int nmv_costs[NMV_CONTEXTS][2][MV_VALS];
   int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS];
 
   // 0 = Intra, Last, GF, ARF
-  signed char last_ref_lf_deltas[TOTAL_REFS_PER_FRAME];
+  int8_t last_ref_lf_deltas[TOTAL_REFS_PER_FRAME];
   // 0 = ZERO_MV, MV
-  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
+  int8_t last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
 
   FRAME_CONTEXT fc;
 } CODING_CONTEXT;
 
+#if !CONFIG_NO_FRAME_CONTEXT_SIGNALING
 typedef enum {
   // regular inter frame
   REGULAR_FRAME = 0,
@@ -86,6 +83,7 @@ typedef enum {
   EXT_ARF_FRAME = 5
 #endif
 } FRAME_CONTEXT_INDEX;
+#endif
 
 typedef enum {
   NORMAL = 0,
@@ -105,8 +103,9 @@ typedef enum {
   FRAMEFLAGS_GOLDEN = 1 << 1,
 #if CONFIG_EXT_REFS
   FRAMEFLAGS_BWDREF = 1 << 2,
+  // TODO(zoeliu): To determine whether a frame flag is needed for ALTREF2_FRAME
   FRAMEFLAGS_ALTREF = 1 << 3,
-#else
+#else   // !CONFIG_EXT_REFS
   FRAMEFLAGS_ALTREF = 1 << 2,
 #endif  // CONFIG_EXT_REFS
 } FRAMETYPE_FLAGS;
@@ -116,7 +115,7 @@ typedef enum {
   VARIANCE_AQ = 1,
   COMPLEXITY_AQ = 2,
   CYCLIC_REFRESH_AQ = 3,
-#if CONFIG_DELTA_Q && !CONFIG_EXT_DELTA_Q
+#if !CONFIG_EXT_DELTA_Q
   DELTA_AQ = 4,
 #endif
   AQ_MODE_COUNT  // This should always be the last member of the enum
@@ -131,14 +130,20 @@ typedef enum {
 #endif
 typedef enum {
   RESIZE_NONE = 0,    // No frame resizing allowed.
-  RESIZE_FIXED = 1,   // All frames are coded at the specified dimension.
-  RESIZE_DYNAMIC = 2  // Coded size of each frame is determined by the codec.
+  RESIZE_FIXED = 1,   // All frames are coded at the specified scale.
+  RESIZE_RANDOM = 2,  // All frames are coded at a random scale.
+  RESIZE_MODES
 } RESIZE_MODE;
 #if CONFIG_FRAME_SUPERRES
 typedef enum {
-  SUPERRES_NONE = 0,
-  SUPERRES_FIXED = 1,
-  SUPERRES_DYNAMIC = 2
+  SUPERRES_NONE = 0,     // No frame superres allowed
+  SUPERRES_FIXED = 1,    // All frames are coded at the specified scale,
+                         // and super-resolved.
+  SUPERRES_RANDOM = 2,   // All frames are coded at a random scale,
+                         // and super-resolved.
+  SUPERRES_QTHRESH = 3,  // Superres scale for a frame is determined based on
+                         // q_index
+  SUPERRES_MODES
 } SUPERRES_MODE;
 #endif  // CONFIG_FRAME_SUPERRES
 
@@ -202,6 +207,9 @@ typedef struct AV1EncoderConfig {
   int qm_minlevel;
   int qm_maxlevel;
 #endif
+#if CONFIG_DIST_8X8
+  int using_dist_8x8;
+#endif
   unsigned int num_tile_groups;
   unsigned int mtu;
 
@@ -210,14 +218,16 @@ typedef struct AV1EncoderConfig {
 #endif
   // Internal frame size scaling.
   RESIZE_MODE resize_mode;
-  uint8_t resize_scale_numerator;
-  uint8_t resize_kf_scale_numerator;
+  uint8_t resize_scale_denominator;
+  uint8_t resize_kf_scale_denominator;
 
 #if CONFIG_FRAME_SUPERRES
   // Frame Super-Resolution size scaling.
   SUPERRES_MODE superres_mode;
-  uint8_t superres_scale_numerator;
-  uint8_t superres_kf_scale_numerator;
+  uint8_t superres_scale_denominator;
+  uint8_t superres_kf_scale_denominator;
+  int superres_qthresh;
+  int superres_kf_qthresh;
 #endif  // CONFIG_FRAME_SUPERRES
 
   // Enable feature to reduce the frame quantization every x frames.
@@ -255,6 +265,12 @@ typedef struct AV1EncoderConfig {
 
   int tile_columns;
   int tile_rows;
+#if CONFIG_MAX_TILE
+  int tile_width_count;
+  int tile_height_count;
+  int tile_widths[MAX_TILE_COLS];
+  int tile_heights[MAX_TILE_ROWS];
+#endif
 #if CONFIG_DEPENDENT_HORZTILES
   int dependent_horz_tiles;
 #endif
@@ -277,10 +293,8 @@ typedef struct AV1EncoderConfig {
   int use_highbitdepth;
 #endif
   aom_color_space_t color_space;
-#if CONFIG_COLORSPACE_HEADERS
   aom_transfer_function_t transfer_function;
   aom_chroma_sample_position_t chroma_sample_position;
-#endif
   int color_range;
   int render_width;
   int render_height;
@@ -320,7 +334,6 @@ typedef struct TileDataEnc {
 } TileDataEnc;
 
 typedef struct RD_COUNTS {
-  av1_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
   int64_t comp_pred_diff[REFERENCE_MODES];
 #if CONFIG_GLOBAL_MOTION
   // Stores number of 4x4 blocks using global motion per reference frame.
@@ -334,8 +347,9 @@ typedef struct ThreadData {
   MACROBLOCK mb;
   RD_COUNTS rd_counts;
   FRAME_COUNTS *counts;
-
+#if !CONFIG_CB4X4
   PICK_MODE_CONTEXT *leaf_tree;
+#endif
   PC_TREE *pc_tree;
   PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
 #if CONFIG_MOTION_VAR
@@ -345,9 +359,7 @@ typedef struct ThreadData {
   uint8_t *left_pred_buf;
 #endif
 
-#if CONFIG_PALETTE
   PALETTE_BUFFER *palette_buffer;
-#endif  // CONFIG_PALETTE
 } ThreadData;
 
 struct EncWorkerData;
@@ -381,6 +393,9 @@ typedef struct AV1_COMP {
   QUANTS quants;
   ThreadData td;
   MB_MODE_INFO_EXT *mbmi_ext_base;
+#if CONFIG_LV_MAP
+  CB_COEFF_BUFFER *coeff_buffer_base;
+#endif
   Dequants dequants;
   AV1_COMMON common;
   AV1EncoderConfig oxcf;
@@ -396,6 +411,15 @@ typedef struct AV1_COMP {
 
   // For a still frame, this flag is set to 1 to skip partition search.
   int partition_search_skippable_frame;
+#if CONFIG_AMVR
+  double csm_rate_array[32];
+  double m_rate_array[32];
+  int rate_size;
+  int rate_index;
+  hash_table *previsou_hash_table;
+  int previsous_index;
+  int cur_poc;  // DebugInfo
+#endif
 
   int scaled_ref_idx[TOTAL_REFS_PER_FRAME];
 #if CONFIG_EXT_REFS
@@ -405,9 +429,14 @@ typedef struct AV1_COMP {
 #endif  // CONFIG_EXT_REFS
   int gld_fb_idx;
 #if CONFIG_EXT_REFS
-  int bwd_fb_idx;  // BWD_REF_FRAME
-#endif             // CONFIG_EXT_REFS
+  int bwd_fb_idx;   // BWDREF_FRAME
+  int alt2_fb_idx;  // ALTREF2_FRAME
+#endif              // CONFIG_EXT_REFS
   int alt_fb_idx;
+#if CONFIG_EXT_REFS
+  int ext_fb_idx;      // extra ref frame buffer index
+  int refresh_fb_idx;  // ref frame buffer index to refresh
+#endif                 // CONFIG_EXT_REFS
 
   int last_show_frame_buf_idx;  // last show frame buffer index
 
@@ -415,6 +444,7 @@ typedef struct AV1_COMP {
   int refresh_golden_frame;
 #if CONFIG_EXT_REFS
   int refresh_bwd_ref_frame;
+  int refresh_alt2_ref_frame;
 #endif  // CONFIG_EXT_REFS
   int refresh_alt_ref_frame;
 
@@ -441,6 +471,11 @@ typedef struct AV1_COMP {
 
   CODING_CONTEXT coding_context;
 
+#if CONFIG_GLOBAL_MOTION
+  int gmtype_cost[TRANS_TYPES];
+  int gmparams_cost[TOTAL_REFS_PER_FRAME];
+#endif  // CONFIG_GLOBAL_MOTION
+
   int nmv_costs[NMV_CONTEXTS][2][MV_VALS];
   int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS];
 
@@ -534,77 +569,17 @@ typedef struct AV1_COMP {
                     // number of MBs in the current frame when the frame is
                     // scaled.
 
+  // When a resize is requested through external control, the desired
+  // width/height are stored here until they are used for the next coded
+  // frame. They are effective for only one frame and are reset after
+  // use.
+  int resize_pending_width;
+  int resize_pending_height;
+
   int frame_flags;
 
   search_site_config ss_cfg;
 
-  int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
-  int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
-  int zeromv_mode_cost[ZEROMV_MODE_CONTEXTS][2];
-  int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
-  int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
-
-  unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
-#if CONFIG_EXT_INTER
-  unsigned int inter_compound_mode_cost[INTER_MODE_CONTEXTS]
-                                       [INTER_COMPOUND_MODES];
-#if CONFIG_COMPOUND_SINGLEREF
-  unsigned int inter_singleref_comp_mode_cost[INTER_MODE_CONTEXTS]
-                                             [INTER_SINGLEREF_COMP_MODES];
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_INTERINTRA
-  unsigned int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
-#endif  // CONFIG_INTERINTRA
-#endif  // CONFIG_EXT_INTER
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES];
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-  int motion_mode_cost1[BLOCK_SIZES_ALL][2];
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#if CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT
-  int ncobmc_mode_cost[ADAPT_OVERLAP_BLOCKS][MAX_NCOBMC_MODES];
-#endif  // CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-  int intra_uv_mode_cost[INTRA_MODES][UV_INTRA_MODES];
-  int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
-  int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
-#if CONFIG_EXT_PARTITION_TYPES
-  int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX]
-                    [EXT_PARTITION_TYPES];
-#else
-  int partition_cost[PARTITION_CONTEXTS + CONFIG_UNPOISON_PARTITION_CTX]
-                    [PARTITION_TYPES];
-#endif
-#if CONFIG_PALETTE
-  int palette_y_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
-  int palette_uv_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
-  int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
-                          [PALETTE_COLORS];
-  int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
-                           [PALETTE_COLORS];
-#endif  // CONFIG_PALETTE
-  int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
-#if CONFIG_EXT_TX
-  int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
-  int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
-                         [TX_TYPES];
-#else
-  int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
-  int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES];
-#endif  // CONFIG_EXT_TX
-#if CONFIG_EXT_INTRA
-#if CONFIG_INTRA_INTERP
-  int intra_filter_cost[INTRA_FILTERS + 1][INTRA_FILTERS];
-#endif  // CONFIG_INTRA_INTERP
-#endif  // CONFIG_EXT_INTRA
-#if CONFIG_LOOP_RESTORATION
-  int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES];
-#endif  // CONFIG_LOOP_RESTORATION
-#if CONFIG_GLOBAL_MOTION
-  int gmtype_cost[TRANS_TYPES];
-  int gmparams_cost[TOTAL_REFS_PER_FRAME];
-#endif  // CONFIG_GLOBAL_MOTION
-
   int multi_arf_allowed;
   int multi_arf_enabled;
   int multi_arf_last_grp_enabled;
@@ -639,25 +614,24 @@ typedef struct AV1_COMP {
   int is_arf_filter_off[MAX_EXT_ARFS + 1];
   int num_extra_arfs;
   int arf_map[MAX_EXT_ARFS + 1];
+  int arf_pos_in_gf[MAX_EXT_ARFS + 1];
+  int arf_pos_for_ovrly[MAX_EXT_ARFS + 1];
 #endif  // CONFIG_EXT_REFS
 #if CONFIG_GLOBAL_MOTION
   int global_motion_search_done;
 #endif
-#if CONFIG_REFERENCE_BUFFER
-  SequenceHeader seq_params;
-#endif
 #if CONFIG_LV_MAP
   tran_low_t *tcoeff_buf[MAX_MB_PLANE];
 #endif
 
-#if CONFIG_SPEED_REFS
-  int sb_scanning_pass_idx;
-#endif  // CONFIG_SPEED_REFS
-
-#if CONFIG_FLEX_REFS
+#if CONFIG_EXT_REFS
   int extra_arf_allowed;
   int bwd_ref_allowed;
-#endif  // CONFIG_FLEX_REFS
+#endif  // CONFIG_EXT_REFS
+
+#if CONFIG_BGSPRITE
+  int bgsprite_allowed;
+#endif  // CONFIG_BGSPRITE
 } AV1_COMP;
 
 void av1_initialize_enc(void);
@@ -686,11 +660,9 @@ int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags);
 
 void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags);
 
-int av1_copy_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag,
-                           YV12_BUFFER_CONFIG *sd);
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
 
-int av1_set_reference_enc(AV1_COMP *cpi, AOM_REFFRAME ref_frame_flag,
-                          YV12_BUFFER_CONFIG *sd);
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
 
 int av1_update_entropy(AV1_COMP *cpi, int update);
 
@@ -701,14 +673,8 @@ int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
 int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
                           AOM_SCALING vert_mode);
 
-// Returns 1 if the assigned width or height was <= 0.
-int av1_set_size_literal(AV1_COMP *cpi, int width, int height);
-
 int av1_get_quantizer(struct AV1_COMP *cpi);
 
-void av1_full_to_model_counts(av1_coeff_count_model *model_count,
-                              av1_coeff_count *full_count);
-
 static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
   return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
          (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
@@ -727,6 +693,8 @@ static INLINE int get_ref_frame_map_idx(const AV1_COMP *cpi,
 #if CONFIG_EXT_REFS
   else if (ref_frame == BWDREF_FRAME)
     return cpi->bwd_fb_idx;
+  else if (ref_frame == ALTREF2_FRAME)
+    return cpi->alt2_fb_idx;
 #endif  // CONFIG_EXT_REFS
   else
     return cpi->alt_fb_idx;
@@ -739,6 +707,17 @@ static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi,
   return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
 }
 
+#if CONFIG_HASH_ME
+static INLINE hash_table *get_ref_frame_hash_map(const AV1_COMP *cpi,
+                                                 MV_REFERENCE_FRAME ref_frame) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+  return buf_idx != INVALID_IDX  // hash table of ref_frame's frame buffer
+             ? &cm->buffer_pool->frame_bufs[buf_idx].hash_table
+             : NULL;  // no buffer index is mapped for ref_frame
+}
+#endif  // CONFIG_HASH_ME
+
 static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
     const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
   const AV1_COMMON *const cm = &cpi->common;
@@ -781,13 +760,6 @@ static INLINE unsigned int allocated_tokens(TileInfo tile) {
   return get_token_alloc(tile_mb_rows, tile_mb_cols);
 }
 
-void av1_alloc_compressor_data(AV1_COMP *cpi);
-
-void av1_scale_references(AV1_COMP *cpi);
-
-void av1_update_reference_frames(AV1_COMP *cpi);
-
-void av1_set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv);
 #if CONFIG_TEMPMV_SIGNALING
 void av1_set_temporal_mv_prediction(AV1_COMP *cpi, int allow_tempmv_prediction);
 #endif
diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c
index 3aa4c183e..6209d6fa4 100644
--- a/third_party/aom/av1/encoder/encodetxb.c
+++ b/third_party/aom/av1/encoder/encodetxb.c
@@ -38,7 +38,14 @@ void av1_alloc_txb_buf(AV1_COMP *cpi) {
         aom_malloc(sizeof(*cpi->tcoeff_buf[i]) * pixel_stride * pixel_height));
   }
 #else
-  (void)cpi;
+  AV1_COMMON *cm = &cpi->common;
+  int size = ((cm->mi_rows >> MAX_MIB_SIZE_LOG2) + 1) *
+             ((cm->mi_cols >> MAX_MIB_SIZE_LOG2) + 1);
+
+  av1_free_txb_buf(cpi);
+  // TODO(jingning): This should be further reduced.
+  CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base,
+                  aom_malloc(sizeof(*cpi->coeff_buffer_base) * size));
 #endif
 }
 
@@ -49,10 +56,27 @@ void av1_free_txb_buf(AV1_COMP *cpi) {
     aom_free(cpi->tcoeff_buf[i]);
   }
 #else
-  (void)cpi;
+  aom_free(cpi->coeff_buffer_base);
 #endif
 }
 
+void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                          int mi_row, int mi_col) {
+  int stride = (cpi->common.mi_cols >> MAX_MIB_SIZE_LOG2) + 1;
+  int offset =
+      (mi_row >> MAX_MIB_SIZE_LOG2) * stride + (mi_col >> MAX_MIB_SIZE_LOG2);
+  CB_COEFF_BUFFER *coeff_buf = &cpi->coeff_buffer_base[offset];
+  const int txb_offset = x->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    x->mbmi_ext->tcoeff[plane] = coeff_buf->tcoeff[plane] + x->cb_offset;
+    x->mbmi_ext->eobs[plane] = coeff_buf->eobs[plane] + txb_offset;
+    x->mbmi_ext->txb_skip_ctx[plane] =
+        coeff_buf->txb_skip_ctx[plane] + txb_offset;
+    x->mbmi_ext->dc_sign_ctx[plane] =
+        coeff_buf->dc_sign_ctx[plane] + txb_offset;
+  }
+}
+
 static void write_golomb(aom_writer *w, int level) {
   int x = level + 1;
   int i = x;
@@ -69,12 +93,178 @@ static void write_golomb(aom_writer *w, int level) {
   for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01);
 }
 
+static INLINE void write_nz_map(aom_writer *w, const tran_low_t *tcoeff,
+                                uint16_t eob, int plane, const int16_t *scan,
+                                TX_SIZE tx_size, TX_TYPE tx_type,
+                                FRAME_CONTEXT *fc) {
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_SIZE txs_ctx = get_txsize_context(tx_size);
+  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+  const int height = tx_size_high[tx_size];
+#if CONFIG_CTX1D
+  const int width = tx_size_wide[tx_size];
+  const int eob_offset = width + height;
+  const TX_CLASS tx_class = get_tx_class(tx_type);
+  const int seg_eob =
+      (tx_class == TX_CLASS_2D) ? tx_size_2d[tx_size] : eob_offset;
+#else
+  const int seg_eob = tx_size_2d[tx_size];
+#endif
+#if !LV_MAP_PROB
+  aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type];
+  aom_prob *eob_flag = fc->eob_flag[txs_ctx][plane_type];
+#endif
+
+  for (int c = 0; c < eob; ++c) {
+    int coeff_ctx = get_nz_map_ctx(tcoeff, c, scan, bwl, height, tx_type);
+    int eob_ctx = get_eob_ctx(tcoeff, scan[c], txs_ctx, tx_type);
+
+    tran_low_t v = tcoeff[scan[c]];
+    int is_nz = (v != 0);
+
+    if (c == seg_eob - 1) break;
+
+#if LV_MAP_PROB
+    aom_write_bin(w, is_nz, fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2);
+#else
+    aom_write(w, is_nz, nz_map[coeff_ctx]);
+#endif
+
+    if (is_nz) {
+#if LV_MAP_PROB
+      aom_write_bin(w, c == (eob - 1),
+                    fc->eob_flag_cdf[txs_ctx][plane_type][eob_ctx], 2);
+#else
+      aom_write(w, c == (eob - 1), eob_flag[eob_ctx]);
+#endif
+    }
+  }
+}
+
+#if CONFIG_CTX1D
+static INLINE void write_nz_map_vert(aom_writer *w, const tran_low_t *tcoeff,
+                                     uint16_t eob, int plane,
+                                     const int16_t *scan, const int16_t *iscan,
+                                     TX_SIZE tx_size, TX_TYPE tx_type,
+                                     FRAME_CONTEXT *fc) {
+  (void)eob;
+  const TX_SIZE txs_ctx = get_txsize_context(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_CLASS tx_class = get_tx_class(tx_type);
+  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  int16_t eob_ls[MAX_HVTX_SIZE];
+  get_eob_vert(eob_ls, tcoeff, width, height);
+#if !LV_MAP_PROB
+  aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type];
+#endif
+  for (int c = 0; c < width; ++c) {
+    int16_t veob = eob_ls[c];
+    assert(veob <= height);
+    int el_ctx = get_empty_line_ctx(c, eob_ls);
+#if LV_MAP_PROB
+    aom_write_bin(w, veob == 0,
+                  fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], 2);
+#else
+    aom_write(w, veob == 0,
+              fc->empty_line[txs_ctx][plane_type][tx_class][el_ctx]);
+#endif
+    if (veob) {
+      for (int r = 0; r < veob; ++r) {
+        if (r + 1 != height) {
+          int coeff_idx = r * width + c;
+          int scan_idx = iscan[coeff_idx];
+          int is_nz = tcoeff[coeff_idx] != 0;
+          int coeff_ctx =
+              get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type);
+#if LV_MAP_PROB
+          aom_write_bin(w, is_nz,
+                        fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2);
+#else
+          aom_write(w, is_nz, nz_map[coeff_ctx]);
+#endif
+          if (is_nz) {
+            int eob_ctx = get_hv_eob_ctx(c, r, eob_ls);
+#if LV_MAP_PROB
+            aom_write_bin(
+                w, r == veob - 1,
+                fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], 2);
+#else
+            aom_write(w, r == veob - 1,
+                      fc->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx]);
+#endif
+          }
+        }
+      }
+    }
+  }
+}
+
+static INLINE void write_nz_map_horiz(aom_writer *w, const tran_low_t *tcoeff,
+                                      uint16_t eob, int plane,
+                                      const int16_t *scan, const int16_t *iscan,
+                                      TX_SIZE tx_size, TX_TYPE tx_type,
+                                      FRAME_CONTEXT *fc) {
+  (void)scan;
+  (void)eob;
+  const TX_SIZE txs_ctx = get_txsize_context(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_CLASS tx_class = get_tx_class(tx_type);
+  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  int16_t eob_ls[MAX_HVTX_SIZE];
+  get_eob_horiz(eob_ls, tcoeff, width, height);
+#if !LV_MAP_PROB
+  aom_prob *nz_map = fc->nz_map[txs_ctx][plane_type];
+#endif
+  for (int r = 0; r < height; ++r) {
+    int16_t heob = eob_ls[r];
+    int el_ctx = get_empty_line_ctx(r, eob_ls);
+#if LV_MAP_PROB
+    aom_write_bin(w, heob == 0,
+                  fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx], 2);
+#else
+    aom_write(w, heob == 0,
+              fc->empty_line[txs_ctx][plane_type][tx_class][el_ctx]);
+#endif
+    if (heob) {
+      for (int c = 0; c < heob; ++c) {
+        if (c + 1 != width) {
+          int coeff_idx = r * width + c;
+          int scan_idx = iscan[coeff_idx];
+          int is_nz = tcoeff[coeff_idx] != 0;
+          int coeff_ctx =
+              get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type);
+#if LV_MAP_PROB
+          aom_write_bin(w, is_nz,
+                        fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], 2);
+#else
+          aom_write(w, is_nz, nz_map[coeff_ctx]);
+#endif
+          if (is_nz) {
+            int eob_ctx = get_hv_eob_ctx(r, c, eob_ls);
+#if LV_MAP_PROB
+            aom_write_bin(
+                w, c == heob - 1,
+                fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx], 2);
+#else
+            aom_write(w, c == heob - 1,
+                      fc->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx]);
+#endif
+          }
+        }
+      }
+    }
+  }
+}
+#endif
+
 void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
                           aom_writer *w, int blk_row, int blk_col, int block,
                           int plane, TX_SIZE tx_size, const tran_low_t *tcoeff,
                           uint16_t eob, TXB_CTX *txb_ctx) {
-  aom_prob *nz_map;
-  aom_prob *eob_flag;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   const PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_SIZE txs_ctx = get_txsize_context(tx_size);
@@ -82,18 +272,21 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
       av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
   const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi);
   const int16_t *scan = scan_order->scan;
-  const int16_t *iscan = scan_order->iscan;
   int c;
-  int is_nz;
   const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
   const int height = tx_size_high[tx_size];
-  const int seg_eob = tx_size_2d[tx_size];
   uint16_t update_eob = 0;
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
   (void)blk_row;
   (void)blk_col;
 
-  aom_write(w, eob == 0, cm->fc->txb_skip[txs_ctx][txb_ctx->txb_skip_ctx]);
+#if LV_MAP_PROB
+  aom_write_bin(w, eob == 0,
+                ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2);
+#else
+  aom_write(w, eob == 0, ec_ctx->txb_skip[txs_ctx][txb_ctx->txb_skip_ctx]);
+#endif
 
   if (eob == 0) return;
 #if CONFIG_TXK_SEL
@@ -101,29 +294,42 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
                     get_min_tx_size(tx_size), w);
 #endif
 
-  nz_map = cm->fc->nz_map[txs_ctx][plane_type];
-  eob_flag = cm->fc->eob_flag[txs_ctx][plane_type];
-
-  for (c = 0; c < eob; ++c) {
-    int coeff_ctx = get_nz_map_ctx(tcoeff, scan[c], bwl, height, iscan);
-    int eob_ctx = get_eob_ctx(tcoeff, scan[c], txs_ctx);
-
-    tran_low_t v = tcoeff[scan[c]];
-    is_nz = (v != 0);
-
-    if (c == seg_eob - 1) break;
-
-    aom_write(w, is_nz, nz_map[coeff_ctx]);
-
-    if (is_nz) {
-      aom_write(w, c == (eob - 1), eob_flag[eob_ctx]);
+#if CONFIG_CTX1D
+  TX_CLASS tx_class = get_tx_class(tx_type);
+  if (tx_class == TX_CLASS_2D) {
+    write_nz_map(w, tcoeff, eob, plane, scan, tx_size, tx_type, ec_ctx);
+  } else {
+    const int width = tx_size_wide[tx_size];
+    const int eob_offset = width + height;
+    const int eob_mode = eob > eob_offset;
+#if LV_MAP_PROB
+    aom_write_bin(w, eob_mode,
+                  ec_ctx->eob_mode_cdf[txs_ctx][plane_type][tx_class], 2);
+#else
+    aom_write(w, eob_mode, ec_ctx->eob_mode[txs_ctx][plane_type][tx_class]);
+#endif
+    if (eob_mode == 0) {
+      write_nz_map(w, tcoeff, eob, plane, scan, tx_size, tx_type, ec_ctx);
+    } else {
+      const int16_t *iscan = scan_order->iscan;
+      assert(tx_class == TX_CLASS_VERT || tx_class == TX_CLASS_HORIZ);
+      if (tx_class == TX_CLASS_VERT)
+        write_nz_map_vert(w, tcoeff, eob, plane, scan, iscan, tx_size, tx_type,
+                          ec_ctx);
+      else
+        write_nz_map_horiz(w, tcoeff, eob, plane, scan, iscan, tx_size, tx_type,
+                           ec_ctx);
     }
   }
+#else
+  write_nz_map(w, tcoeff, eob, plane, scan, tx_size, tx_type, ec_ctx);
+#endif  // CONFIG_CTX1D
 
   int i;
   for (i = 0; i < NUM_BASE_LEVELS; ++i) {
-    aom_prob *coeff_base = cm->fc->coeff_base[txs_ctx][plane_type][i];
-
+#if !LV_MAP_PROB
+    aom_prob *coeff_base = ec_ctx->coeff_base[txs_ctx][plane_type][i];
+#endif
     update_eob = 0;
     for (c = eob - 1; c >= 0; --c) {
       tran_low_t v = tcoeff[scan[c]];
@@ -136,15 +342,32 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
       ctx = get_base_ctx(tcoeff, scan[c], bwl, height, i + 1);
 
       if (level == i + 1) {
+#if LV_MAP_PROB
+        aom_write_bin(w, 1, ec_ctx->coeff_base_cdf[txs_ctx][plane_type][i][ctx],
+                      2);
+#else
         aom_write(w, 1, coeff_base[ctx]);
+#endif
         if (c == 0) {
-          aom_write(w, sign, cm->fc->dc_sign[plane_type][txb_ctx->dc_sign_ctx]);
+#if LV_MAP_PROB
+          aom_write_bin(w, sign,
+                        ec_ctx->dc_sign_cdf[plane_type][txb_ctx->dc_sign_ctx],
+                        2);
+#else
+          aom_write(w, sign, ec_ctx->dc_sign[plane_type][txb_ctx->dc_sign_ctx]);
+#endif
         } else {
           aom_write_bit(w, sign);
         }
         continue;
       }
+
+#if LV_MAP_PROB
+      aom_write_bin(w, 0, ec_ctx->coeff_base_cdf[txs_ctx][plane_type][i][ctx],
+                    2);
+#else
       aom_write(w, 0, coeff_base[ctx]);
+#endif
       update_eob = AOMMAX(update_eob, c);
     }
   }
@@ -159,21 +382,70 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
     if (level <= NUM_BASE_LEVELS) continue;
 
     if (c == 0) {
-      aom_write(w, sign, cm->fc->dc_sign[plane_type][txb_ctx->dc_sign_ctx]);
+#if LV_MAP_PROB
+      aom_write_bin(w, sign,
+                    ec_ctx->dc_sign_cdf[plane_type][txb_ctx->dc_sign_ctx], 2);
+#else
+      aom_write(w, sign, ec_ctx->dc_sign[plane_type][txb_ctx->dc_sign_ctx]);
+#endif
     } else {
       aom_write_bit(w, sign);
     }
 
     // level is above 1.
     ctx = get_br_ctx(tcoeff, scan[c], bwl, height);
+
+#if BR_NODE
+    int base_range = level - 1 - NUM_BASE_LEVELS;
+    int br_set_idx = 0;
+    int br_base = 0;
+    int br_offset = 0;
+
+    if (base_range >= COEFF_BASE_RANGE)
+      br_set_idx = BASE_RANGE_SETS;
+    else
+      br_set_idx = coeff_to_br_index[base_range];
+
+    for (idx = 0; idx < BASE_RANGE_SETS; ++idx) {
+      aom_write_bin(w, idx == br_set_idx,
+                    ec_ctx->coeff_br_cdf[txs_ctx][plane_type][idx][ctx], 2);
+      if (idx == br_set_idx) {
+        br_base = br_index_to_coeff[br_set_idx];
+        br_offset = base_range - br_base;
+        int extra_bits = (1 << br_extra_bits[idx]) - 1;
+        for (int tok = 0; tok < extra_bits; ++tok) {
+          if (tok == br_offset) {
+            aom_write_bin(w, 1, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx],
+                          2);
+            break;
+          }
+          aom_write_bin(w, 0, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx],
+                        2);
+        }
+        //        aom_write_literal(w, br_offset, br_extra_bits[idx]);
+        break;
+      }
+    }
+
+    if (br_set_idx < BASE_RANGE_SETS) continue;
+#else  // BR_NODE
     for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) {
       if (level == (idx + 1 + NUM_BASE_LEVELS)) {
-        aom_write(w, 1, cm->fc->coeff_lps[txs_ctx][plane_type][ctx]);
+#if LV_MAP_PROB
+        aom_write_bin(w, 1, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx], 2);
+#else
+        aom_write(w, 1, ec_ctx->coeff_lps[txs_ctx][plane_type][ctx]);
+#endif
         break;
       }
-      aom_write(w, 0, cm->fc->coeff_lps[txs_ctx][plane_type][ctx]);
+#if LV_MAP_PROB
+      aom_write_bin(w, 0, ec_ctx->coeff_lps_cdf[txs_ctx][plane_type][ctx], 2);
+#else
+      aom_write(w, 0, ec_ctx->coeff_lps[txs_ctx][plane_type][ctx]);
+#endif
     }
     if (idx < COEFF_BASE_RANGE) continue;
+#endif  // BR_NODE
 
     // use 0-th order Golomb code to handle the residual level.
     write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS);
@@ -247,50 +519,164 @@ static INLINE void get_base_ctx_set(const tran_low_t *tcoeffs,
   }
 
   for (i = 0; i < NUM_BASE_LEVELS; ++i) {
-    ctx_set[i] = (ctx_set[i] + 1) >> 1;
-
-    if (row == 0 && col == 0)
-      ctx_set[i] = (ctx_set[i] << 1) + mag[i];
-    else if (row == 0)
-      ctx_set[i] = 8 + (ctx_set[i] << 1) + mag[i];
-    else if (col == 0)
-      ctx_set[i] = 18 + (ctx_set[i] << 1) + mag[i];
-    else
-      ctx_set[i] = 28 + (ctx_set[i] << 1) + mag[i];
+    ctx_set[i] = get_base_ctx_from_count_mag(row, col, ctx_set[i], mag[i]);
   }
   return;
 }
 
 static INLINE int get_br_cost(tran_low_t abs_qc, int ctx,
-                              const aom_prob *coeff_lps) {
+                              const int *coeff_lps) {
   const tran_low_t min_level = 1 + NUM_BASE_LEVELS;
   const tran_low_t max_level = 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE;
+  (void)ctx;
   if (abs_qc >= min_level) {
-    const int cost0 = av1_cost_bit(coeff_lps[ctx], 0);
-    const int cost1 = av1_cost_bit(coeff_lps[ctx], 1);
+#if BR_NODE
+    if (abs_qc >= max_level)
+      return coeff_lps[COEFF_BASE_RANGE];  // COEFF_BASE_RANGE * cost0;
+    else
+      return coeff_lps[(abs_qc - min_level)];  //  * cost0 + cost1;
+#else
+    const int cost0 = coeff_lps[0];
+    const int cost1 = coeff_lps[1];
     if (abs_qc >= max_level)
       return COEFF_BASE_RANGE * cost0;
     else
       return (abs_qc - min_level) * cost0 + cost1;
+#endif
   } else {
     return 0;
   }
 }
 
 static INLINE int get_base_cost(tran_low_t abs_qc, int ctx,
-                                aom_prob (*coeff_base)[COEFF_BASE_CONTEXTS],
-                                int base_idx) {
+                                const int coeff_base[2], int base_idx) {
   const int level = base_idx + 1;
+  (void)ctx;
   if (abs_qc < level)
     return 0;
   else
-    return av1_cost_bit(coeff_base[base_idx][ctx], abs_qc == level);
+    return coeff_base[abs_qc == level];
+}
+
+int get_nz_eob_map_cost(const LV_MAP_COEFF_COST *coeff_costs,
+                        const tran_low_t *qcoeff, uint16_t eob, int plane,
+                        const int16_t *scan, TX_SIZE tx_size, TX_TYPE tx_type) {
+  (void)plane;
+  TX_SIZE txs_ctx = get_txsize_context(tx_size);
+  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+  const int height = tx_size_high[tx_size];
+#if CONFIG_CTX1D
+  const TX_CLASS tx_class = get_tx_class(tx_type);
+  const int width = tx_size_wide[tx_size];
+  const int eob_offset = width + height;
+  const int seg_eob =
+      (tx_class == TX_CLASS_2D) ? tx_size_2d[tx_size] : eob_offset;
+#else
+  const int seg_eob = tx_size_2d[tx_size];
+#endif
+  int cost = 0;
+  for (int c = 0; c < eob; ++c) {
+    tran_low_t v = qcoeff[scan[c]];
+    int is_nz = (v != 0);
+    if (c + 1 != seg_eob) {
+      int coeff_ctx = get_nz_map_ctx(qcoeff, c, scan, bwl, height, tx_type);
+      cost += coeff_costs->nz_map_cost[coeff_ctx][is_nz];
+      if (is_nz) {
+        int eob_ctx = get_eob_ctx(qcoeff, scan[c], txs_ctx, tx_type);
+        cost += coeff_costs->eob_cost[eob_ctx][c == (eob - 1)];
+      }
+    }
+  }
+  return cost;
+}
+
+#if CONFIG_CTX1D
+static INLINE int get_nz_eob_map_cost_vert(const LV_MAP_COEFF_COST *coeff_costs,
+                                           const tran_low_t *qcoeff,
+                                           uint16_t eob, int plane,
+                                           const int16_t *scan,
+                                           const int16_t *iscan,
+                                           TX_SIZE tx_size, TX_TYPE tx_type) {
+  (void)tx_size;
+  (void)scan;
+  (void)eob;
+  (void)plane;
+  const TX_CLASS tx_class = get_tx_class(tx_type);
+  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  int16_t eob_ls[MAX_HVTX_SIZE];
+  get_eob_vert(eob_ls, qcoeff, width, height);
+  int cost = 0;
+  for (int c = 0; c < width; ++c) {
+    int16_t veob = eob_ls[c];
+    assert(veob <= height);
+    int el_ctx = get_empty_line_ctx(c, eob_ls);
+    cost += coeff_costs->empty_line_cost[tx_class][el_ctx][veob == 0];
+    if (veob) {
+      for (int r = 0; r < veob; ++r) {
+        if (r + 1 != height) {
+          int coeff_idx = r * width + c;
+          int scan_idx = iscan[coeff_idx];
+          int is_nz = qcoeff[coeff_idx] != 0;
+          int coeff_ctx =
+              get_nz_map_ctx(qcoeff, scan_idx, scan, bwl, height, tx_type);
+          cost += coeff_costs->nz_map_cost[coeff_ctx][is_nz];
+          if (is_nz) {
+            int eob_ctx = get_hv_eob_ctx(c, r, eob_ls);
+            cost += coeff_costs->hv_eob_cost[tx_class][eob_ctx][r == veob - 1];
+          }
+        }
+      }
+    }
+  }
+  return cost;
+}
+
+static INLINE int get_nz_eob_map_cost_horiz(
+    const LV_MAP_COEFF_COST *coeff_costs, const tran_low_t *qcoeff,
+    uint16_t eob, int plane, const int16_t *scan, const int16_t *iscan,
+    TX_SIZE tx_size, TX_TYPE tx_type) {
+  (void)tx_size;
+  (void)scan;
+  (void)eob;
+  (void)plane;
+  const TX_CLASS tx_class = get_tx_class(tx_type);
+  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  int16_t eob_ls[MAX_HVTX_SIZE];
+  get_eob_horiz(eob_ls, qcoeff, width, height);
+  int cost = 0;
+  for (int r = 0; r < height; ++r) {
+    int16_t heob = eob_ls[r];
+    assert(heob <= width);
+    int el_ctx = get_empty_line_ctx(r, eob_ls);
+    cost += coeff_costs->empty_line_cost[tx_class][el_ctx][heob == 0];
+    if (heob) {
+      for (int c = 0; c < heob; ++c) {
+        if (c + 1 != width) {
+          int coeff_idx = r * width + c;
+          int scan_idx = iscan[coeff_idx];
+          int is_nz = qcoeff[coeff_idx] != 0;
+          int coeff_ctx =
+              get_nz_map_ctx(qcoeff, scan_idx, scan, bwl, height, tx_type);
+          cost += coeff_costs->nz_map_cost[coeff_ctx][is_nz];
+          if (is_nz) {
+            int eob_ctx = get_hv_eob_ctx(r, c, eob_ls);
+            cost += coeff_costs->hv_eob_cost[tx_class][eob_ctx][c == heob - 1];
+          }
+        }
+      }
+    }
+  }
+  return cost;
 }
+#endif
 
-int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
+int av1_cost_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
                         int blk_row, int blk_col, int block, TX_SIZE tx_size,
                         TXB_CTX *txb_ctx) {
-  const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   TX_SIZE txs_ctx = get_txsize_context(tx_size);
   const PLANE_TYPE plane_type = get_plane_type(plane);
@@ -301,43 +687,62 @@ int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
   const int eob = p->eobs[block];
   const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   int c, cost;
-  const int seg_eob = AOMMIN(eob, tx_size_2d[tx_size] - 1);
   int txb_skip_ctx = txb_ctx->txb_skip_ctx;
-  aom_prob *nz_map = xd->fc->nz_map[txs_ctx][plane_type];
 
   const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
   const int height = tx_size_high[tx_size];
 
-  aom_prob(*coeff_base)[COEFF_BASE_CONTEXTS] =
-      xd->fc->coeff_base[txs_ctx][plane_type];
-
   const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi);
   const int16_t *scan = scan_order->scan;
-  const int16_t *iscan = scan_order->iscan;
+
+  LV_MAP_COEFF_COST *coeff_costs = &x->coeff_costs[txs_ctx][plane_type];
 
   cost = 0;
 
   if (eob == 0) {
-    cost = av1_cost_bit(xd->fc->txb_skip[txs_ctx][txb_skip_ctx], 1);
+    cost = coeff_costs->txb_skip_cost[txb_skip_ctx][1];
     return cost;
   }
-
-  cost = av1_cost_bit(xd->fc->txb_skip[txs_ctx][txb_skip_ctx], 0);
+  cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
 
 #if CONFIG_TXK_SEL
-  cost += av1_tx_type_cost(cpi, xd, mbmi->sb_type, plane, tx_size, tx_type);
+  cost += av1_tx_type_cost(cm, x, xd, mbmi->sb_type, plane, tx_size, tx_type);
 #endif
 
+#if CONFIG_CTX1D
+  TX_CLASS tx_class = get_tx_class(tx_type);
+  if (tx_class == TX_CLASS_2D) {
+    cost += get_nz_eob_map_cost(coeff_costs, qcoeff, eob, plane, scan, tx_size,
+                                tx_type);
+  } else {
+    const int width = tx_size_wide[tx_size];
+    const int eob_offset = width + height;
+    const int eob_mode = eob > eob_offset;
+    cost += coeff_costs->eob_mode_cost[tx_class][eob_mode];
+    if (eob_mode == 0) {
+      cost += get_nz_eob_map_cost(coeff_costs, qcoeff, eob, plane, scan,
+                                  tx_size, tx_type);
+    } else {
+      const int16_t *iscan = scan_order->iscan;
+      assert(tx_class == TX_CLASS_VERT || tx_class == TX_CLASS_HORIZ);
+      if (tx_class == TX_CLASS_VERT)
+        cost += get_nz_eob_map_cost_vert(coeff_costs, qcoeff, eob, plane, scan,
+                                         iscan, tx_size, tx_type);
+      else
+        cost += get_nz_eob_map_cost_horiz(coeff_costs, qcoeff, eob, plane, scan,
+                                          iscan, tx_size, tx_type);
+    }
+  }
+#else   // CONFIG_CTX1D
+  cost += get_nz_eob_map_cost(coeff_costs, qcoeff, eob, plane, scan, tx_size,
+                              tx_type);
+#endif  // CONFIG_CTX1D
+
   for (c = 0; c < eob; ++c) {
     tran_low_t v = qcoeff[scan[c]];
     int is_nz = (v != 0);
     int level = abs(v);
 
-    if (c < seg_eob) {
-      int coeff_ctx = get_nz_map_ctx(qcoeff, scan[c], bwl, height, iscan);
-      cost += av1_cost_bit(nz_map[coeff_ctx], is_nz);
-    }
-
     if (is_nz) {
       int ctx_ls[NUM_BASE_LEVELS] = { 0 };
       int sign = (v < 0) ? 1 : 0;
@@ -345,8 +750,7 @@ int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
       // sign bit cost
       if (c == 0) {
         int dc_sign_ctx = txb_ctx->dc_sign_ctx;
-
-        cost += av1_cost_bit(xd->fc->dc_sign[plane_type][dc_sign_ctx], sign);
+        cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign];
       } else {
         cost += av1_cost_bit(128, sign);
       }
@@ -358,28 +762,33 @@ int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
         if (level <= i) continue;
 
         if (level == i + 1) {
-          cost += av1_cost_bit(coeff_base[i][ctx_ls[i]], 1);
+          cost += coeff_costs->base_cost[i][ctx_ls[i]][1];
           continue;
         }
-        cost += av1_cost_bit(coeff_base[i][ctx_ls[i]], 0);
+        cost += coeff_costs->base_cost[i][ctx_ls[i]][0];
       }
 
       if (level > NUM_BASE_LEVELS) {
-        int idx;
         int ctx;
-
         ctx = get_br_ctx(qcoeff, scan[c], bwl, height);
+#if BR_NODE
+        int base_range = level - 1 - NUM_BASE_LEVELS;
+        if (base_range < COEFF_BASE_RANGE) {
+          cost += coeff_costs->lps_cost[ctx][base_range];
+        } else {
+          cost += coeff_costs->lps_cost[ctx][COEFF_BASE_RANGE];
+        }
 
-        for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) {
+#else
+        for (int idx = 0; idx < COEFF_BASE_RANGE; ++idx) {
           if (level == (idx + 1 + NUM_BASE_LEVELS)) {
-            cost +=
-                av1_cost_bit(xd->fc->coeff_lps[txs_ctx][plane_type][ctx], 1);
+            cost += coeff_costs->lps_cost[ctx][1];
             break;
           }
-          cost += av1_cost_bit(xd->fc->coeff_lps[txs_ctx][plane_type][ctx], 0);
+          cost += coeff_costs->lps_cost[ctx][0];
         }
-
-        if (idx >= COEFF_BASE_RANGE) {
+#endif
+        if (level >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
           // residual cost
           int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
           int ri = r;
@@ -396,12 +805,6 @@ int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
             cost += av1_cost_bit(128, (r >> ri) & 0x01);
         }
       }
-
-      if (c < seg_eob) {
-        int eob_ctx = get_eob_ctx(qcoeff, scan[c], txs_ctx);
-        cost += av1_cost_bit(xd->fc->eob_flag[txs_ctx][plane_type][eob_ctx],
-                             c == (eob - 1));
-      }
     }
   }
 
@@ -413,118 +816,17 @@ static INLINE int has_base(tran_low_t qc, int base_idx) {
   return abs(qc) >= level;
 }
 
-static void gen_base_count_mag_arr(int (*base_count_arr)[MAX_TX_SQUARE],
-                                   int (*base_mag_arr)[2],
-                                   const tran_low_t *qcoeff, int stride,
-                                   int height, int eob, const int16_t *scan) {
-  for (int c = 0; c < eob; ++c) {
-    const int coeff_idx = scan[c];  // raster order
-    if (!has_base(qcoeff[coeff_idx], 0)) continue;
-    const int row = coeff_idx / stride;
-    const int col = coeff_idx % stride;
-    int *mag = base_mag_arr[coeff_idx];
-    get_mag(mag, qcoeff, stride, height, row, col, base_ref_offset,
-            BASE_CONTEXT_POSITION_NUM);
-    for (int i = 0; i < NUM_BASE_LEVELS; ++i) {
-      if (!has_base(qcoeff[coeff_idx], i)) continue;
-      int *count = base_count_arr[i] + coeff_idx;
-      *count = get_level_count(qcoeff, stride, height, row, col, i,
-                               base_ref_offset, BASE_CONTEXT_POSITION_NUM);
-    }
-  }
-}
-
-static void gen_nz_count_arr(int(*nz_count_arr), const tran_low_t *qcoeff,
-                             int stride, int height, int eob,
-                             const SCAN_ORDER *scan_order) {
-  const int16_t *scan = scan_order->scan;
-  const int16_t *iscan = scan_order->iscan;
-  for (int c = 0; c < eob; ++c) {
-    const int coeff_idx = scan[c];  // raster order
-    const int row = coeff_idx / stride;
-    const int col = coeff_idx % stride;
-    nz_count_arr[coeff_idx] =
-        get_nz_count(qcoeff, stride, height, row, col, iscan);
-  }
-}
-
-static void gen_nz_ctx_arr(int (*nz_ctx_arr)[2], int(*nz_count_arr),
-                           const tran_low_t *qcoeff, int bwl, int eob,
-                           const SCAN_ORDER *scan_order) {
-  const int16_t *scan = scan_order->scan;
-  const int16_t *iscan = scan_order->iscan;
-  for (int c = 0; c < eob; ++c) {
-    const int coeff_idx = scan[c];  // raster order
-    const int count = nz_count_arr[coeff_idx];
-    nz_ctx_arr[coeff_idx][0] =
-        get_nz_map_ctx_from_count(count, qcoeff, coeff_idx, bwl, iscan);
-  }
-}
-
-static void gen_base_ctx_arr(int (*base_ctx_arr)[MAX_TX_SQUARE][2],
-                             int (*base_count_arr)[MAX_TX_SQUARE],
-                             int (*base_mag_arr)[2], const tran_low_t *qcoeff,
-                             int stride, int eob, const int16_t *scan) {
-  (void)qcoeff;
-  for (int i = 0; i < NUM_BASE_LEVELS; ++i) {
-    for (int c = 0; c < eob; ++c) {
-      const int coeff_idx = scan[c];  // raster order
-      if (!has_base(qcoeff[coeff_idx], i)) continue;
-      const int row = coeff_idx / stride;
-      const int col = coeff_idx % stride;
-      const int count = base_count_arr[i][coeff_idx];
-      const int *mag = base_mag_arr[coeff_idx];
-      const int level = i + 1;
-      base_ctx_arr[i][coeff_idx][0] =
-          get_base_ctx_from_count_mag(row, col, count, mag[0], level);
-    }
-  }
-}
-
 static INLINE int has_br(tran_low_t qc) {
   return abs(qc) >= 1 + NUM_BASE_LEVELS;
 }
 
-static void gen_br_count_mag_arr(int *br_count_arr, int (*br_mag_arr)[2],
-                                 const tran_low_t *qcoeff, int stride,
-                                 int height, int eob, const int16_t *scan) {
-  for (int c = 0; c < eob; ++c) {
-    const int coeff_idx = scan[c];  // raster order
-    if (!has_br(qcoeff[coeff_idx])) continue;
-    const int row = coeff_idx / stride;
-    const int col = coeff_idx % stride;
-    int *count = br_count_arr + coeff_idx;
-    int *mag = br_mag_arr[coeff_idx];
-    *count = get_level_count(qcoeff, stride, height, row, col, NUM_BASE_LEVELS,
-                             br_ref_offset, BR_CONTEXT_POSITION_NUM);
-    get_mag(mag, qcoeff, stride, height, row, col, br_ref_offset,
-            BR_CONTEXT_POSITION_NUM);
-  }
-}
-
-static void gen_br_ctx_arr(int (*br_ctx_arr)[2], const int *br_count_arr,
-                           int (*br_mag_arr)[2], const tran_low_t *qcoeff,
-                           int stride, int eob, const int16_t *scan) {
-  (void)qcoeff;
-  for (int c = 0; c < eob; ++c) {
-    const int coeff_idx = scan[c];  // raster order
-    if (!has_br(qcoeff[coeff_idx])) continue;
-    const int row = coeff_idx / stride;
-    const int col = coeff_idx % stride;
-    const int count = br_count_arr[coeff_idx];
-    const int *mag = br_mag_arr[coeff_idx];
-    br_ctx_arr[coeff_idx][0] =
-        get_br_ctx_from_count_mag(row, col, count, mag[0]);
-  }
-}
-
 static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx,
-                                    const aom_prob *dc_sign_prob,
+                                    const int (*dc_sign_cost)[2],
                                     int dc_sign_ctx) {
   const int sign = (qc < 0) ? 1 : 0;
   // sign bit cost
   if (coeff_idx == 0) {
-    return av1_cost_bit(dc_sign_prob[dc_sign_ctx], sign);
+    return dc_sign_cost[dc_sign_ctx][sign];
   } else {
     return av1_cost_bit(128, sign);
   }
@@ -547,42 +849,80 @@ static INLINE int get_golomb_cost(int abs_qc) {
   }
 }
 
-// TODO(angiebird): add static once this function is called
 void gen_txb_cache(TxbCache *txb_cache, TxbInfo *txb_info) {
+  // One pass over scan positions [0, eob): fills the nz, base and br caches.
   const int16_t *scan = txb_info->scan_order->scan;
-  gen_nz_count_arr(txb_cache->nz_count_arr, txb_info->qcoeff, txb_info->stride,
-                   txb_info->height, txb_info->eob, txb_info->scan_order);
-  gen_nz_ctx_arr(txb_cache->nz_ctx_arr, txb_cache->nz_count_arr,
-                 txb_info->qcoeff, txb_info->bwl, txb_info->eob,
-                 txb_info->scan_order);
-  gen_base_count_mag_arr(txb_cache->base_count_arr, txb_cache->base_mag_arr,
-                         txb_info->qcoeff, txb_info->stride, txb_info->height,
-                         txb_info->eob, scan);
-  gen_base_ctx_arr(txb_cache->base_ctx_arr, txb_cache->base_count_arr,
-                   txb_cache->base_mag_arr, txb_info->qcoeff, txb_info->stride,
-                   txb_info->eob, scan);
-  gen_br_count_mag_arr(txb_cache->br_count_arr, txb_cache->br_mag_arr,
-                       txb_info->qcoeff, txb_info->stride, txb_info->height,
-                       txb_info->eob, scan);
-  gen_br_ctx_arr(txb_cache->br_ctx_arr, txb_cache->br_count_arr,
-                 txb_cache->br_mag_arr, txb_info->qcoeff, txb_info->stride,
-                 txb_info->eob, scan);
+  const int bwl = txb_info->bwl;
+  const int height = txb_info->height;
+  tran_low_t *qcoeff = txb_info->qcoeff;
+  const BASE_CTX_TABLE *base_ctx_table =
+      txb_info->coeff_ctx_table->base_ctx_table;
+  for (int c = 0; c < txb_info->eob; ++c) {
+    const int coeff_idx = scan[c];  // raster order
+    const int row = coeff_idx >> bwl;
+    const int col = coeff_idx - (row << bwl);
+#if REDUCE_CONTEXT_DEPENDENCY
+    int prev_coeff_idx;
+    int prev_row;
+    int prev_col;
+    if (c > MIN_SCAN_IDX_REDUCE_CONTEXT_DEPENDENCY) {
+      prev_coeff_idx = scan[c - 1];  // raster order
+      prev_row = prev_coeff_idx >> bwl;
+      prev_col = prev_coeff_idx - (prev_row << bwl);
+    } else {
+      prev_coeff_idx = -1;
+      prev_row = -1;
+      prev_col = -1;
+    }
+    txb_cache->nz_count_arr[coeff_idx] =
+        get_nz_count(qcoeff, bwl, height, row, col, prev_row, prev_col);
+#else
+    txb_cache->nz_count_arr[coeff_idx] =
+        get_nz_count(qcoeff, bwl, height, row, col);
+#endif
+    const int nz_count = txb_cache->nz_count_arr[coeff_idx];
+    txb_cache->nz_ctx_arr[coeff_idx] =
+        get_nz_map_ctx_from_count(nz_count, coeff_idx, bwl, txb_info->tx_type);
+
+    // Base levels: nothing more to cache when has_base(qc, 0) is false.
+    if (!has_base(qcoeff[coeff_idx], 0)) continue;
+    int *base_mag = txb_cache->base_mag_arr[coeff_idx];
+    int count[NUM_BASE_LEVELS];
+    get_base_count_mag(base_mag, count, qcoeff, bwl, height, row, col);
+
+    for (int i = 0; i < NUM_BASE_LEVELS; ++i) {
+      if (!has_base(qcoeff[coeff_idx], i)) break;
+      txb_cache->base_count_arr[i][coeff_idx] = count[i];
+      const int level = i + 1;
+      txb_cache->base_ctx_arr[i][coeff_idx] =
+          base_ctx_table[row != 0][col != 0][base_mag[0] > level][count[i]];
+    }
+
+    // Base range: br count/mag/ctx, only for coeffs where has_br() holds.
+    if (!has_br(qcoeff[coeff_idx])) continue;
+    int *br_count = txb_cache->br_count_arr + coeff_idx;
+    int *br_mag = txb_cache->br_mag_arr[coeff_idx];
+    *br_count = get_br_count_mag(br_mag, qcoeff, bwl, height, row, col,
+                                 NUM_BASE_LEVELS);
+    txb_cache->br_ctx_arr[coeff_idx] =
+        get_br_ctx_from_count_mag(row, col, *br_count, br_mag[0]);
+  }
 }
 
-static INLINE aom_prob get_level_prob(int level, int coeff_idx,
-                                      const TxbCache *txb_cache,
-                                      const TxbProbs *txb_probs) {
+static INLINE const int *get_level_prob(int level, int coeff_idx,
+                                        const TxbCache *txb_cache,
+                                        const LV_MAP_COEFF_COST *txb_costs) {
   if (level == 0) {
-    const int ctx = txb_cache->nz_ctx_arr[coeff_idx][0];
-    return txb_probs->nz_map[ctx];
+    const int ctx = txb_cache->nz_ctx_arr[coeff_idx];
+    return txb_costs->nz_map_cost[ctx];
   } else if (level >= 1 && level < 1 + NUM_BASE_LEVELS) {
     const int idx = level - 1;
-    const int ctx = txb_cache->base_ctx_arr[idx][coeff_idx][0];
-    return txb_probs->coeff_base[idx][ctx];
+    const int ctx = txb_cache->base_ctx_arr[idx][coeff_idx];
+    return txb_costs->base_cost[idx][ctx];
   } else if (level >= 1 + NUM_BASE_LEVELS &&
              level < 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
-    const int ctx = txb_cache->br_ctx_arr[coeff_idx][0];
-    return txb_probs->coeff_lps[ctx];
+    const int ctx = txb_cache->br_ctx_arr[coeff_idx];
+    return txb_costs->lps_cost[ctx];
   } else if (level >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
     printf("get_level_prob does not support golomb\n");
     assert(0);
@@ -657,7 +997,7 @@ static int neighbor_level_down_update(int *new_count, int *new_mag, int count,
 
 static int try_neighbor_level_down_br(int coeff_idx, int nb_coeff_idx,
                                       const TxbCache *txb_cache,
-                                      const TxbProbs *txb_probs,
+                                      const LV_MAP_COEFF_COST *txb_costs,
                                       const TxbInfo *txb_info) {
   const tran_low_t qc = txb_info->qcoeff[coeff_idx];
   const tran_low_t abs_qc = abs(qc);
@@ -676,11 +1016,12 @@ static int try_neighbor_level_down_br(int coeff_idx, int nb_coeff_idx,
   if (update) {
     const int row = coeff_idx >> txb_info->bwl;
     const int col = coeff_idx - (row << txb_info->bwl);
-    const int ctx = txb_cache->br_ctx_arr[coeff_idx][0];
-    const int org_cost = get_br_cost(abs_qc, ctx, txb_probs->coeff_lps);
+    const int ctx = txb_cache->br_ctx_arr[coeff_idx];
+    const int org_cost = get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]);
 
     const int new_ctx = get_br_ctx_from_count_mag(row, col, new_count, new_mag);
-    const int new_cost = get_br_cost(abs_qc, new_ctx, txb_probs->coeff_lps);
+    const int new_cost =
+        get_br_cost(abs_qc, new_ctx, txb_costs->lps_cost[new_ctx]);
     const int cost_diff = -org_cost + new_cost;
     return cost_diff;
   } else {
@@ -690,10 +1031,12 @@ static int try_neighbor_level_down_br(int coeff_idx, int nb_coeff_idx,
 
 static int try_neighbor_level_down_base(int coeff_idx, int nb_coeff_idx,
                                         const TxbCache *txb_cache,
-                                        const TxbProbs *txb_probs,
+                                        const LV_MAP_COEFF_COST *txb_costs,
                                         const TxbInfo *txb_info) {
   const tran_low_t qc = txb_info->qcoeff[coeff_idx];
   const tran_low_t abs_qc = abs(qc);
+  const BASE_CTX_TABLE *base_ctx_table =
+      txb_info->coeff_ctx_table->base_ctx_table;
 
   int cost_diff = 0;
   for (int base_idx = 0; base_idx < NUM_BASE_LEVELS; ++base_idx) {
@@ -713,14 +1056,14 @@ static int try_neighbor_level_down_base(int coeff_idx, int nb_coeff_idx,
     if (update) {
       const int row = coeff_idx >> txb_info->bwl;
       const int col = coeff_idx - (row << txb_info->bwl);
-      const int ctx = txb_cache->base_ctx_arr[base_idx][coeff_idx][0];
-      const int org_cost =
-          get_base_cost(abs_qc, ctx, txb_probs->coeff_base, base_idx);
+      const int ctx = txb_cache->base_ctx_arr[base_idx][coeff_idx];
+      const int org_cost = get_base_cost(
+          abs_qc, ctx, txb_costs->base_cost[base_idx][ctx], base_idx);
 
       const int new_ctx =
-          get_base_ctx_from_count_mag(row, col, new_count, new_mag, level);
-      const int new_cost =
-          get_base_cost(abs_qc, new_ctx, txb_probs->coeff_base, base_idx);
+          base_ctx_table[row != 0][col != 0][new_mag > level][new_count];
+      const int new_cost = get_base_cost(
+          abs_qc, new_ctx, txb_costs->base_cost[base_idx][new_ctx], base_idx);
       cost_diff += -org_cost + new_cost;
     }
   }
@@ -729,7 +1072,7 @@ static int try_neighbor_level_down_base(int coeff_idx, int nb_coeff_idx,
 
 static int try_neighbor_level_down_nz(int coeff_idx, int nb_coeff_idx,
                                       const TxbCache *txb_cache,
-                                      const TxbProbs *txb_probs,
+                                      const LV_MAP_COEFF_COST *txb_costs,
                                       TxbInfo *txb_info) {
   // assume eob doesn't change
   const tran_low_t qc = txb_info->qcoeff[coeff_idx];
@@ -746,12 +1089,12 @@ static int try_neighbor_level_down_nz(int coeff_idx, int nb_coeff_idx,
     assert(count > 0);
     txb_info->qcoeff[nb_coeff_idx] = get_lower_coeff(nb_coeff);
     const int new_ctx = get_nz_map_ctx_from_count(
-        count - 1, txb_info->qcoeff, coeff_idx, txb_info->bwl, iscan);
+        count - 1, coeff_idx, txb_info->bwl, txb_info->tx_type);
     txb_info->qcoeff[nb_coeff_idx] = nb_coeff;
-    const int ctx = txb_cache->nz_ctx_arr[coeff_idx][0];
+    const int ctx = txb_cache->nz_ctx_arr[coeff_idx];
     const int is_nz = abs_qc > 0;
-    const int org_cost = av1_cost_bit(txb_probs->nz_map[ctx], is_nz);
-    const int new_cost = av1_cost_bit(txb_probs->nz_map[new_ctx], is_nz);
+    const int org_cost = txb_costs->nz_map_cost[ctx][is_nz];
+    const int new_cost = txb_costs->nz_map_cost[new_ctx][is_nz];
     const int cost_diff = new_cost - org_cost;
     return cost_diff;
   } else {
@@ -761,7 +1104,8 @@ static int try_neighbor_level_down_nz(int coeff_idx, int nb_coeff_idx,
 
 static int try_self_level_down(tran_low_t *low_coeff, int coeff_idx,
                                const TxbCache *txb_cache,
-                               const TxbProbs *txb_probs, TxbInfo *txb_info) {
+                               const LV_MAP_COEFF_COST *txb_costs,
+                               TxbInfo *txb_info) {
   const tran_low_t qc = txb_info->qcoeff[coeff_idx];
   if (qc == 0) {
     *low_coeff = 0;
@@ -772,44 +1116,68 @@ static int try_self_level_down(tran_low_t *low_coeff, int coeff_idx,
   int cost_diff;
   if (*low_coeff == 0) {
     const int scan_idx = txb_info->scan_order->iscan[coeff_idx];
-    const aom_prob level_prob =
-        get_level_prob(abs_qc, coeff_idx, txb_cache, txb_probs);
-    const aom_prob low_level_prob =
-        get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_probs);
+    const int *level_cost =
+        get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs);
+    const int *low_level_cost =
+        get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs);
     if (scan_idx < txb_info->seg_eob) {
       // When level-0, we code the binary of abs_qc > level
       // but when level-k k > 0 we code the binary of abs_qc == level
       // That's why wee need this special treatment for level-0 map
       // TODO(angiebird): make leve-0 consistent to other levels
-      cost_diff = -av1_cost_bit(level_prob, 1) +
-                  av1_cost_bit(low_level_prob, 0) -
-                  av1_cost_bit(low_level_prob, 1);
+      cost_diff = -level_cost[1] + low_level_cost[0] - low_level_cost[1];
     } else {
-      cost_diff = -av1_cost_bit(level_prob, 1);
+      cost_diff = -level_cost[1];
     }
 
     if (scan_idx < txb_info->seg_eob) {
-      const int eob_ctx =
-          get_eob_ctx(txb_info->qcoeff, coeff_idx, txb_info->txs_ctx);
-      cost_diff -= av1_cost_bit(txb_probs->eob_flag[eob_ctx],
-                                scan_idx == (txb_info->eob - 1));
+      const int eob_ctx = get_eob_ctx(txb_info->qcoeff, coeff_idx,
+                                      txb_info->txs_ctx, txb_info->tx_type);
+      cost_diff -=
+          txb_costs->eob_cost[eob_ctx][scan_idx == (txb_info->eob - 1)];
     }
 
     const int sign_cost = get_sign_bit_cost(
-        qc, coeff_idx, txb_probs->dc_sign_prob, txb_info->txb_ctx->dc_sign_ctx);
+        qc, coeff_idx, txb_costs->dc_sign_cost, txb_info->txb_ctx->dc_sign_ctx);
     cost_diff -= sign_cost;
+  } else if (abs_qc <= NUM_BASE_LEVELS) {
+    const int *level_cost =
+        get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs);
+    const int *low_level_cost =
+        get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs);
+    cost_diff = -level_cost[1] + low_level_cost[1] - low_level_cost[0];
+  } else if (abs_qc == NUM_BASE_LEVELS + 1) {
+    const int *level_cost =
+        get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs);
+    const int *low_level_cost =
+        get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs);
+#if BR_NODE
+    cost_diff = -level_cost[0] + low_level_cost[1] - low_level_cost[0];
+#else
+    cost_diff = -level_cost[1] + low_level_cost[1] - low_level_cost[0];
+#endif
   } else if (abs_qc < 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
-    const aom_prob level_prob =
-        get_level_prob(abs_qc, coeff_idx, txb_cache, txb_probs);
-    const aom_prob low_level_prob =
-        get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_probs);
-    cost_diff = -av1_cost_bit(level_prob, 1) + av1_cost_bit(low_level_prob, 1) -
-                av1_cost_bit(low_level_prob, 0);
+    const int *level_cost =
+        get_level_prob(abs_qc, coeff_idx, txb_cache, txb_costs);
+    const int *low_level_cost =
+        get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs);
+
+#if BR_NODE
+    cost_diff = -level_cost[abs_qc - 1 - NUM_BASE_LEVELS] +
+                low_level_cost[abs(*low_coeff) - 1 - NUM_BASE_LEVELS];
+#else
+    cost_diff = -level_cost[1] + low_level_cost[1] - low_level_cost[0];
+#endif
   } else if (abs_qc == 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
-    const aom_prob low_level_prob =
-        get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_probs);
-    cost_diff = -get_golomb_cost(abs_qc) + av1_cost_bit(low_level_prob, 1) -
-                av1_cost_bit(low_level_prob, 0);
+    const int *low_level_cost =
+        get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_costs);
+#if BR_NODE
+    cost_diff = -get_golomb_cost(abs_qc) - low_level_cost[COEFF_BASE_RANGE] +
+                low_level_cost[COEFF_BASE_RANGE - 1];
+#else
+    cost_diff =
+        -get_golomb_cost(abs_qc) + low_level_cost[1] - low_level_cost[0];
+#endif
   } else {
     assert(abs_qc > 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE);
     const tran_low_t abs_low_coeff = abs(*low_coeff);
@@ -831,10 +1199,26 @@ static INLINE int check_br_neighbor(tran_low_t qc) {
   return abs(qc) > BR_MAG_OFFSET;
 }
 
+#define FAST_OPTIMIZE_TXB 1  // when 0, try_level_down() ignores fast_mode
+
+#if FAST_OPTIMIZE_TXB
+#define ALNB_REF_OFFSET_NUM 2  // fast_mode table for the nz-neighbor pass
+static int alnb_ref_offset[ALNB_REF_OFFSET_NUM][2] = {
+  { -1, 0 }, { 0, -1 },  // {row, col}, subtracted from the coeff position
+};
+#define NB_REF_OFFSET_NUM 4  // fast_mode table for the base and br passes
+static int nb_ref_offset[NB_REF_OFFSET_NUM][2] = {
+  { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 },  // the four adjacent positions
+};
+#endif  // FAST_OPTIMIZE_TXB
+
 // TODO(angiebird): add static to this function once it's called
 int try_level_down(int coeff_idx, const TxbCache *txb_cache,
-                   const TxbProbs *txb_probs, TxbInfo *txb_info,
-                   int (*cost_map)[COST_MAP_SIZE]) {
+                   const LV_MAP_COEFF_COST *txb_costs, TxbInfo *txb_info,
+                   int (*cost_map)[COST_MAP_SIZE], int fast_mode) {
+#if !FAST_OPTIMIZE_TXB
+  (void)fast_mode;
+#endif
   if (cost_map) {
     for (int i = 0; i < COST_MAP_SIZE; ++i) av1_zero(cost_map[i]);
   }
@@ -849,7 +1233,7 @@ int try_level_down(int coeff_idx, const TxbCache *txb_cache,
   const int scan_idx = iscan[coeff_idx];
   if (scan_idx < eob) {
     const int cost_diff = try_self_level_down(&low_coeff, coeff_idx, txb_cache,
-                                              txb_probs, txb_info);
+                                              txb_costs, txb_info);
     if (cost_map)
       cost_map[0 + COST_MAP_OFFSET][0 + COST_MAP_OFFSET] = cost_diff;
     accu_cost_diff += cost_diff;
@@ -858,19 +1242,33 @@ int try_level_down(int coeff_idx, const TxbCache *txb_cache,
   const int row = coeff_idx >> txb_info->bwl;
   const int col = coeff_idx - (row << txb_info->bwl);
   if (check_nz_neighbor(qc)) {
-    for (int i = 0; i < SIG_REF_OFFSET_NUM; ++i) {
-      const int nb_row = row - sig_ref_offset[i][0];
-      const int nb_col = col - sig_ref_offset[i][1];
+#if FAST_OPTIMIZE_TXB
+    int(*ref_offset)[2];
+    int ref_num;
+    if (fast_mode) {
+      ref_offset = alnb_ref_offset;
+      ref_num = ALNB_REF_OFFSET_NUM;
+    } else {
+      ref_offset = sig_ref_offset;
+      ref_num = SIG_REF_OFFSET_NUM;
+    }
+#else
+    int(*ref_offset)[2] = sig_ref_offset;
+    const int ref_num = SIG_REF_OFFSET_NUM;
+#endif
+    for (int i = 0; i < ref_num; ++i) {
+      const int nb_row = row - ref_offset[i][0];
+      const int nb_col = col - ref_offset[i][1];
       const int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
 
-      if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height &&
-            nb_col < txb_info->stride))
+      if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height ||
+          nb_col >= txb_info->stride)
         continue;
 
       const int nb_scan_idx = iscan[nb_coeff_idx];
       if (nb_scan_idx < eob) {
         const int cost_diff = try_neighbor_level_down_nz(
-            nb_coeff_idx, coeff_idx, txb_cache, txb_probs, txb_info);
+            nb_coeff_idx, coeff_idx, txb_cache, txb_costs, txb_info);
         if (cost_map)
           cost_map[nb_row - row + COST_MAP_OFFSET]
                   [nb_col - col + COST_MAP_OFFSET] += cost_diff;
@@ -880,19 +1278,33 @@ int try_level_down(int coeff_idx, const TxbCache *txb_cache,
   }
 
   if (check_base_neighbor(qc)) {
-    for (int i = 0; i < BASE_CONTEXT_POSITION_NUM; ++i) {
-      const int nb_row = row - base_ref_offset[i][0];
-      const int nb_col = col - base_ref_offset[i][1];
+#if FAST_OPTIMIZE_TXB
+    int(*ref_offset)[2];
+    int ref_num;
+    if (fast_mode) {
+      ref_offset = nb_ref_offset;
+      ref_num = NB_REF_OFFSET_NUM;
+    } else {
+      ref_offset = base_ref_offset;
+      ref_num = BASE_CONTEXT_POSITION_NUM;
+    }
+#else
+    int(*ref_offset)[2] = base_ref_offset;
+    int ref_num = BASE_CONTEXT_POSITION_NUM;
+#endif
+    for (int i = 0; i < ref_num; ++i) {
+      const int nb_row = row - ref_offset[i][0];
+      const int nb_col = col - ref_offset[i][1];
       const int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
 
-      if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height &&
-            nb_col < txb_info->stride))
+      if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height ||
+          nb_col >= txb_info->stride)
         continue;
 
       const int nb_scan_idx = iscan[nb_coeff_idx];
       if (nb_scan_idx < eob) {
         const int cost_diff = try_neighbor_level_down_base(
-            nb_coeff_idx, coeff_idx, txb_cache, txb_probs, txb_info);
+            nb_coeff_idx, coeff_idx, txb_cache, txb_costs, txb_info);
         if (cost_map)
           cost_map[nb_row - row + COST_MAP_OFFSET]
                   [nb_col - col + COST_MAP_OFFSET] += cost_diff;
@@ -902,19 +1314,33 @@ int try_level_down(int coeff_idx, const TxbCache *txb_cache,
   }
 
   if (check_br_neighbor(qc)) {
-    for (int i = 0; i < BR_CONTEXT_POSITION_NUM; ++i) {
-      const int nb_row = row - br_ref_offset[i][0];
-      const int nb_col = col - br_ref_offset[i][1];
+#if FAST_OPTIMIZE_TXB
+    int(*ref_offset)[2];
+    int ref_num;
+    if (fast_mode) {
+      ref_offset = nb_ref_offset;
+      ref_num = NB_REF_OFFSET_NUM;
+    } else {
+      ref_offset = br_ref_offset;
+      ref_num = BR_CONTEXT_POSITION_NUM;
+    }
+#else
+    int(*ref_offset)[2] = br_ref_offset;
+    const int ref_num = BR_CONTEXT_POSITION_NUM;
+#endif
+    for (int i = 0; i < ref_num; ++i) {
+      const int nb_row = row - ref_offset[i][0];
+      const int nb_col = col - ref_offset[i][1];
       const int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
 
-      if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height &&
-            nb_col < txb_info->stride))
+      if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height ||
+          nb_col >= txb_info->stride)
         continue;
 
       const int nb_scan_idx = iscan[nb_coeff_idx];
       if (nb_scan_idx < eob) {
         const int cost_diff = try_neighbor_level_down_br(
-            nb_coeff_idx, coeff_idx, txb_cache, txb_probs, txb_info);
+            nb_coeff_idx, coeff_idx, txb_cache, txb_costs, txb_info);
         if (cost_map)
           cost_map[nb_row - row + COST_MAP_OFFSET]
                   [nb_col - col + COST_MAP_OFFSET] += cost_diff;
@@ -927,7 +1353,7 @@ int try_level_down(int coeff_idx, const TxbCache *txb_cache,
 }
 
 static int get_low_coeff_cost(int coeff_idx, const TxbCache *txb_cache,
-                              const TxbProbs *txb_probs,
+                              const LV_MAP_COEFF_COST *txb_costs,
                               const TxbInfo *txb_info) {
   const tran_low_t qc = txb_info->qcoeff[coeff_idx];
   const int abs_qc = abs(qc);
@@ -935,22 +1361,21 @@ static int get_low_coeff_cost(int coeff_idx, const TxbCache *txb_cache,
   int cost = 0;
   const int scan_idx = txb_info->scan_order->iscan[coeff_idx];
   if (scan_idx < txb_info->seg_eob) {
-    const aom_prob level_prob =
-        get_level_prob(0, coeff_idx, txb_cache, txb_probs);
-    cost += av1_cost_bit(level_prob, qc != 0);
+    const int *level_cost = get_level_prob(0, coeff_idx, txb_cache, txb_costs);
+    cost += level_cost[qc != 0];
   }
 
   if (qc != 0) {
     const int base_idx = 0;
-    const int ctx = txb_cache->base_ctx_arr[base_idx][coeff_idx][0];
-    cost += get_base_cost(abs_qc, ctx, txb_probs->coeff_base, base_idx);
+    const int ctx = txb_cache->base_ctx_arr[base_idx][coeff_idx];
+    cost += get_base_cost(abs_qc, ctx, txb_costs->base_cost[base_idx][ctx],
+                          base_idx);
     if (scan_idx < txb_info->seg_eob) {
-      const int eob_ctx =
-          get_eob_ctx(txb_info->qcoeff, coeff_idx, txb_info->txs_ctx);
-      cost += av1_cost_bit(txb_probs->eob_flag[eob_ctx],
-                           scan_idx == (txb_info->eob - 1));
+      const int eob_ctx = get_eob_ctx(txb_info->qcoeff, coeff_idx,
+                                      txb_info->txs_ctx, txb_info->tx_type);
+      cost += txb_costs->eob_cost[eob_ctx][scan_idx == (txb_info->eob - 1)];
     }
-    cost += get_sign_bit_cost(qc, coeff_idx, txb_probs->dc_sign_prob,
+    cost += get_sign_bit_cost(qc, coeff_idx, txb_costs->dc_sign_cost,
                               txb_info->txb_ctx->dc_sign_ctx);
   }
   return cost;
@@ -963,7 +1388,8 @@ static INLINE void set_eob(TxbInfo *txb_info, int eob) {
 
 // TODO(angiebird): add static to this function once it's called
 int try_change_eob(int *new_eob, int coeff_idx, const TxbCache *txb_cache,
-                   const TxbProbs *txb_probs, TxbInfo *txb_info) {
+                   const LV_MAP_COEFF_COST *txb_costs, TxbInfo *txb_info,
+                   int fast_mode) {
   assert(txb_info->eob > 0);
   const tran_low_t qc = txb_info->qcoeff[coeff_idx];
   const int abs_qc = abs(qc);
@@ -976,7 +1402,7 @@ int try_change_eob(int *new_eob, int coeff_idx, const TxbCache *txb_cache,
   const int scan_idx = iscan[coeff_idx];
   *new_eob = 0;
   int cost_diff = 0;
-  cost_diff -= get_low_coeff_cost(coeff_idx, txb_cache, txb_probs, txb_info);
+  cost_diff -= get_low_coeff_cost(coeff_idx, txb_cache, txb_costs, txb_info);
   // int coeff_cost =
   //     get_coeff_cost(qc, scan_idx, txb_info, txb_probs);
   // if (-cost_diff != coeff_cost) {
@@ -990,26 +1416,27 @@ int try_change_eob(int *new_eob, int coeff_idx, const TxbCache *txb_cache,
       *new_eob = si + 1;
       break;
     } else {
-      cost_diff -= get_low_coeff_cost(ci, txb_cache, txb_probs, txb_info);
+      cost_diff -= get_low_coeff_cost(ci, txb_cache, txb_costs, txb_info);
     }
   }
 
   const int org_eob = txb_info->eob;
   set_eob(txb_info, *new_eob);
-  cost_diff += try_level_down(coeff_idx, txb_cache, txb_probs, txb_info, NULL);
+  cost_diff += try_level_down(coeff_idx, txb_cache, txb_costs, txb_info, NULL,
+                              fast_mode);
   set_eob(txb_info, org_eob);
 
   if (*new_eob > 0) {
     // Note that get_eob_ctx does NOT actually account for qcoeff, so we don't
     // need to lower down the qcoeff here
-    const int eob_ctx =
-        get_eob_ctx(txb_info->qcoeff, scan[*new_eob - 1], txb_info->txs_ctx);
-    cost_diff -= av1_cost_bit(txb_probs->eob_flag[eob_ctx], 0);
-    cost_diff += av1_cost_bit(txb_probs->eob_flag[eob_ctx], 1);
+    const int eob_ctx = get_eob_ctx(txb_info->qcoeff, scan[*new_eob - 1],
+                                    txb_info->txs_ctx, txb_info->tx_type);
+    cost_diff -= txb_costs->eob_cost[eob_ctx][0];
+    cost_diff += txb_costs->eob_cost[eob_ctx][1];
   } else {
     const int txb_skip_ctx = txb_info->txb_ctx->txb_skip_ctx;
-    cost_diff -= av1_cost_bit(txb_probs->txb_skip[txb_skip_ctx], 0);
-    cost_diff += av1_cost_bit(txb_probs->txb_skip[txb_skip_ctx], 1);
+    cost_diff -= txb_costs->txb_skip_cost[txb_skip_ctx][0];
+    cost_diff += txb_costs->txb_skip_cost[txb_skip_ctx][1];
   }
   return cost_diff;
 }
@@ -1053,17 +1480,19 @@ void update_level_down(int coeff_idx, TxbCache *txb_cache, TxbInfo *txb_info) {
           assert(txb_cache->nz_count_arr[nb_coeff_idx] >= 0);
         }
         const int count = txb_cache->nz_count_arr[nb_coeff_idx];
-        txb_cache->nz_ctx_arr[nb_coeff_idx][0] = get_nz_map_ctx_from_count(
-            count, txb_info->qcoeff, nb_coeff_idx, txb_info->bwl, iscan);
+        txb_cache->nz_ctx_arr[nb_coeff_idx] = get_nz_map_ctx_from_count(
+            count, nb_coeff_idx, txb_info->bwl, txb_info->tx_type);
         // int ref_ctx = get_nz_map_ctx(txb_info->qcoeff, nb_coeff_idx,
-        // txb_info->bwl, iscan);
-        // if (ref_ctx != txb_cache->nz_ctx_arr[nb_coeff_idx][0])
+        // txb_info->bwl, tx_type);
+        // if (ref_ctx != txb_cache->nz_ctx_arr[nb_coeff_idx])
         //   printf("nz ctx %d ref_ctx %d\n",
-        //   txb_cache->nz_ctx_arr[nb_coeff_idx][0], ref_ctx);
+        //   txb_cache->nz_ctx_arr[nb_coeff_idx], ref_ctx);
       }
     }
   }
 
+  const BASE_CTX_TABLE *base_ctx_table =
+      txb_info->coeff_ctx_table->base_ctx_table;
   for (int i = 0; i < BASE_CONTEXT_POSITION_NUM; ++i) {
     const int nb_row = row - base_ref_offset[i][0];
     const int nb_col = col - base_ref_offset[i][1];
@@ -1089,13 +1518,13 @@ void update_level_down(int coeff_idx, TxbCache *txb_cache, TxbInfo *txb_info) {
           assert(txb_cache->base_count_arr[base_idx][nb_coeff_idx] >= 0);
         }
         const int count = txb_cache->base_count_arr[base_idx][nb_coeff_idx];
-        txb_cache->base_ctx_arr[base_idx][nb_coeff_idx][0] =
-            get_base_ctx_from_count_mag(nb_row, nb_col, count, mag, level);
+        txb_cache->base_ctx_arr[base_idx][nb_coeff_idx] =
+            base_ctx_table[nb_row != 0][nb_col != 0][mag > level][count];
         // int ref_ctx = get_base_ctx(txb_info->qcoeff, nb_coeff_idx,
         // txb_info->bwl, level);
-        // if (ref_ctx != txb_cache->base_ctx_arr[base_idx][nb_coeff_idx][0]) {
+        // if (ref_ctx != txb_cache->base_ctx_arr[base_idx][nb_coeff_idx]) {
         //   printf("base ctx %d ref_ctx %d\n",
-        //   txb_cache->base_ctx_arr[base_idx][nb_coeff_idx][0], ref_ctx);
+        //   txb_cache->base_ctx_arr[base_idx][nb_coeff_idx], ref_ctx);
         // }
       }
     }
@@ -1123,35 +1552,35 @@ void update_level_down(int coeff_idx, TxbCache *txb_cache, TxbInfo *txb_info) {
         update_mag_arr(txb_cache->br_mag_arr[nb_coeff_idx], abs_qc);
       const int count = txb_cache->br_count_arr[nb_coeff_idx];
       const int mag = get_mag_from_mag_arr(txb_cache->br_mag_arr[nb_coeff_idx]);
-      txb_cache->br_ctx_arr[nb_coeff_idx][0] =
+      txb_cache->br_ctx_arr[nb_coeff_idx] =
           get_br_ctx_from_count_mag(nb_row, nb_col, count, mag);
       // int ref_ctx = get_level_ctx(txb_info->qcoeff, nb_coeff_idx,
       // txb_info->bwl);
-      // if (ref_ctx != txb_cache->br_ctx_arr[nb_coeff_idx][0]) {
+      // if (ref_ctx != txb_cache->br_ctx_arr[nb_coeff_idx]) {
       //   printf("base ctx %d ref_ctx %d\n",
-      //   txb_cache->br_ctx_arr[nb_coeff_idx][0], ref_ctx);
+      //   txb_cache->br_ctx_arr[nb_coeff_idx], ref_ctx);
       // }
     }
   }
 }
 
 static int get_coeff_cost(tran_low_t qc, int scan_idx, TxbInfo *txb_info,
-                          const TxbProbs *txb_probs) {
+                          const LV_MAP_COEFF_COST *txb_costs) {
   const TXB_CTX *txb_ctx = txb_info->txb_ctx;
   const int is_nz = (qc != 0);
   const tran_low_t abs_qc = abs(qc);
   int cost = 0;
   const int16_t *scan = txb_info->scan_order->scan;
-  const int16_t *iscan = txb_info->scan_order->iscan;
 
   if (scan_idx < txb_info->seg_eob) {
-    int coeff_ctx = get_nz_map_ctx(txb_info->qcoeff, scan[scan_idx],
-                                   txb_info->bwl, txb_info->height, iscan);
-    cost += av1_cost_bit(txb_probs->nz_map[coeff_ctx], is_nz);
+    int coeff_ctx =
+        get_nz_map_ctx(txb_info->qcoeff, scan_idx, scan, txb_info->bwl,
+                       txb_info->height, txb_info->tx_type);
+    cost += txb_costs->nz_map_cost[coeff_ctx][is_nz];
   }
 
   if (is_nz) {
-    cost += get_sign_bit_cost(qc, scan_idx, txb_probs->dc_sign_prob,
+    cost += get_sign_bit_cost(qc, scan_idx, txb_costs->dc_sign_cost,
                               txb_ctx->dc_sign_ctx);
 
     int ctx_ls[NUM_BASE_LEVELS] = { 0 };
@@ -1160,21 +1589,21 @@ static int get_coeff_cost(tran_low_t qc, int scan_idx, TxbInfo *txb_info,
 
     int i;
     for (i = 0; i < NUM_BASE_LEVELS; ++i) {
-      cost += get_base_cost(abs_qc, ctx_ls[i], txb_probs->coeff_base, i);
+      cost += get_base_cost(abs_qc, ctx_ls[i],
+                            txb_costs->base_cost[i][ctx_ls[i]], i);
     }
 
     if (abs_qc > NUM_BASE_LEVELS) {
       int ctx = get_br_ctx(txb_info->qcoeff, scan[scan_idx], txb_info->bwl,
                            txb_info->height);
-      cost += get_br_cost(abs_qc, ctx, txb_probs->coeff_lps);
+      cost += get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]);
       cost += get_golomb_cost(abs_qc);
     }
 
     if (scan_idx < txb_info->seg_eob) {
-      int eob_ctx =
-          get_eob_ctx(txb_info->qcoeff, scan[scan_idx], txb_info->txs_ctx);
-      cost += av1_cost_bit(txb_probs->eob_flag[eob_ctx],
-                           scan_idx == (txb_info->eob - 1));
+      int eob_ctx = get_eob_ctx(txb_info->qcoeff, scan[scan_idx],
+                                txb_info->txs_ctx, txb_info->tx_type);
+      cost += txb_costs->eob_cost[eob_ctx][scan_idx == (txb_info->eob - 1)];
     }
   }
   return cost;
@@ -1188,7 +1617,7 @@ static int all_ref_offset[ALL_REF_OFFSET_NUM][2] = {
   { 1, 0 },  { 2, 0 },   { 0, 1 },  { 0, 2 },  { 1, 1 },
 };
 
-static int try_level_down_ref(int coeff_idx, const TxbProbs *txb_probs,
+static int try_level_down_ref(int coeff_idx, const LV_MAP_COEFF_COST *txb_costs,
                               TxbInfo *txb_info,
                               int (*cost_map)[COST_MAP_SIZE]) {
   if (cost_map) {
@@ -1205,9 +1634,9 @@ static int try_level_down_ref(int coeff_idx, const TxbProbs *txb_probs,
     int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
     int nb_scan_idx = txb_info->scan_order->iscan[nb_coeff_idx];
     if (nb_scan_idx < txb_info->eob && nb_row >= 0 && nb_col >= 0 &&
-        nb_row < txb_info->stride && nb_col < txb_info->stride) {
+        nb_row < txb_info->height && nb_col < txb_info->stride) {
       tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx];
-      int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_probs);
+      int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_costs);
       if (cost_map)
         cost_map[nb_row - row + COST_MAP_OFFSET]
                 [nb_col - col + COST_MAP_OFFSET] -= cost;
@@ -1222,9 +1651,9 @@ static int try_level_down_ref(int coeff_idx, const TxbProbs *txb_probs,
     int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
     int nb_scan_idx = txb_info->scan_order->iscan[nb_coeff_idx];
     if (nb_scan_idx < txb_info->eob && nb_row >= 0 && nb_col >= 0 &&
-        nb_row < txb_info->stride && nb_col < txb_info->stride) {
+        nb_row < txb_info->height && nb_col < txb_info->stride) {
       tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx];
-      int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_probs);
+      int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_costs);
       if (cost_map)
         cost_map[nb_row - row + COST_MAP_OFFSET]
                 [nb_col - col + COST_MAP_OFFSET] += cost;
@@ -1236,13 +1665,14 @@ static int try_level_down_ref(int coeff_idx, const TxbProbs *txb_probs,
 }
 
 static void test_level_down(int coeff_idx, const TxbCache *txb_cache,
-                            const TxbProbs *txb_probs, TxbInfo *txb_info) {
+                            const LV_MAP_COEFF_COST *txb_costs,
+                            TxbInfo *txb_info) {
   int cost_map[COST_MAP_SIZE][COST_MAP_SIZE];
   int ref_cost_map[COST_MAP_SIZE][COST_MAP_SIZE];
   const int cost_diff =
-      try_level_down(coeff_idx, txb_cache, txb_probs, txb_info, cost_map);
+      try_level_down(coeff_idx, txb_cache, txb_costs, txb_info, cost_map, 0);
   const int cost_diff_ref =
-      try_level_down_ref(coeff_idx, txb_probs, txb_info, ref_cost_map);
+      try_level_down_ref(coeff_idx, txb_costs, txb_info, ref_cost_map);
   if (cost_diff != cost_diff_ref) {
     printf("qc %d cost_diff %d cost_diff_ref %d\n", txb_info->qcoeff[coeff_idx],
            cost_diff, cost_diff_ref);
@@ -1257,25 +1687,25 @@ static void test_level_down(int coeff_idx, const TxbCache *txb_cache,
 #endif
 
 // TODO(angiebird): make this static once it's called
-int get_txb_cost(TxbInfo *txb_info, const TxbProbs *txb_probs) {
+int get_txb_cost(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs) {
   int cost = 0;
   int txb_skip_ctx = txb_info->txb_ctx->txb_skip_ctx;
   const int16_t *scan = txb_info->scan_order->scan;
   if (txb_info->eob == 0) {
-    cost = av1_cost_bit(txb_probs->txb_skip[txb_skip_ctx], 1);
+    cost = txb_costs->txb_skip_cost[txb_skip_ctx][1];
     return cost;
   }
-  cost = av1_cost_bit(txb_probs->txb_skip[txb_skip_ctx], 0);
+  cost = txb_costs->txb_skip_cost[txb_skip_ctx][0];
   for (int c = 0; c < txb_info->eob; ++c) {
     tran_low_t qc = txb_info->qcoeff[scan[c]];
-    int coeff_cost = get_coeff_cost(qc, c, txb_info, txb_probs);
+    int coeff_cost = get_coeff_cost(qc, c, txb_info, txb_costs);
     cost += coeff_cost;
   }
   return cost;
 }
 
 #if TEST_OPTIMIZE_TXB
-void test_try_change_eob(TxbInfo *txb_info, TxbProbs *txb_probs,
+void test_try_change_eob(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
                          TxbCache *txb_cache) {
   int eob = txb_info->eob;
   const int16_t *scan = txb_info->scan_order->scan;
@@ -1286,13 +1716,13 @@ void test_try_change_eob(TxbInfo *txb_info, TxbProbs *txb_probs,
     if (abs(last_coeff) == 1) {
       int new_eob;
       int cost_diff =
-          try_change_eob(&new_eob, last_ci, txb_cache, txb_probs, txb_info);
+          try_change_eob(&new_eob, last_ci, txb_cache, txb_costs, txb_info, 0);
       int org_eob = txb_info->eob;
-      int cost = get_txb_cost(txb_info, txb_probs);
+      int cost = get_txb_cost(txb_info, txb_costs);
 
       txb_info->qcoeff[last_ci] = get_lower_coeff(last_coeff);
       set_eob(txb_info, new_eob);
-      int new_cost = get_txb_cost(txb_info, txb_probs);
+      int new_cost = get_txb_cost(txb_info, txb_costs);
       set_eob(txb_info, org_eob);
       txb_info->qcoeff[last_ci] = last_coeff;
 
@@ -1323,8 +1753,9 @@ typedef struct LevelDownStats {
 } LevelDownStats;
 
 void try_level_down_facade(LevelDownStats *stats, int scan_idx,
-                           const TxbCache *txb_cache, const TxbProbs *txb_probs,
-                           TxbInfo *txb_info) {
+                           const TxbCache *txb_cache,
+                           const LV_MAP_COEFF_COST *txb_costs,
+                           TxbInfo *txb_info, int fast_mode) {
   const int16_t *scan = txb_info->scan_order->scan;
   const int coeff_idx = scan[scan_idx];
   const tran_low_t qc = txb_info->qcoeff[coeff_idx];
@@ -1350,12 +1781,12 @@ void try_level_down_facade(LevelDownStats *stats, int scan_idx,
   stats->new_eob = txb_info->eob;
   if (scan_idx == txb_info->eob - 1 && abs(qc) == 1) {
     stats->cost_diff = try_change_eob(&stats->new_eob, coeff_idx, txb_cache,
-                                      txb_probs, txb_info);
+                                      txb_costs, txb_info, fast_mode);
   } else {
-    stats->cost_diff =
-        try_level_down(coeff_idx, txb_cache, txb_probs, txb_info, NULL);
+    stats->cost_diff = try_level_down(coeff_idx, txb_cache, txb_costs, txb_info,
+                                      NULL, fast_mode);
 #if TEST_OPTIMIZE_TXB
-    test_level_down(coeff_idx, txb_cache, txb_probs, txb_info);
+    test_level_down(coeff_idx, txb_cache, txb_costs, txb_info);
 #endif
   }
   stats->rd_diff = RDCOST(txb_info->rdmult, stats->cost_diff, stats->dist_diff);
@@ -1363,8 +1794,8 @@ void try_level_down_facade(LevelDownStats *stats, int scan_idx,
   return;
 }
 
-static int optimize_txb(TxbInfo *txb_info, const TxbProbs *txb_probs,
-                        TxbCache *txb_cache, int dry_run) {
+static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
+                        TxbCache *txb_cache, int dry_run, int fast_mode) {
   int update = 0;
   if (txb_info->eob == 0) return update;
   int cost_diff = 0;
@@ -1377,7 +1808,7 @@ static int optimize_txb(TxbInfo *txb_info, const TxbProbs *txb_probs,
   int64_t org_dist =
       av1_block_error_c(txb_info->tcoeff, txb_info->dqcoeff, max_eob, &sse) *
       (1 << (2 * txb_info->shift));
-  int org_cost = get_txb_cost(txb_info, txb_probs);
+  int org_cost = get_txb_cost(txb_info, txb_costs);
 #endif
 
   tran_low_t *org_qcoeff = txb_info->qcoeff;
@@ -1402,7 +1833,8 @@ static int optimize_txb(TxbInfo *txb_info, const TxbProbs *txb_probs,
     tran_low_t qc = txb_info->qcoeff[coeff_idx];
     if (abs(qc) == 1) {
       LevelDownStats stats;
-      try_level_down_facade(&stats, si, txb_cache, txb_probs, txb_info);
+      try_level_down_facade(&stats, si, txb_cache, txb_costs, txb_info,
+                            fast_mode);
       if (stats.update) {
         update = 1;
         cost_diff += stats.cost_diff;
@@ -1415,10 +1847,17 @@ static int optimize_txb(TxbInfo *txb_info, const TxbProbs *txb_probs,
   }
 
   // backward optimize the level-k map
+  int eob_fix = 0;
   for (int si = txb_info->eob - 1; si >= 0; --si) {
-    LevelDownStats stats;
-    try_level_down_facade(&stats, si, txb_cache, txb_probs, txb_info);
     const int coeff_idx = scan[si];
+    if (eob_fix == 1 && txb_info->qcoeff[coeff_idx] == 1) {
+      // when eob is fixed, there is no need to optimize again when
+      // abs(qc) == 1
+      continue;
+    }
+    LevelDownStats stats;
+    try_level_down_facade(&stats, si, txb_cache, txb_costs, txb_info,
+                          fast_mode);
     if (stats.update) {
 #if TEST_OPTIMIZE_TXB
 // printf("si %d low_qc %d cost_diff %d dist_diff %ld rd_diff %ld eob %d new_eob
@@ -1432,13 +1871,14 @@ static int optimize_txb(TxbInfo *txb_info, const TxbProbs *txb_probs,
       update_level_down(coeff_idx, txb_cache, txb_info);
       set_eob(txb_info, stats.new_eob);
     }
+    if (eob_fix == 0 && txb_info->qcoeff[coeff_idx] != 0) eob_fix = 1;
     if (si > txb_info->eob) si = txb_info->eob;
   }
 #if TEST_OPTIMIZE_TXB
   int64_t new_dist =
       av1_block_error_c(txb_info->tcoeff, txb_info->dqcoeff, max_eob, &sse) *
       (1 << (2 * txb_info->shift));
-  int new_cost = get_txb_cost(txb_info, txb_probs);
+  int new_cost = get_txb_cost(txb_info, txb_costs);
   int64_t ref_dist_diff = new_dist - org_dist;
   int ref_cost_diff = new_cost - org_cost;
   if (cost_diff != ref_cost_diff || dist_diff != ref_dist_diff)
@@ -1463,7 +1903,7 @@ static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
 
 int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane,
                      int blk_row, int blk_col, int block, TX_SIZE tx_size,
-                     TXB_CTX *txb_ctx) {
+                     TXB_CTX *txb_ctx, int fast_mode) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_SIZE txs_ctx = get_txsize_context(tx_size);
@@ -1478,38 +1918,40 @@ int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane,
   const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block);
   const int16_t *dequant = pd->dequant;
   const int seg_eob = AOMMIN(eob, tx_size_2d[tx_size] - 1);
-  const aom_prob *nz_map = xd->fc->nz_map[txs_ctx][plane_type];
-
   const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
   const int stride = 1 << bwl;
   const int height = tx_size_high[tx_size];
-  aom_prob(*coeff_base)[COEFF_BASE_CONTEXTS] =
-      xd->fc->coeff_base[txs_ctx][plane_type];
-
-  const aom_prob *coeff_lps = xd->fc->coeff_lps[txs_ctx][plane_type];
-
   const int is_inter = is_inter_block(mbmi);
   const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi);
-
-  const TxbProbs txb_probs = { xd->fc->dc_sign[plane_type],
-                               nz_map,
-                               coeff_base,
-                               coeff_lps,
-                               xd->fc->eob_flag[txs_ctx][plane_type],
-                               xd->fc->txb_skip[txs_ctx] };
+  const LV_MAP_COEFF_COST txb_costs = x->coeff_costs[txs_ctx][plane_type];
 
   const int shift = av1_get_tx_scale(tx_size);
   const int64_t rdmult =
       (x->rdmult * plane_rd_mult[is_inter][plane_type] + 2) >> 2;
 
-  TxbInfo txb_info = { qcoeff,  dqcoeff, tcoeff,     dequant, shift,
-                       tx_size, txs_ctx, bwl,        stride,  height,
-                       eob,     seg_eob, scan_order, txb_ctx, rdmult };
+  TxbInfo txb_info = { qcoeff,
+                       dqcoeff,
+                       tcoeff,
+                       dequant,
+                       shift,
+                       tx_size,
+                       txs_ctx,
+                       tx_type,
+                       bwl,
+                       stride,
+                       height,
+                       eob,
+                       seg_eob,
+                       scan_order,
+                       txb_ctx,
+                       rdmult,
+                       &cm->coeff_ctx_table };
 
   TxbCache txb_cache;
   gen_txb_cache(&txb_cache, &txb_info);
 
-  const int update = optimize_txb(&txb_info, &txb_probs, &txb_cache, 0);
+  const int update =
+      optimize_txb(&txb_info, &txb_costs, &txb_cache, 0, fast_mode);
   if (update) p->eobs[block] = txb_info.eob;
   return txb_info.eob;
 }
@@ -1518,6 +1960,8 @@ int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
   const int16_t *scan = scan_order->scan;
   int cul_level = 0;
   int c;
+
+  if (eob == 0) return 0;
   for (c = 0; c < eob; ++c) {
     cul_level += abs(qcoeff[scan[c]]);
   }
@@ -1552,6 +1996,153 @@ void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col,
   av1_set_contexts(xd, pd, plane, tx_size, cul_level, blk_col, blk_row);
 }
 
+static INLINE void av1_update_nz_eob_counts(FRAME_CONTEXT *fc,
+                                            FRAME_COUNTS *counts, uint16_t eob,
+                                            const tran_low_t *tcoeff, int plane,
+                                            TX_SIZE tx_size, TX_TYPE tx_type,
+                                            const int16_t *scan) {
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+  const int height = tx_size_high[tx_size];
+  TX_SIZE txsize_ctx = get_txsize_context(tx_size);
+#if CONFIG_CTX1D
+  const int width = tx_size_wide[tx_size];
+  const int eob_offset = width + height;
+  const TX_CLASS tx_class = get_tx_class(tx_type);
+  const int seg_eob =
+      (tx_class == TX_CLASS_2D) ? tx_size_2d[tx_size] : eob_offset;
+#else
+  const int seg_eob = tx_size_2d[tx_size];
+#endif
+  unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2] =
+      &counts->nz_map[txsize_ctx][plane_type];
+  for (int c = 0; c < eob; ++c) {
+    tran_low_t v = tcoeff[scan[c]];
+    int is_nz = (v != 0);
+    int coeff_ctx = get_nz_map_ctx(tcoeff, c, scan, bwl, height, tx_type);
+    int eob_ctx = get_eob_ctx(tcoeff, scan[c], txsize_ctx, tx_type);
+
+    if (c == seg_eob - 1) break;
+
+    ++(*nz_map_count)[coeff_ctx][is_nz];
+#if LV_MAP_PROB
+    update_bin(fc->nz_map_cdf[txsize_ctx][plane_type][coeff_ctx], is_nz, 2);
+#endif
+
+    if (is_nz) {
+      ++counts->eob_flag[txsize_ctx][plane_type][eob_ctx][c == (eob - 1)];
+#if LV_MAP_PROB
+      update_bin(fc->eob_flag_cdf[txsize_ctx][plane_type][eob_ctx],
+                 c == (eob - 1), 2);
+#endif
+    }
+  }
+}
+
+#if CONFIG_CTX1D
+static INLINE void av1_update_nz_eob_counts_vert(
+    FRAME_CONTEXT *fc, FRAME_COUNTS *counts, uint16_t eob,
+    const tran_low_t *tcoeff, int plane, TX_SIZE tx_size, TX_TYPE tx_type,
+    const int16_t *scan, const int16_t *iscan) {
+  (void)eob;
+  const TX_SIZE txs_ctx = get_txsize_context(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_CLASS tx_class = get_tx_class(tx_type);
+  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  int16_t eob_ls[MAX_HVTX_SIZE];
+  get_eob_vert(eob_ls, tcoeff, width, height);
+  unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2] =
+      &counts->nz_map[txs_ctx][plane_type];
+  for (int c = 0; c < width; ++c) {
+    int16_t veob = eob_ls[c];
+    assert(veob <= height);
+    int el_ctx = get_empty_line_ctx(c, eob_ls);
+    ++counts->empty_line[txs_ctx][plane_type][tx_class][el_ctx][veob == 0];
+#if LV_MAP_PROB
+    update_bin(fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx],
+               veob == 0, 2);
+#endif
+    if (veob) {
+      for (int r = 0; r < veob; ++r) {
+        if (r + 1 != height) {
+          int coeff_idx = r * width + c;
+          int scan_idx = iscan[coeff_idx];
+          int is_nz = tcoeff[coeff_idx] != 0;
+          int coeff_ctx =
+              get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type);
+          ++(*nz_map_count)[coeff_ctx][is_nz];
+#if LV_MAP_PROB
+          update_bin(fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], is_nz, 2);
+#endif
+          if (is_nz) {
+            int eob_ctx = get_hv_eob_ctx(c, r, eob_ls);
+            ++counts->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx]
+                            [r == veob - 1];
+#if LV_MAP_PROB
+            update_bin(fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx],
+                       r == veob - 1, 2);
+#endif
+          }
+        }
+      }
+    }
+  }
+}
+
+static INLINE void av1_update_nz_eob_counts_horiz(
+    FRAME_CONTEXT *fc, FRAME_COUNTS *counts, uint16_t eob,
+    const tran_low_t *tcoeff, int plane, TX_SIZE tx_size, TX_TYPE tx_type,
+    const int16_t *scan, const int16_t *iscan) {
+  (void)eob;
+  (void)scan;
+  const TX_SIZE txs_ctx = get_txsize_context(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_CLASS tx_class = get_tx_class(tx_type);
+  const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  int16_t eob_ls[MAX_HVTX_SIZE];
+  get_eob_horiz(eob_ls, tcoeff, width, height);
+  unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2] =
+      &counts->nz_map[txs_ctx][plane_type];
+  for (int r = 0; r < height; ++r) {
+    int16_t heob = eob_ls[r];
+    int el_ctx = get_empty_line_ctx(r, eob_ls);
+    ++counts->empty_line[txs_ctx][plane_type][tx_class][el_ctx][heob == 0];
+#if LV_MAP_PROB
+    update_bin(fc->empty_line_cdf[txs_ctx][plane_type][tx_class][el_ctx],
+               heob == 0, 2);
+#endif
+    if (heob) {
+      for (int c = 0; c < heob; ++c) {
+        if (c + 1 != width) {
+          int coeff_idx = r * width + c;
+          int scan_idx = iscan[coeff_idx];
+          int is_nz = tcoeff[coeff_idx] != 0;
+          int coeff_ctx =
+              get_nz_map_ctx(tcoeff, scan_idx, scan, bwl, height, tx_type);
+          ++(*nz_map_count)[coeff_ctx][is_nz];
+#if LV_MAP_PROB
+          update_bin(fc->nz_map_cdf[txs_ctx][plane_type][coeff_ctx], is_nz, 2);
+#endif
+          if (is_nz) {
+            int eob_ctx = get_hv_eob_ctx(r, c, eob_ls);
+            ++counts->hv_eob[txs_ctx][plane_type][tx_class][eob_ctx]
+                            [c == heob - 1];
+#if LV_MAP_PROB
+            update_bin(fc->hv_eob_cdf[txs_ctx][plane_type][tx_class][eob_ctx],
+                       c == heob - 1, 2);
+#endif
+          }
+        }
+      }
+    }
+  }
+}
+#endif  // CONFIG_CTX1D
+
 void av1_update_and_record_txb_context(int plane, int block, int blk_row,
                                        int blk_col, BLOCK_SIZE plane_bsize,
                                        TX_SIZE tx_size, void *arg) {
@@ -1573,8 +2164,7 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
       av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
   const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi);
   const int16_t *scan = scan_order->scan;
-  const int16_t *iscan = scan_order->iscan;
-  const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+  const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
   int c, i;
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + blk_col,
@@ -1582,15 +2172,17 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
   const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
   const int height = tx_size_high[tx_size];
   int cul_level = 0;
-  unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2];
 
   TX_SIZE txsize_ctx = get_txsize_context(tx_size);
-
-  nz_map_count = &td->counts->nz_map[txsize_ctx][plane_type];
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
   memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
 
   ++td->counts->txb_skip[txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
+#if LV_MAP_PROB
+  update_bin(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx], eob == 0,
+             2);
+#endif
   x->mbmi_ext->txb_skip_ctx[plane][block] = txb_ctx.txb_skip_ctx;
 
   x->mbmi_ext->eobs[plane][block] = eob;
@@ -1605,20 +2197,39 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
                            mbmi->sb_type, get_min_tx_size(tx_size), td->counts);
 #endif
 
-  for (c = 0; c < eob; ++c) {
-    tran_low_t v = qcoeff[scan[c]];
-    int is_nz = (v != 0);
-    int coeff_ctx = get_nz_map_ctx(tcoeff, scan[c], bwl, height, iscan);
-    int eob_ctx = get_eob_ctx(tcoeff, scan[c], txsize_ctx);
-
-    if (c == seg_eob - 1) break;
-
-    ++(*nz_map_count)[coeff_ctx][is_nz];
-
-    if (is_nz) {
-      ++td->counts->eob_flag[txsize_ctx][plane_type][eob_ctx][c == (eob - 1)];
+#if CONFIG_CTX1D
+  TX_CLASS tx_class = get_tx_class(tx_type);
+  if (tx_class == TX_CLASS_2D) {
+    av1_update_nz_eob_counts(ec_ctx, td->counts, eob, tcoeff, plane, tx_size,
+                             tx_type, scan);
+  } else {
+    const int width = tx_size_wide[tx_size];
+    const int eob_offset = width + height;
+    const int eob_mode = eob > eob_offset;
+    const TX_SIZE txs_ctx = get_txsize_context(tx_size);
+    ++td->counts->eob_mode[txs_ctx][plane_type][tx_class][eob_mode];
+#if LV_MAP_PROB
+    update_bin(ec_ctx->eob_mode_cdf[txs_ctx][plane_type][tx_class], eob_mode,
+               2);
+#endif
+    if (eob_mode == 0) {
+      av1_update_nz_eob_counts(ec_ctx, td->counts, eob, tcoeff, plane, tx_size,
+                               tx_type, scan);
+    } else {
+      const int16_t *iscan = scan_order->iscan;
+      assert(tx_class == TX_CLASS_VERT || tx_class == TX_CLASS_HORIZ);
+      if (tx_class == TX_CLASS_VERT)
+        av1_update_nz_eob_counts_vert(ec_ctx, td->counts, eob, tcoeff, plane,
+                                      tx_size, tx_type, scan, iscan);
+      else
+        av1_update_nz_eob_counts_horiz(ec_ctx, td->counts, eob, tcoeff, plane,
+                                       tx_size, tx_type, scan, iscan);
     }
   }
+#else   // CONFIG_CTX1D
+  av1_update_nz_eob_counts(ec_ctx, td->counts, eob, tcoeff, plane, tx_size,
+                           tx_type, scan);
+#endif  // CONFIG_CTX1D
 
   // Reverse process order to handle coefficient level and sign.
   for (i = 0; i < NUM_BASE_LEVELS; ++i) {
@@ -1634,16 +2245,26 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
 
       if (level == i + 1) {
         ++td->counts->coeff_base[txsize_ctx][plane_type][i][ctx][1];
+#if LV_MAP_PROB
+        update_bin(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][i][ctx], 1,
+                   2);
+#endif
         if (c == 0) {
           int dc_sign_ctx = txb_ctx.dc_sign_ctx;
 
           ++td->counts->dc_sign[plane_type][dc_sign_ctx][v < 0];
+#if LV_MAP_PROB
+          update_bin(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], v < 0, 2);
+#endif
           x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx;
         }
         cul_level += level;
         continue;
       }
       ++td->counts->coeff_base[txsize_ctx][plane_type][i][ctx][0];
+#if LV_MAP_PROB
+      update_bin(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][i][ctx], 0, 2);
+#endif
       update_eob = AOMMAX(update_eob, c);
     }
   }
@@ -1661,20 +2282,68 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
       int dc_sign_ctx = txb_ctx.dc_sign_ctx;
 
       ++td->counts->dc_sign[plane_type][dc_sign_ctx][v < 0];
+#if LV_MAP_PROB
+      update_bin(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], v < 0, 2);
+#endif
       x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx;
     }
 
     // level is above 1.
     ctx = get_br_ctx(tcoeff, scan[c], bwl, height);
+
+#if BR_NODE
+    int base_range = level - 1 - NUM_BASE_LEVELS;
+    int br_set_idx = base_range < COEFF_BASE_RANGE
+                         ? coeff_to_br_index[base_range]
+                         : BASE_RANGE_SETS;
+
+    for (idx = 0; idx < BASE_RANGE_SETS; ++idx) {
+      if (idx == br_set_idx) {
+        int br_base = br_index_to_coeff[br_set_idx];
+        int br_offset = base_range - br_base;
+        ++td->counts->coeff_br[txsize_ctx][plane_type][idx][ctx][1];
+#if LV_MAP_PROB
+        update_bin(ec_ctx->coeff_br_cdf[txsize_ctx][plane_type][idx][ctx], 1,
+                   2);
+#endif
+        int extra_bits = (1 << br_extra_bits[idx]) - 1;
+        for (int tok = 0; tok < extra_bits; ++tok) {
+          if (br_offset == tok) {
+            ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][1];
+#if LV_MAP_PROB
+            update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 1,
+                       2);
+#endif
+            break;
+          }
+          ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][0];
+#if LV_MAP_PROB
+          update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 0, 2);
+#endif
+        }
+        break;
+      }
+      ++td->counts->coeff_br[txsize_ctx][plane_type][idx][ctx][0];
+#if LV_MAP_PROB
+      update_bin(ec_ctx->coeff_br_cdf[txsize_ctx][plane_type][idx][ctx], 0, 2);
+#endif
+    }
+#else  // BR_NODE
     for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) {
       if (level == (idx + 1 + NUM_BASE_LEVELS)) {
         ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][1];
+#if LV_MAP_PROB
+        update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 1, 2);
+#endif
         break;
       }
       ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][0];
+#if LV_MAP_PROB
+      update_bin(ec_ctx->coeff_lps_cdf[txsize_ctx][plane_type][ctx], 0, 2);
+#endif
     }
     if (idx < COEFF_BASE_RANGE) continue;
-
+#endif  // BR_NODE
     // use 0-th order Golomb code to handle the residual level.
   }
 
@@ -1848,6 +2517,10 @@ void av1_write_txb_probs(AV1_COMP *cpi, aom_writer *w) {
   TX_SIZE tx_size;
   int ctx, plane;
 
+#if LV_MAP_PROB
+  return;
+#endif
+
   for (plane = 0; plane < PLANE_TYPES; ++plane)
     for (ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
       av1_cond_prob_diff_update(w, &cpi->common.fc->dc_sign[plane][ctx],
@@ -1888,12 +2561,11 @@ int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
     }
 
 #if CONFIG_EXT_TX
-    int is_inter = is_inter_block(mbmi);
-    int ext_tx_set = get_ext_tx_set(get_min_tx_size(tx_size), mbmi->sb_type,
-                                    is_inter, cm->reduced_tx_set_used);
-    if (!(is_inter && ext_tx_used_inter[ext_tx_set][tx_type]) &&
-        !(!is_inter && ext_tx_used_intra[ext_tx_set][tx_type]))
-      continue;
+    const int is_inter = is_inter_block(mbmi);
+    const TxSetType tx_set_type =
+        get_ext_tx_set_type(get_min_tx_size(tx_size), mbmi->sb_type, is_inter,
+                            cm->reduced_tx_set_used);
+    if (!av1_ext_tx_used[tx_set_type][tx_type]) continue;
 #endif  // CONFIG_EXT_TX
 
     RD_STATS this_rd_stats;
@@ -1901,7 +2573,7 @@ int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
     av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                     coeff_ctx, AV1_XFORM_QUANT_FP);
     av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
-                   a, l);
+                   a, l, 1);
     av1_dist_block(cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size,
                    &this_rd_stats.dist, &this_rd_stats.sse,
                    OUTPUT_HAS_PREDICTED_PIXELS);
@@ -1921,10 +2593,6 @@ int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
 
   av1_merge_rd_stats(rd_stats, &best_rd_stats);
 
-  //  if (x->plane[plane].eobs[block] == 0)
-  //    if (best_tx_type != DCT_DCT)
-  //      exit(0);
-
   if (best_eob == 0 && is_inter_block(mbmi)) best_tx_type = DCT_DCT;
 
   if (plane == 0) mbmi->txk_type[(blk_row << 4) + blk_col] = best_tx_type;
@@ -1936,7 +2604,7 @@ int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
     av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                     coeff_ctx, AV1_XFORM_QUANT_FP);
     av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
-                   a, l);
+                   a, l, 1);
 
     av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
                                        x->plane[plane].eobs[block]);
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h
index cbafe59c9..76a04bb41 100644
--- a/third_party/aom/av1/encoder/encodetxb.h
+++ b/third_party/aom/av1/encoder/encodetxb.h
@@ -31,6 +31,7 @@ typedef struct TxbInfo {
   int shift;
   TX_SIZE tx_size;
   TX_SIZE txs_ctx;
+  TX_TYPE tx_type;
   int bwl;
   int stride;
   int height;
@@ -39,20 +40,21 @@ typedef struct TxbInfo {
   const SCAN_ORDER *scan_order;
   TXB_CTX *txb_ctx;
   int64_t rdmult;
+  const LV_MAP_CTX_TABLE *coeff_ctx_table;
 } TxbInfo;
 
 typedef struct TxbCache {
   int nz_count_arr[MAX_TX_SQUARE];
-  int nz_ctx_arr[MAX_TX_SQUARE][2];
+  int nz_ctx_arr[MAX_TX_SQUARE];
   int base_count_arr[NUM_BASE_LEVELS][MAX_TX_SQUARE];
   int base_mag_arr[MAX_TX_SQUARE]
                   [2];  // [0]: max magnitude [1]: num of max magnitude
-  int base_ctx_arr[NUM_BASE_LEVELS][MAX_TX_SQUARE][2];  // [1]: not used
+  int base_ctx_arr[NUM_BASE_LEVELS][MAX_TX_SQUARE];
 
   int br_count_arr[MAX_TX_SQUARE];
   int br_mag_arr[MAX_TX_SQUARE]
                 [2];  // [0]: max magnitude [1]: num of max magnitude
-  int br_ctx_arr[MAX_TX_SQUARE][2];  // [1]: not used
+  int br_ctx_arr[MAX_TX_SQUARE];
 } TxbCache;
 
 typedef struct TxbProbs {
@@ -62,11 +64,14 @@ typedef struct TxbProbs {
   const aom_prob *coeff_lps;
   const aom_prob *eob_flag;
   const aom_prob *txb_skip;
+#if BR_NODE
+  const aom_prob *coeff_br;
+#endif
 } TxbProbs;
 
 void av1_alloc_txb_buf(AV1_COMP *cpi);
 void av1_free_txb_buf(AV1_COMP *cpi);
-int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
+int av1_cost_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
                         int blk_row, int blk_col, int block, TX_SIZE tx_size,
                         TXB_CTX *txb_ctx);
 void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
@@ -90,6 +95,9 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
                                        int blk_col, BLOCK_SIZE plane_bsize,
                                        TX_SIZE tx_size, void *arg);
 
+void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                          int mi_row, int mi_col);
+
 #if CONFIG_TXK_SEL
 int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
                             int block, int blk_row, int blk_col,
@@ -99,7 +107,7 @@ int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
 #endif
 int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane,
                      int blk_row, int blk_col, int block, TX_SIZE tx_size,
-                     TXB_CTX *txb_ctx);
+                     TXB_CTX *txb_ctx, int fast_mode);
 #ifdef __cplusplus
 }
 #endif
diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c
index 1aa1d52a2..edc9b1d61 100644
--- a/third_party/aom/av1/encoder/ethread.c
+++ b/third_party/aom/av1/encoder/ethread.c
@@ -15,13 +15,11 @@
 #include "aom_dsp/aom_dsp_common.h"
 
 static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
-  int i, j, k, l, m, n;
-
-  for (i = 0; i < REFERENCE_MODES; i++)
+  for (int i = 0; i < REFERENCE_MODES; i++)
     td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];
 
 #if CONFIG_GLOBAL_MOTION
-  for (i = 0; i < TOTAL_REFS_PER_FRAME; i++)
+  for (int i = 0; i < TOTAL_REFS_PER_FRAME; i++)
     td->rd_counts.global_motion_used[i] +=
         td_t->rd_counts.global_motion_used[i];
 #endif  // CONFIG_GLOBAL_MOTION
@@ -29,15 +27,6 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
   td->rd_counts.compound_ref_used_flag |=
       td_t->rd_counts.compound_ref_used_flag;
   td->rd_counts.single_ref_used_flag |= td_t->rd_counts.single_ref_used_flag;
-
-  for (i = 0; i < TX_SIZES; i++)
-    for (j = 0; j < PLANE_TYPES; j++)
-      for (k = 0; k < REF_TYPES; k++)
-        for (l = 0; l < COEF_BANDS; l++)
-          for (m = 0; m < COEFF_CONTEXTS; m++)
-            for (n = 0; n < ENTROPY_TOKENS; n++)
-              td->rd_counts.coef_counts[i][j][k][l][m][n] +=
-                  td_t->rd_counts.coef_counts[i][j][k][l][m][n];
 }
 
 static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
@@ -92,8 +81,10 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
                         aom_memalign(32, sizeof(*thread_data->td)));
         av1_zero(*thread_data->td);
 
-        // Set up pc_tree.
+// Set up pc_tree.
+#if !CONFIG_CB4X4
         thread_data->td->leaf_tree = NULL;
+#endif
         thread_data->td->pc_tree = NULL;
         av1_setup_pc_tree(cm, thread_data->td);
 
@@ -105,12 +96,14 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
 #endif
         CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
                         (uint8_t *)aom_memalign(
-                            16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
-                                    sizeof(*thread_data->td->above_pred_buf)));
+                            16,
+                            buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+                                sizeof(*thread_data->td->above_pred_buf)));
         CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
                         (uint8_t *)aom_memalign(
-                            16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
-                                    sizeof(*thread_data->td->left_pred_buf)));
+                            16,
+                            buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+                                sizeof(*thread_data->td->left_pred_buf)));
         CHECK_MEM_ERROR(
             cm, thread_data->td->wsrc_buf,
             (int32_t *)aom_memalign(
@@ -124,12 +117,10 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
         CHECK_MEM_ERROR(cm, thread_data->td->counts,
                         aom_calloc(1, sizeof(*thread_data->td->counts)));
 
-#if CONFIG_PALETTE
         // Allocate buffers used by palette coding mode.
         CHECK_MEM_ERROR(
             cm, thread_data->td->palette_buffer,
             aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
-#endif  // CONFIG_PALETTE
 
         // Create threads
         if (!winterface->reset(worker))
@@ -169,10 +160,8 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) {
              sizeof(cpi->common.counts));
     }
 
-#if CONFIG_PALETTE
     if (i < num_workers - 1)
       thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
-#endif  // CONFIG_PALETTE
   }
 
   // Encode a frame
diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
index e7d78d83e..2a4200887 100644
--- a/third_party/aom/av1/encoder/firstpass.c
+++ b/third_party/aom/av1/encoder/firstpass.c
@@ -27,8 +27,11 @@
 #include "av1/common/entropymv.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/reconinter.h"  // av1_setup_dst_planes()
-#include "av1/encoder/av1_quantize.h"
+#if CONFIG_LV_MAP
+#include "av1/common/txb_common.h"
+#endif
 #include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/block.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encodemb.h"
@@ -112,7 +115,7 @@ static void output_stats(FIRSTPASS_STATS *stats,
     fprintf(fpfile,
             "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
             "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
-            "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf\n",
+            "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf\n",
             stats->frame, stats->weight, stats->intra_error, stats->coded_error,
             stats->sr_coded_error, stats->pcnt_inter, stats->pcnt_motion,
             stats->pcnt_second_ref, stats->pcnt_neutral, stats->intra_skip_pct,
@@ -456,7 +459,7 @@ static void set_first_pass_params(AV1_COMP *cpi) {
   cpi->rc.frames_to_key = INT_MAX;
 }
 
-#if CONFIG_FLEX_REFS
+#if CONFIG_EXT_REFS
 static double raw_motion_error_stdev(int *raw_motion_err_list,
                                      int raw_motion_err_counts) {
   int64_t sum_raw_err = 0;
@@ -468,7 +471,7 @@ static double raw_motion_error_stdev(int *raw_motion_err_list,
   for (i = 0; i < raw_motion_err_counts; i++) {
     sum_raw_err += raw_motion_err_list[i];
   }
-  raw_err_avg = sum_raw_err / raw_motion_err_counts;
+  raw_err_avg = (double)sum_raw_err / raw_motion_err_counts;
   for (i = 0; i < raw_motion_err_counts; i++) {
     raw_err_stdev += (raw_motion_err_list[i] - raw_err_avg) *
                      (raw_motion_err_list[i] - raw_err_avg);
@@ -479,7 +482,7 @@ static double raw_motion_error_stdev(int *raw_motion_err_list,
   raw_err_stdev = sqrt(raw_err_stdev / raw_motion_err_counts);
   return raw_err_stdev;
 }
-#endif  // CONFIG_FLEX_REFS
+#endif  // CONFIG_EXT_REFS
 
 #define UL_INTRA_THRESH 50
 #define INVALID_ROW -1
@@ -531,13 +534,13 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
   od_adapt_ctx pvq_context;
 #endif
 
-#if CONFIG_FLEX_REFS
+#if CONFIG_EXT_REFS
   int *raw_motion_err_list;
   int raw_motion_err_counts = 0;
   CHECK_MEM_ERROR(
       cm, raw_motion_err_list,
       aom_calloc(cm->mb_rows * cm->mb_cols, sizeof(*raw_motion_err_list)));
-#endif  // CONFIG_FLEX_REFS
+#endif  // CONFIG_EXT_REFS
   // First pass code requires valid last and new frame buffers.
   assert(new_yv12 != NULL);
   assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
@@ -575,8 +578,8 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
 
 #if CONFIG_CFL
   // Don't store luma on the fist pass since chroma is not computed
-  x->cfl_store_y = 0;
-#endif
+  xd->cfl->store_y = 0;
+#endif  // CONFIG_CFL
   av1_frame_init_quantizer(cpi);
 
 #if CONFIG_PVQ
@@ -623,6 +626,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
   }
 
   av1_init_mv_probs(cm);
+#if CONFIG_LV_MAP
+  av1_init_lv_map(cm);
+#endif
 #if CONFIG_ADAPT_SCAN
   av1_init_scan_order(cm);
   av1_deliver_eob_threshold(cm, xd);
@@ -1000,9 +1006,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
             }
           }
         }
-#if CONFIG_FLEX_REFS
+#if CONFIG_EXT_REFS
         raw_motion_err_list[raw_motion_err_counts++] = raw_motion_error;
-#endif  // CONFIG_FLEX_REFS
+#endif  // CONFIG_EXT_REFS
       } else {
         sr_coded_error += (int64_t)this_error;
       }
@@ -1025,10 +1031,12 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
 
     aom_clear_system_state();
   }
-#if CONFIG_FLEX_REFS
+#if CONFIG_EXT_REFS
   const double raw_err_stdev =
       raw_motion_error_stdev(raw_motion_err_list, raw_motion_err_counts);
-#endif  // CONFIG_FLEX_REFS
+  aom_free(raw_motion_err_list);
+#endif  // CONFIG_EXT_REFS
+
 #if CONFIG_PVQ
 #if !CONFIG_ANS
   od_ec_enc_clear(&x->daala_enc.w.ec);
@@ -1082,9 +1090,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
     fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
     fps.inactive_zone_rows = (double)image_data_start_row;
     fps.inactive_zone_cols = (double)0;  // TODO(paulwilkins): fix
-#if CONFIG_FLEX_REFS
+#if CONFIG_EXT_REFS
     fps.raw_error_stdev = raw_err_stdev;
-#endif  // CONFIG_FLEX_REFS
+#endif  // CONFIG_EXT_REFS
 
     if (mvcount > 0) {
       fps.MVr = (double)sum_mvr / mvcount;
@@ -1666,47 +1674,618 @@ static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) {
   arf_buffer_indices[0] = ARF_SLOT1;
   arf_buffer_indices[1] = ARF_SLOT2;
 }
-#endif
+#endif  // !CONFIG_EXT_REFS
 
-static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
-                                   double group_error, int gf_arf_bits) {
+#if CONFIG_EXT_REFS
+#if USE_GF16_MULTI_LAYER
+// === GF Group of 16 ===
+#define GF_INTERVAL_16 16
+#define GF_FRAME_PARAMS (REF_FRAMES + 5)
+
+// GF Group of 16: multi-layer hierarchical coding structure
+//   1st Layer: Frame 0 and Frame 16 (ALTREF)
+//   2nd Layer: Frame 8 (ALTREF2)
+//   3rd Layer: Frame 4 and 12 (ALTREF2)
+//   4th Layer: Frame 2, 6, 10, and 14 (BWDREF)
+//   5th Layer: Frame 1, 3, 5, 7, 9, 11, 13, and 15
+static const unsigned char gf16_multi_layer_params[][GF_FRAME_PARAMS] = {
+  // gf_group->index: coding order
+  // (Frame #)      : display order
+  {
+      // gf_group->index == 0 (Frame 0)
+      OVERLAY_UPDATE,  // update_type
+      0,               // arf_src_offset
+      0,               // brf_src_offset
+      // References (previous ===> current)
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      ALTREF_FRAME,  // Index (current) of reference to get updated
+      GOLDEN_FRAME   // cpi->refresh_golden_frame = 1
+  },
+  {
+      // gf_group->index == 1 (Frame 16)
+      ARF_UPDATE,          // update_type
+      GF_INTERVAL_16 - 1,  // arf_src_offset
+      0,                   // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      ALTREF_FRAME,   // cpi->alt_fb_idx ===> cpi->gld_fb_idx (GOLDEN_FRAME)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
+      GOLDEN_FRAME,   // cpi->gld_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      ALTREF_FRAME,  // Index (current) of reference to get updated
+      ALTREF_FRAME   // cpi->refresh_alt_ref_frame = 1
+  },
+  {
+      // gf_group->index == 2 (Frame 8)
+      INTNL_ARF_UPDATE,           // update_type
+      (GF_INTERVAL_16 >> 1) - 1,  // arf_src_offset
+      0,                          // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      ALTREF2_FRAME,  // Index (current) of reference to get updated
+      ALTREF2_FRAME   // cpi->refresh_alt2_ref_frame = 1
+  },
+  {
+      // gf_group->index == 3 (Frame 4)
+      INTNL_ARF_UPDATE,           // update_type
+      (GF_INTERVAL_16 >> 2) - 1,  // arf_src_offset
+      0,                          // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx
+                      // (BWDREF_FRAME)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx
+                      // (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      ALTREF2_FRAME,  // Index (current) of reference to get updated
+      ALTREF2_FRAME   // cpi->refresh_alt2_ref_frame = 1
+  },
+  {
+      // gf_group->index == 4 (Frame 2)
+      BRF_UPDATE,  // update_type
+      0,           // arf_src_offset
+      1,           // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx
+                      // (BWDREF_FRAME)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx
+                      // (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      REF_FRAMES,   // Index (current) of reference to get updated
+      BWDREF_FRAME  // cpi->refresh_bwd_ref_frame = 1
+  },
+  {
+      // gf_group->index == 5 (Frame 1)
+      LAST_BIPRED_UPDATE,  // update_type
+      0,                   // arf_src_offset
+      0,                   // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx ===> cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      LAST3_FRAME,  // Index (current) of reference to get updated
+      LAST_FRAME    // cpi->refresh_last_frame = 1
+  },
+  {
+      // gf_group->index == 6 (Frame 3)
+      LF_UPDATE,  // update_type
+      0,          // arf_src_offset
+      0,          // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
+                      // LAST_FRAME]
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx ===> cpi->alt_fb_idx (ALTREF_FRAME)
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
+                      // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      LAST3_FRAME,  // Index (current) of reference to get updated
+      LAST_FRAME    // cpi->refresh_last_frame = 1
+  },
+  {
+      // gf_group->index == 7 (Frame 4 - OVERLAY)
+      INTNL_OVERLAY_UPDATE,  // update_type
+      0,                     // arf_src_offset
+      0,                     // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      BWDREF_FRAME,  // Index (current) of reference to get updated
+      ALTREF2_FRAME  // cpi->refresh_alt2_ref_frame = 1
+  },
+  {
+      // gf_group->index == 8 (Frame 6)
+      BRF_UPDATE,  // update_type
+      0,           // arf_src_offset
+      1,           // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
+                      // LAST_FRAME]
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
+                      // cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      ALTREF2_FRAME,  // Index (current) of reference to get updated
+      BWDREF_FRAME    // cpi->refresh_bwd_ref_frame = 1
+  },
+  {
+      // gf_group->index == 9 (Frame 5)
+      LAST_BIPRED_UPDATE,  // update_type
+      0,                   // arf_src_offset
+      0,                   // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      LAST3_FRAME,  // Index (current) of reference to get updated
+      LAST_FRAME    // cpi->refresh_last_frame = 1
+  },
+  {
+      // gf_group->index == 10 (Frame 7)
+      LF_UPDATE,  // update_type
+      0,          // arf_src_offset
+      0,          // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
+                      // LAST_FRAME]
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
+                      // cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      LAST3_FRAME,  // Index (current) of reference to get updated
+      LAST_FRAME    // cpi->refresh_last_frame = 1
+  },
+  {
+      // gf_group->index == 11 (Frame 8 - OVERLAY)
+      INTNL_OVERLAY_UPDATE,  // update_type
+      0,                     // arf_src_offset
+      0,                     // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      BWDREF_FRAME,  // Index (current) of reference to get updated
+      ALTREF2_FRAME  // cpi->refresh_alt2_ref_frame = 1
+  },
+  {
+      // gf_group->index == 12 (Frame 12)
+      INTNL_ARF_UPDATE,           // update_type
+      (GF_INTERVAL_16 >> 2) - 1,  // arf_src_offset
+      0,                          // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
+                      // LAST_FRAME]
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST2_FRAME,    //  cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
+                      //  cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
+                      // cpi->bwd_fb_idx (BWDREF_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      ALTREF2_FRAME,  // Index (current) of reference to get updated
+      ALTREF2_FRAME   // cpi->refresh_alt2_ref_frame = 1
+  },
+  {
+      // gf_group->index == 13 (Frame 10)
+      BRF_UPDATE,  // update_type
+      0,           // arf_src_offset
+      1,           // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      ALTREF2_FRAME,  // Index (current) of reference to get updated
+      BWDREF_FRAME    // cpi->refresh_bwd_ref_frame = 1
+  },
+  {
+      // gf_group->index == 14 (Frame 9)
+      LAST_BIPRED_UPDATE,  // update_type
+      0,                   // arf_src_offset
+      0,                   // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      LAST3_FRAME,  // Index (current) of reference to get updated
+      LAST_FRAME    // cpi->refresh_last_frame = 1
+  },
+  {
+      // gf_group->index == 15 (Frame 11)
+      LF_UPDATE,  // update_type
+      0,          // arf_src_offset
+      0,          // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
+                      // LAST_FRAME]
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx ===> cpi->bwd_fb_idx (BWDREF_FRAME)
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
+                      // cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      LAST3_FRAME,  // Index (current) of reference to get updated
+      LAST_FRAME    // cpi->refresh_last_frame = 1
+  },
+  {
+      // gf_group->index == 16 (Frame 12 - OVERLAY)
+      INTNL_OVERLAY_UPDATE,  // update_type
+      0,                     // arf_src_offset
+      0,                     // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      BWDREF_FRAME,  // Index (current) of reference to get updated
+      ALTREF2_FRAME  // cpi->refresh_alt2_ref_frame = 1
+  },
+  {
+      // gf_group->index == 17 (Frame 14)
+      BRF_UPDATE,  // update_type
+      0,           // arf_src_offset
+      1,           // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
+                      // LAST_FRAME]
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
+                      // cpi->bwd_fb_idx (BWDREF_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      BWDREF_FRAME,  // Index (current) of reference to get updated
+      BWDREF_FRAME   // cpi->refresh_bwd_ref_frame = 1
+  },
+  {
+      // gf_group->index == 18 (Frame 13)
+      LAST_BIPRED_UPDATE,  // update_type
+      0,                   // arf_src_offset
+      0,                   // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      LAST3_FRAME,  // Index (current) of reference to get updated
+      LAST_FRAME    // cpi->refresh_last_frame = 1
+  },
+  {
+      // gf_group->index == 19 (Frame 15)
+      LF_UPDATE,  // update_type
+      0,          // arf_src_offset
+      0,          // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx ===> cpi->lst_fb_idxes[LAST_FRAME -
+                      // LAST_FRAME]
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
+                      // cpi->bwd_fb_idx (BWDREF_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      LAST3_FRAME,  // Index (current) of reference to get updated
+      LAST_FRAME    // cpi->refresh_last_frame = 1
+  },
+  {
+      // gf_group->index == 20 (Frame 16 - OVERLAY: Belonging to the next GF
+      // group)
+      OVERLAY_UPDATE,  // update_type
+      0,               // arf_src_offset
+      0,               // brf_src_offset
+      // Reference frame indexes (previous ===> current)
+      LAST3_FRAME,    // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]
+      LAST_FRAME,     // cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME]
+      LAST2_FRAME,    // cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] ===>
+                      // cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME]
+      GOLDEN_FRAME,   // cpi->gld_fb_idx (GOLDEN_FRAME)
+      BWDREF_FRAME,   // cpi->bwd_fb_idx (BWDREF_FRAME)
+      ALTREF2_FRAME,  // cpi->alt2_fb_idx (ALTREF2_FRAME)
+      ALTREF_FRAME,   // cpi->alt_fb_idx (ALTREF_FRAME)
+      REF_FRAMES,     // cpi->ext_fb_idx (extra ref frame)
+      // Refreshment (index, flag)
+      ALTREF_FRAME,  // Index (current) of reference to get updated
+      GOLDEN_FRAME   // cpi->refresh_golden_frame = 1
+  }
+};
+
+// === GF Group of 16 ===
+static void define_gf_group_structure_16(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   TWO_PASS *const twopass = &cpi->twopass;
   GF_GROUP *const gf_group = &twopass->gf_group;
-  FIRSTPASS_STATS frame_stats;
+  const int key_frame = cpi->common.frame_type == KEY_FRAME;
+
+  assert(rc->baseline_gf_interval == GF_INTERVAL_16);
+
+  // Total number of frames to consider for GF group of 16:
+  //   = GF group interval + number of OVERLAY's
+  //   = rc->baseline_gf_interval + MAX_EXT_ARFS + 1 + 1
+  // NOTE: The OVERLAY frame for the next GF group also needs to be considered
+  //       to prepare for the reference frame index mapping.
+
+  const int gf_update_frames = rc->baseline_gf_interval + MAX_EXT_ARFS + 2;
+
+  for (int frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
+    int param_idx = 0;
+
+    // Treat KEY_FRAME differently
+    if (frame_index == 0 && key_frame) {
+      gf_group->update_type[frame_index] = KF_UPDATE;
+
+      gf_group->rf_level[frame_index] = KF_STD;
+      gf_group->arf_src_offset[frame_index] = 0;
+      gf_group->brf_src_offset[frame_index] = 0;
+      gf_group->bidir_pred_enabled[frame_index] = 0;
+      for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx)
+        gf_group->ref_fb_idx_map[frame_index][ref_idx] = ref_idx;
+      gf_group->refresh_idx[frame_index] =
+          cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME];
+      gf_group->refresh_flag[frame_index] =
+          cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME];
+
+      continue;
+    }
+
+    // == update_type ==
+    gf_group->update_type[frame_index] =
+        gf16_multi_layer_params[frame_index][param_idx++];
+
+    // == rf_level ==
+    // Derive rf_level from update_type
+    switch (gf_group->update_type[frame_index]) {
+      case LF_UPDATE: gf_group->rf_level[frame_index] = INTER_NORMAL; break;
+      case ARF_UPDATE: gf_group->rf_level[frame_index] = GF_ARF_LOW; break;
+      case OVERLAY_UPDATE:
+        gf_group->rf_level[frame_index] = INTER_NORMAL;
+        break;
+      case BRF_UPDATE: gf_group->rf_level[frame_index] = GF_ARF_LOW; break;
+      case LAST_BIPRED_UPDATE:
+        gf_group->rf_level[frame_index] = INTER_NORMAL;
+        break;
+      case BIPRED_UPDATE: gf_group->rf_level[frame_index] = INTER_NORMAL; break;
+      case INTNL_ARF_UPDATE:
+        gf_group->rf_level[frame_index] = GF_ARF_LOW;
+        break;
+      case INTNL_OVERLAY_UPDATE:
+        gf_group->rf_level[frame_index] = INTER_NORMAL;
+        break;
+      default: gf_group->rf_level[frame_index] = INTER_NORMAL; break;
+    }
+
+    // == arf_src_offset ==
+    gf_group->arf_src_offset[frame_index] =
+        gf16_multi_layer_params[frame_index][param_idx++];
+
+    // == brf_src_offset ==
+    gf_group->brf_src_offset[frame_index] =
+        gf16_multi_layer_params[frame_index][param_idx++];
+
+    // == bidir_pred_enabled ==
+    // Derive bidir_pred_enabled from bidir_src_offset
+    gf_group->bidir_pred_enabled[frame_index] =
+        gf_group->brf_src_offset[frame_index] ? 1 : 0;
+
+    // == ref_fb_idx_map ==
+    for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx)
+      gf_group->ref_fb_idx_map[frame_index][ref_idx] =
+          gf16_multi_layer_params[frame_index][param_idx++];
+
+    // == refresh_idx ==
+    gf_group->refresh_idx[frame_index] =
+        gf16_multi_layer_params[frame_index][param_idx++];
+
+    // == refresh_flag ==
+    gf_group->refresh_flag[frame_index] =
+        gf16_multi_layer_params[frame_index][param_idx];
+  }
+
+  // Mark the ARF_UPDATE / INTNL_ARF_UPDATE and OVERLAY_UPDATE /
+  // INTNL_OVERLAY_UPDATE for rate allocation
+  // NOTE: Indexes are designed in the display order backward:
+  //       ALT[3] .. ALT[2] .. ALT[1] .. ALT[0],
+  //       but their coding order is as follows:
+  // ALT0-ALT2-ALT3 .. OVERLAY3 .. OVERLAY2-ALT1 .. OVERLAY1 .. OVERLAY0
+
+  const int num_arfs_in_gf = cpi->num_extra_arfs + 1;
+  const int sub_arf_interval = rc->baseline_gf_interval / num_arfs_in_gf;
+
+  // == arf_pos_for_ovrly ==: Position for OVERLAY
+  for (int arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) {
+    const int prior_num_arfs =
+        (arf_idx <= 1) ? num_arfs_in_gf : (num_arfs_in_gf - 1);
+    cpi->arf_pos_for_ovrly[arf_idx] =
+        sub_arf_interval * (num_arfs_in_gf - arf_idx) + prior_num_arfs;
+  }
+
+  // == arf_pos_in_gf ==: Position for ALTREF
+  cpi->arf_pos_in_gf[0] = 1;
+  cpi->arf_pos_in_gf[1] = cpi->arf_pos_for_ovrly[2] + 1;
+  cpi->arf_pos_in_gf[2] = 2;
+  cpi->arf_pos_in_gf[3] = 3;
+
+  // == arf_update_idx ==
+  // == arf_ref_idx ==
+  // NOTE: Due to the hierarchical nature of GF16, these two parameters only
+  //       relect the index to the nearest future overlay.
+  int start_frame_index = 0;
+  for (int arf_idx = (num_arfs_in_gf - 1); arf_idx >= 0; --arf_idx) {
+    const int end_frame_index = cpi->arf_pos_for_ovrly[arf_idx];
+    for (int frame_index = start_frame_index; frame_index <= end_frame_index;
+         ++frame_index) {
+      gf_group->arf_update_idx[frame_index] = arf_idx;
+      gf_group->arf_ref_idx[frame_index] = arf_idx;
+    }
+    start_frame_index = end_frame_index + 1;
+  }
+}
+#endif  // USE_GF16_MULTI_LAYER
+#endif  // CONFIG_EXT_REFS
+
+static void define_gf_group_structure(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
+#if CONFIG_EXT_REFS
+#if USE_GF16_MULTI_LAYER
+  if (rc->baseline_gf_interval == 16) {
+    define_gf_group_structure_16(cpi);
+    return;
+  }
+#endif  // USE_GF16_MULTI_LAYER
+#endif  // CONFIG_EXT_REFS
+
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
   int i;
   int frame_index = 0;
-  int target_frame_size;
-  int key_frame;
-  const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
-  int64_t total_group_bits = gf_group_bits;
-  double modified_err = 0.0;
-  double err_fraction;
-  int mid_boost_bits = 0;
+  const int key_frame = cpi->common.frame_type == KEY_FRAME;
+
 #if CONFIG_EXT_REFS
   // The use of bi-predictive frames are only enabled when following 3
   // conditions are met:
-  // (1) Alt-ref is enabled;
+  // (1) ALTREF is enabled;
   // (2) The bi-predictive group interval is at least 2; and
   // (3) The bi-predictive group interval is strictly smaller than the
   //     golden group interval.
   const int is_bipred_enabled =
-#if CONFIG_FLEX_REFS
-      cpi->bwd_ref_allowed &&
-#endif
-      rc->source_alt_ref_pending && rc->bipred_group_interval &&
+      cpi->bwd_ref_allowed && rc->source_alt_ref_pending &&
+      rc->bipred_group_interval &&
       rc->bipred_group_interval <=
           (rc->baseline_gf_interval - rc->source_alt_ref_pending);
   int bipred_group_end = 0;
   int bipred_frame_index = 0;
 
-  int arf_pos[MAX_EXT_ARFS + 1];
   const unsigned char ext_arf_interval =
       (unsigned char)(rc->baseline_gf_interval / (cpi->num_extra_arfs + 1) - 1);
   int which_arf = cpi->num_extra_arfs;
   int subgroup_interval[MAX_EXT_ARFS + 1];
-  int ext_arf_boost[MAX_EXT_ARFS];
   int is_sg_bipred_enabled = is_bipred_enabled;
   int accumulative_subgroup_interval = 0;
 #else
@@ -1714,27 +2293,20 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
   unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
 #endif  // CONFIG_EXT_REFS
 
-#if CONFIG_EXT_REFS
-  av1_zero_array(ext_arf_boost, MAX_EXT_ARFS);
-#endif  // CONFIG_EXT_REFS
-
-  key_frame = cpi->common.frame_type == KEY_FRAME;
-
 #if !CONFIG_EXT_REFS
   get_arf_buffer_indices(arf_buffer_indices);
 #endif  // !CONFIG_EXT_REFS
 
   // For key frames the frame target rate is already set and it
   // is also the golden frame.
+  // === [frame_index == 0] ===
   if (!key_frame) {
     if (rc->source_alt_ref_active) {
       gf_group->update_type[frame_index] = OVERLAY_UPDATE;
       gf_group->rf_level[frame_index] = INTER_NORMAL;
-      gf_group->bit_allocation[frame_index] = 0;
     } else {
       gf_group->update_type[frame_index] = GF_UPDATE;
       gf_group->rf_level[frame_index] = GF_ARF_STD;
-      gf_group->bit_allocation[frame_index] = gf_arf_bits;
     }
 #if CONFIG_EXT_REFS
     gf_group->arf_update_idx[frame_index] = 0;
@@ -1743,8 +2315,6 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
     gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
     gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
 #endif  // CONFIG_EXT_REFS
-    // Step over the golden frame / overlay frame
-    if (EOF == input_stats(twopass, &frame_stats)) return;
   }
 
 #if CONFIG_EXT_REFS
@@ -1752,22 +2322,16 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
   gf_group->brf_src_offset[frame_index] = 0;
 #endif  // CONFIG_EXT_REFS
 
-  // Deduct the boost bits for arf (or gf if it is not a key frame)
-  // from the group total.
-  if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits;
-
   frame_index++;
 
 #if CONFIG_EXT_REFS
   bipred_frame_index++;
 #endif  // CONFIG_EXT_REFS
 
-  // Store the bits to spend on the ARF if there is one.
+  // === [frame_index == 1] ===
   if (rc->source_alt_ref_pending) {
     gf_group->update_type[frame_index] = ARF_UPDATE;
     gf_group->rf_level[frame_index] = GF_ARF_STD;
-    gf_group->bit_allocation[frame_index] = gf_arf_bits;
-
     gf_group->arf_src_offset[frame_index] =
         (unsigned char)(rc->baseline_gf_interval - 1);
 
@@ -1792,34 +2356,38 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
     // We index ALTREF's as: KEY ----- ALT2 ----- ALT1 ----- ALT0
     // but code them in the following order:
     // KEY-ALT0-ALT2 ----- OVERLAY2-ALT1 ----- OVERLAY1 ----- OVERLAY0
-    arf_pos[0] =
-        frame_index + cpi->num_extra_arfs + gf_group->arf_src_offset[1] + 1;
+    //
+    // arf_pos_for_ovrly[]: Position for OVERLAY
+    // arf_pos_in_gf[]:     Position for ALTREF
+    cpi->arf_pos_for_ovrly[0] = frame_index + cpi->num_extra_arfs +
+                                gf_group->arf_src_offset[frame_index] + 1;
     for (i = 0; i < cpi->num_extra_arfs; ++i) {
-      arf_pos[i + 1] =
+      cpi->arf_pos_for_ovrly[i + 1] =
           frame_index + (cpi->num_extra_arfs - i) * (ext_arf_interval + 2);
-      subgroup_interval[i] = arf_pos[i] - arf_pos[i + 1] - (i == 0 ? 1 : 2);
+      subgroup_interval[i] = cpi->arf_pos_for_ovrly[i] -
+                             cpi->arf_pos_for_ovrly[i + 1] - (i == 0 ? 1 : 2);
     }
-    subgroup_interval[cpi->num_extra_arfs] = arf_pos[cpi->num_extra_arfs] -
-                                             frame_index -
-                                             (cpi->num_extra_arfs == 0 ? 1 : 2);
+    subgroup_interval[cpi->num_extra_arfs] =
+        cpi->arf_pos_for_ovrly[cpi->num_extra_arfs] - frame_index -
+        (cpi->num_extra_arfs == 0 ? 1 : 2);
 #endif  // CONFIG_EXT_REFS
 
     ++frame_index;
 
 #if CONFIG_EXT_REFS
     // Insert an extra ARF
+    // === [frame_index == 2] ===
     if (cpi->num_extra_arfs) {
-      gf_group->update_type[frame_index] = ARF_UPDATE;
-      // Note (weitinglin): GF_ARF_LOW is also used as an identifier
-      //                    for internal ALT_REF's:
+      gf_group->update_type[frame_index] = INTNL_ARF_UPDATE;
       gf_group->rf_level[frame_index] = GF_ARF_LOW;
       gf_group->arf_src_offset[frame_index] = ext_arf_interval;
+
       gf_group->arf_update_idx[frame_index] = which_arf;
       gf_group->arf_ref_idx[frame_index] = 0;
       ++frame_index;
     }
     accumulative_subgroup_interval += subgroup_interval[cpi->num_extra_arfs];
-#else
+#else   // !CONFIG_EXT_REFS
     if (cpi->multi_arf_enabled) {
       // Set aside a slot for a level 1 arf.
       gf_group->update_type[frame_index] = ARF_UPDATE;
@@ -1838,30 +2406,14 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
   mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
 #endif  // !CONFIG_EXT_REFS
 
-  // Allocate bits to the other frames in the group.
   for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
 #if !CONFIG_EXT_REFS
     int arf_idx = 0;
-#endif  // !CONFIG_EXT_REFS
-
-    if (EOF == input_stats(twopass, &frame_stats)) break;
-
-    modified_err = calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
-
-    if (group_error > 0)
-      err_fraction = modified_err / DOUBLE_DIVIDE_CHECK(group_error);
-    else
-      err_fraction = 0.0;
-
-    target_frame_size = (int)((double)total_group_bits * err_fraction);
 
     if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
-      mid_boost_bits += (target_frame_size >> 4);
-      target_frame_size -= (target_frame_size >> 4);
-#if !CONFIG_EXT_REFS
       if (frame_index <= mid_frame_idx) arf_idx = 1;
-#endif  // !CONFIG_EXT_REFS
     }
+#endif  // !CONFIG_EXT_REFS
 
 #if CONFIG_EXT_REFS
     gf_group->arf_update_idx[frame_index] = which_arf;
@@ -1871,12 +2423,12 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
     gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
 #endif  // CONFIG_EXT_REFS
 
-    target_frame_size =
-        clamp(target_frame_size, 0, AOMMIN(max_bits, (int)total_group_bits));
-
 #if CONFIG_EXT_REFS
-    // If we are going to have ARFs, check if we can have BWDREF in this
-    // subgroup.
+    // If we are going to have ARFs, check whether we can have BWDREF in this
+    // subgroup, and further, whether we can have ARF subgroup which contains
+    // the BWDREF subgroup but contained within the GF group:
+    //
+    // GF group --> ARF subgroup --> BWDREF subgroup
     if (rc->source_alt_ref_pending) {
       is_sg_bipred_enabled =
           is_bipred_enabled &&
@@ -1890,24 +2442,26 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
     if (is_sg_bipred_enabled && !bipred_group_end) {
       const int cur_brf_src_offset = rc->bipred_group_interval - 1;
 
-      // --- BRF_UPDATE ---
       if (bipred_frame_index == 1) {
+        // --- BRF_UPDATE ---
         gf_group->update_type[frame_index] = BRF_UPDATE;
-        gf_group->bidir_pred_enabled[frame_index] = 1;
+        gf_group->rf_level[frame_index] = GF_ARF_LOW;
         gf_group->brf_src_offset[frame_index] = cur_brf_src_offset;
-        // --- LAST_BIPRED_UPDATE ---
       } else if (bipred_frame_index == rc->bipred_group_interval) {
+        // --- LAST_BIPRED_UPDATE ---
         gf_group->update_type[frame_index] = LAST_BIPRED_UPDATE;
-        gf_group->bidir_pred_enabled[frame_index] = 1;
+        gf_group->rf_level[frame_index] = INTER_NORMAL;
         gf_group->brf_src_offset[frame_index] = 0;
+
         // Reset the bi-predictive frame index.
         bipred_frame_index = 0;
-        // --- BIPRED_UPDATE ---
       } else {
+        // --- BIPRED_UPDATE ---
         gf_group->update_type[frame_index] = BIPRED_UPDATE;
-        gf_group->bidir_pred_enabled[frame_index] = 1;
+        gf_group->rf_level[frame_index] = INTER_NORMAL;
         gf_group->brf_src_offset[frame_index] = 0;
       }
+      gf_group->bidir_pred_enabled[frame_index] = 1;
 
       bipred_frame_index++;
       // Check whether the next bi-predictive frame group would entirely be
@@ -1920,51 +2474,30 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
     } else {
 #endif  // CONFIG_EXT_REFS
       gf_group->update_type[frame_index] = LF_UPDATE;
+      gf_group->rf_level[frame_index] = INTER_NORMAL;
 #if CONFIG_EXT_REFS
       gf_group->bidir_pred_enabled[frame_index] = 0;
       gf_group->brf_src_offset[frame_index] = 0;
     }
 #endif  // CONFIG_EXT_REFS
 
-#if CONFIG_EXT_REFS
-    if (gf_group->update_type[frame_index] == BRF_UPDATE) {
-      // Boost up the allocated bits on BWDREF_FRAME
-      gf_group->rf_level[frame_index] = GF_ARF_LOW;
-      gf_group->bit_allocation[frame_index] =
-          target_frame_size + (target_frame_size >> 2);
-    } else if (gf_group->update_type[frame_index] == LAST_BIPRED_UPDATE) {
-      // Press down the allocated bits on LAST_BIPRED_UPDATE frames
-      gf_group->rf_level[frame_index] = INTER_NORMAL;
-      gf_group->bit_allocation[frame_index] =
-          target_frame_size - (target_frame_size >> 1);
-    } else if (gf_group->update_type[frame_index] == BIPRED_UPDATE) {
-      // TODO(zoeliu): To investigate whether the allocated bits on
-      // BIPRED_UPDATE frames need to be further adjusted.
-      gf_group->rf_level[frame_index] = INTER_NORMAL;
-      gf_group->bit_allocation[frame_index] = target_frame_size;
-    } else {
-#endif  // CONFIG_EXT_REFS
-      gf_group->rf_level[frame_index] = INTER_NORMAL;
-      gf_group->bit_allocation[frame_index] = target_frame_size;
-#if CONFIG_EXT_REFS
-    }
-#endif  // CONFIG_EXT_REFS
-
     ++frame_index;
 
 #if CONFIG_EXT_REFS
-    // Check if we need to update the ARF
+    // Check if we need to update the ARF.
     if (is_sg_bipred_enabled && cpi->num_extra_arfs && which_arf > 0 &&
-        frame_index > arf_pos[which_arf]) {
+        frame_index > cpi->arf_pos_for_ovrly[which_arf]) {
       --which_arf;
       accumulative_subgroup_interval += subgroup_interval[which_arf] + 1;
-      // Meet the new subgroup. Reset the bipred_group_end flag;
+
+      // Meet the new subgroup; Reset the bipred_group_end flag.
       bipred_group_end = 0;
       // Insert another extra ARF after the overlay frame
       if (which_arf) {
-        gf_group->update_type[frame_index] = ARF_UPDATE;
+        gf_group->update_type[frame_index] = INTNL_ARF_UPDATE;
         gf_group->rf_level[frame_index] = GF_ARF_LOW;
         gf_group->arf_src_offset[frame_index] = ext_arf_interval;
+
         gf_group->arf_update_idx[frame_index] = which_arf;
         gf_group->arf_ref_idx[frame_index] = 0;
         ++frame_index;
@@ -1973,10 +2506,9 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
 #endif  // CONFIG_EXT_REFS
   }
 
-// Note:
-// We need to configure the frame at the end of the sequence + 1 that will be
-// the start frame for the next group. Otherwise prior to the call to
-// av1_rc_get_second_pass_params() the data will be undefined.
+// NOTE: We need to configure the frame at the end of the sequence + 1 that will
+//       be the start frame for the next group. Otherwise prior to the call to
+//       av1_rc_get_second_pass_params() the data will be undefined.
 #if CONFIG_EXT_REFS
   gf_group->arf_update_idx[frame_index] = 0;
   gf_group->arf_ref_idx[frame_index] = 0;
@@ -1990,23 +2522,22 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
     gf_group->rf_level[frame_index] = INTER_NORMAL;
 
 #if CONFIG_EXT_REFS
+    cpi->arf_pos_in_gf[0] = 1;
     if (cpi->num_extra_arfs) {
+      // Overwrite the update_type for extra-ARF's corresponding internal
+      // OVERLAY's: Change from LF_UPDATE to INTNL_OVERLAY_UPDATE.
       for (i = cpi->num_extra_arfs; i > 0; --i) {
-        int arf_pos_in_gf = (i == cpi->num_extra_arfs ? 2 : arf_pos[i + 1] + 1);
-        gf_group->bit_allocation[arf_pos_in_gf] =
-            gf_group->bit_allocation[arf_pos[i]];
-        gf_group->update_type[arf_pos[i]] = INTNL_OVERLAY_UPDATE;
-        gf_group->bit_allocation[arf_pos[i]] = 0;
-        gf_group->rf_level[arf_pos[i]] = INTER_NORMAL;
+        cpi->arf_pos_in_gf[i] =
+            (i == cpi->num_extra_arfs ? 2 : cpi->arf_pos_for_ovrly[i + 1] + 1);
+
+        gf_group->update_type[cpi->arf_pos_for_ovrly[i]] = INTNL_OVERLAY_UPDATE;
+        gf_group->rf_level[cpi->arf_pos_for_ovrly[i]] = INTER_NORMAL;
       }
     }
 #else
     // Final setup for second arf and its overlay.
     if (cpi->multi_arf_enabled) {
-      gf_group->bit_allocation[2] =
-          gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits;
       gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE;
-      gf_group->bit_allocation[mid_frame_idx] = 0;
     }
 #endif  // CONFIG_EXT_REFS
   } else {
@@ -2018,6 +2549,168 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
   gf_group->bidir_pred_enabled[frame_index] = 0;
   gf_group->brf_src_offset[frame_index] = 0;
 #endif  // CONFIG_EXT_REFS
+}
+
+static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
+                                   double group_error, int gf_arf_bits) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  FIRSTPASS_STATS frame_stats;
+  int i;
+  int frame_index = 0;
+  int target_frame_size;
+  int key_frame;
+  const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
+  int64_t total_group_bits = gf_group_bits;
+  double modified_err = 0.0;
+  double err_fraction;
+  int mid_boost_bits = 0;
+#if CONFIG_EXT_REFS
+  int ext_arf_boost[MAX_EXT_ARFS];
+#else
+  int mid_frame_idx;
+#endif  // CONFIG_EXT_REFS
+
+  define_gf_group_structure(cpi);
+
+#if CONFIG_EXT_REFS
+  av1_zero_array(ext_arf_boost, MAX_EXT_ARFS);
+#endif  // CONFIG_EXT_REFS
+
+  key_frame = cpi->common.frame_type == KEY_FRAME;
+
+  // For key frames the frame target rate is already set and it
+  // is also the golden frame.
+  // === [frame_index == 0] ===
+  if (!key_frame) {
+    if (rc->source_alt_ref_active)
+      gf_group->bit_allocation[frame_index] = 0;
+    else
+      gf_group->bit_allocation[frame_index] = gf_arf_bits;
+
+    // Step over the golden frame / overlay frame
+    if (EOF == input_stats(twopass, &frame_stats)) return;
+  }
+
+  // Deduct the boost bits for arf (or gf if it is not a key frame)
+  // from the group total.
+  if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits;
+
+  frame_index++;
+
+  // Store the bits to spend on the ARF if there is one.
+  // === [frame_index == 1] ===
+  if (rc->source_alt_ref_pending) {
+    gf_group->bit_allocation[frame_index] = gf_arf_bits;
+
+    ++frame_index;
+
+#if CONFIG_EXT_REFS
+    // Skip all the extra-ARF's right after ARF at the starting segment of
+    // the current GF group.
+    if (cpi->num_extra_arfs) {
+      while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
+        ++frame_index;
+    }
+#else   // !CONFIG_EXT_REFS
+    // Set aside a slot for a level 1 arf.
+    if (cpi->multi_arf_enabled) ++frame_index;
+#endif  // CONFIG_EXT_REFS
+  }
+
+#if !CONFIG_EXT_REFS
+  // Define middle frame
+  mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
+#endif  // !CONFIG_EXT_REFS
+
+  // Allocate bits to the other frames in the group.
+  for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
+    if (EOF == input_stats(twopass, &frame_stats)) break;
+
+    modified_err = calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
+
+    if (group_error > 0)
+      err_fraction = modified_err / DOUBLE_DIVIDE_CHECK(group_error);
+    else
+      err_fraction = 0.0;
+
+    target_frame_size = (int)((double)total_group_bits * err_fraction);
+
+    if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
+      mid_boost_bits += (target_frame_size >> 4);
+      target_frame_size -= (target_frame_size >> 4);
+    }
+
+    target_frame_size =
+        clamp(target_frame_size, 0, AOMMIN(max_bits, (int)total_group_bits));
+
+#if CONFIG_EXT_REFS
+    if (gf_group->update_type[frame_index] == BRF_UPDATE) {
+      // Boost up the allocated bits on BWDREF_FRAME
+      gf_group->bit_allocation[frame_index] =
+          target_frame_size + (target_frame_size >> 2);
+    } else if (gf_group->update_type[frame_index] == LAST_BIPRED_UPDATE) {
+      // Press down the allocated bits on LAST_BIPRED_UPDATE frames
+      gf_group->bit_allocation[frame_index] =
+          target_frame_size - (target_frame_size >> 1);
+    } else if (gf_group->update_type[frame_index] == BIPRED_UPDATE) {
+      // TODO(zoeliu): To investigate whether the allocated bits on
+      // BIPRED_UPDATE frames need to be further adjusted.
+      gf_group->bit_allocation[frame_index] = target_frame_size;
+    } else {
+      assert(gf_group->update_type[frame_index] == LF_UPDATE ||
+             gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE);
+#endif  // CONFIG_EXT_REFS
+      gf_group->bit_allocation[frame_index] = target_frame_size;
+#if CONFIG_EXT_REFS
+    }
+#endif  // CONFIG_EXT_REFS
+
+    ++frame_index;
+
+#if CONFIG_EXT_REFS
+    // Skip all the extra-ARF's.
+    if (cpi->num_extra_arfs) {
+      while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
+        ++frame_index;
+    }
+#endif  // CONFIG_EXT_REFS
+  }
+
+  // NOTE: We need to configure the frame at the end of the sequence + 1 that
+  //       will be the start frame for the next group. Otherwise prior to the
+  //       call to av1_rc_get_second_pass_params() the data will be undefined.
+  if (rc->source_alt_ref_pending) {
+#if CONFIG_EXT_REFS
+    if (cpi->num_extra_arfs) {
+      // NOTE: For bit allocation, move the allocated bits associated with
+      //       INTNL_OVERLAY_UPDATE to the corresponding INTNL_ARF_UPDATE.
+      //       i > 0 for extra-ARF's and i == 0 for ARF:
+      //         arf_pos_for_ovrly[i]: Position for INTNL_OVERLAY_UPDATE
+      //         arf_pos_in_gf[i]: Position for INTNL_ARF_UPDATE
+      for (i = cpi->num_extra_arfs; i > 0; --i) {
+        assert(gf_group->update_type[cpi->arf_pos_for_ovrly[i]] ==
+               INTNL_OVERLAY_UPDATE);
+
+        // Encoder's choice:
+        //   Set show_existing_frame == 1 for all extra-ARF's, and hence
+        //   allocate zero bits for all internal OVERLAY frames.
+        gf_group->bit_allocation[cpi->arf_pos_in_gf[i]] =
+            gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]];
+        gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]] = 0;
+      }
+    }
+#else
+    // Final setup for second arf and its overlay.
+    if (cpi->multi_arf_enabled) {
+      gf_group->bit_allocation[2] =
+          gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits;
+      gf_group->bit_allocation[mid_frame_idx] = 0;
+    }
+#endif  // CONFIG_EXT_REFS
+  }
 
   // Note whether multi-arf was enabled this group for next time.
   cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled;
@@ -2068,10 +2761,10 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   const int is_key_frame = frame_is_intra_only(cm);
   const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
 
-#if CONFIG_FLEX_REFS
+#if CONFIG_EXT_REFS
   cpi->extra_arf_allowed = 1;
   cpi->bwd_ref_allowed = 1;
-#endif
+#endif  // CONFIG_EXT_REFS
 
   // Reset the GF group data structures unless this is a key
   // frame in which case it will already have been done.
@@ -2133,11 +2826,15 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     }
   }
 
-#if CONFIG_FLEX_REFS
+#if CONFIG_EXT_REFS || CONFIG_BGSPRITE
   double avg_sr_coded_error = 0;
   double avg_raw_err_stdev = 0;
   int non_zero_stdev_count = 0;
-#endif  // CONFIG_FLEX_REFS
+#endif  // CONFIG_EXT_REFS || CONFIG_BGSPRITE
+#if CONFIG_BGSPRITE
+  double avg_pcnt_second_ref = 0;
+  int non_zero_pcnt_second_ref_count = 0;
+#endif
 
   i = 0;
   while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) {
@@ -2162,14 +2859,20 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     accumulate_frame_motion_stats(
         &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
         &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-#if CONFIG_FLEX_REFS
+#if CONFIG_EXT_REFS || CONFIG_BGSPRITE
     // sum up the metric values of current gf group
     avg_sr_coded_error += next_frame.sr_coded_error;
-    if (next_frame.raw_error_stdev) {
+    if (fabs(next_frame.raw_error_stdev) > 0.000001) {
       non_zero_stdev_count++;
       avg_raw_err_stdev += next_frame.raw_error_stdev;
     }
-#endif  // CONFIG_FLEX_REFS
+#endif  // CONFIG_EXT_REFS || CONFIG_BGSPRITE
+#if CONFIG_BGSPRITE
+    if (this_frame->pcnt_second_ref) {
+      avg_pcnt_second_ref += this_frame->pcnt_second_ref;
+    }
+    non_zero_pcnt_second_ref_count++;
+#endif  // CONFIG_BGSPRITE
 
     // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
@@ -2209,8 +2912,18 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
              (abs_mv_in_out_accumulator > 3.0) ||
              (mv_in_out_accumulator < -2.0) ||
              ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) {
-      boost_score = old_boost_score;
-      break;
+#if CONFIG_EXT_REFS
+      // If GF group interval is < 12, we force it to be 8. Otherwise,
+      // if it is >= 12, we keep it as is.
+      // NOTE: 'i' is 1 more than the GF group interval candidate that is being
+      //       checked.
+      if (i == (8 + 1) || i >= (12 + 1)) {
+#endif  // CONFIG_EXT_REFS
+        boost_score = old_boost_score;
+        break;
+#if CONFIG_EXT_REFS
+      }
+#endif  // CONFIG_EXT_REFS
     }
 
     *this_frame = next_frame;
@@ -2221,6 +2934,13 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // Was the group length constrained by the requirement for a new KF?
   rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
 
+#if CONFIG_EXT_REFS || CONFIG_BGSPRITE
+  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+                                                             : cpi->common.MBs;
+  assert(num_mbs > 0);
+  if (i) avg_sr_coded_error /= i;
+#endif  // CONFIG_EXT_REFS || CONFIG_BGSPRITE
+
   // Should we use the alternate reference frame.
   if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
       (i >= rc->min_gf_interval)) {
@@ -2235,6 +2955,17 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
          (zero_motion_accumulator < 0.995))
             ? 1
             : 0;
+#if CONFIG_BGSPRITE
+    if (non_zero_pcnt_second_ref_count) {
+      avg_pcnt_second_ref /= non_zero_pcnt_second_ref_count;
+    }
+
+    cpi->bgsprite_allowed = 1;
+    if (abs_mv_in_out_accumulator > 0.30 || decay_accumulator < 0.90 ||
+        avg_sr_coded_error / num_mbs < 20 || avg_pcnt_second_ref < 0.30) {
+      cpi->bgsprite_allowed = 0;
+    }
+#endif  // CONFIG_BGSPRITE
   } else {
     rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST);
     rc->source_alt_ref_pending = 0;
@@ -2243,19 +2974,13 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // Set the interval until the next gf.
   rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
 #if CONFIG_EXT_REFS
-#if CONFIG_FLEX_REFS
-  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                             : cpi->common.MBs;
-  if (i) avg_sr_coded_error /= i;
   if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
 
-  // Disable extra alter refs and backward ref for "still" gf group
-  // zero_motion_accumulator indicates the minimum percentage of (0, 0) motion
-  // in gf group
-  // avg_sr_coded_error indicates the average of the sse per pixel of each frame
-  // in gf group
-  // avg_raw_err_stdev indicates the average of the standard deviation of (0, 0)
-  // motion error per block of each frame in gf group
+  // Disable extra altrefs and backward refs for "still" gf group:
+  //   zero_motion_accumulator: minimum percentage of (0,0) motion;
+  //   avg_sr_coded_error:      average of the SSE per pixel of each frame;
+  //   avg_raw_err_stdev:       average of the standard deviation of (0,0)
+  //                            motion error per block of each frame.
   assert(num_mbs > 0);
   const int disable_bwd_extarf =
       (zero_motion_accumulator > MIN_ZERO_MOTION &&
@@ -2264,13 +2989,13 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
   if (disable_bwd_extarf) cpi->extra_arf_allowed = cpi->bwd_ref_allowed = 0;
 
-  if (!cpi->extra_arf_allowed)
+  if (!cpi->extra_arf_allowed) {
     cpi->num_extra_arfs = 0;
-  else
-#endif  // CONFIG_FLEX_REFS
+  } else {
     // Compute how many extra alt_refs we can have
     cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval,
                                                    rc->source_alt_ref_pending);
+  }
   // Currently at maximum two extra ARFs' are allowed
   assert(cpi->num_extra_arfs <= MAX_EXT_ARFS);
 #endif  // CONFIG_EXT_REFS
@@ -2652,7 +3377,8 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       boost_score += (decay_accumulator * frame_boost);
     }
   }
-  av_decay_accumulator /= (double)loop_decay_counter;
+  if (loop_decay_counter > 0)
+    av_decay_accumulator /= (double)loop_decay_counter;
 
   reset_fpf_position(twopass, start_position);
 
@@ -2698,11 +3424,158 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   twopass->modified_error_left -= kf_group_err;
 }
 
+#if USE_GF16_MULTI_LAYER
+// === GF Group of 16 ===
+void av1_ref_frame_map_idx_updates(AV1_COMP *cpi, int gf_frame_index) {
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+
+  int ref_fb_idx_prev[REF_FRAMES];
+  int ref_fb_idx_curr[REF_FRAMES];
+
+  ref_fb_idx_prev[LAST_FRAME - LAST_FRAME] =
+      cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME];
+  ref_fb_idx_prev[LAST2_FRAME - LAST_FRAME] =
+      cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME];
+  ref_fb_idx_prev[LAST3_FRAME - LAST_FRAME] =
+      cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME];
+  ref_fb_idx_prev[GOLDEN_FRAME - LAST_FRAME] = cpi->gld_fb_idx;
+  ref_fb_idx_prev[BWDREF_FRAME - LAST_FRAME] = cpi->bwd_fb_idx;
+  ref_fb_idx_prev[ALTREF2_FRAME - LAST_FRAME] = cpi->alt2_fb_idx;
+  ref_fb_idx_prev[ALTREF_FRAME - LAST_FRAME] = cpi->alt_fb_idx;
+  ref_fb_idx_prev[REF_FRAMES - LAST_FRAME] = cpi->ext_fb_idx;
+
+  // Update map index for each reference frame
+  for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) {
+    int ref_frame = gf_group->ref_fb_idx_map[gf_frame_index][ref_idx];
+    ref_fb_idx_curr[ref_idx] = ref_fb_idx_prev[ref_frame - LAST_FRAME];
+  }
+
+  cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME] =
+      ref_fb_idx_curr[LAST_FRAME - LAST_FRAME];
+  cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME] =
+      ref_fb_idx_curr[LAST2_FRAME - LAST_FRAME];
+  cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME] =
+      ref_fb_idx_curr[LAST3_FRAME - LAST_FRAME];
+  cpi->gld_fb_idx = ref_fb_idx_curr[GOLDEN_FRAME - LAST_FRAME];
+  cpi->bwd_fb_idx = ref_fb_idx_curr[BWDREF_FRAME - LAST_FRAME];
+  cpi->alt2_fb_idx = ref_fb_idx_curr[ALTREF2_FRAME - LAST_FRAME];
+  cpi->alt_fb_idx = ref_fb_idx_curr[ALTREF_FRAME - LAST_FRAME];
+  cpi->ext_fb_idx = ref_fb_idx_curr[REF_FRAMES - LAST_FRAME];
+}
+
+// Define the reference buffers updated post encode for a 16-frame GF group.
+static void configure_buffer_updates_16(AV1_COMP *cpi) {
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+
+  if (gf_group->update_type[gf_group->index] == KF_UPDATE) {
+    cpi->refresh_fb_idx = 0;
+
+    cpi->refresh_last_frame = 1;
+    cpi->refresh_golden_frame = 1;
+    cpi->refresh_bwd_ref_frame = 1;
+    cpi->refresh_alt2_ref_frame = 1;
+    cpi->refresh_alt_ref_frame = 1;
+
+    return;
+  }
+
+  // Update reference frame map indexes
+  av1_ref_frame_map_idx_updates(cpi, gf_group->index);
+
+  // Update refresh index
+  switch (gf_group->refresh_idx[gf_group->index]) {
+    case LAST_FRAME:
+      cpi->refresh_fb_idx = cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME];
+      break;
+
+    case LAST2_FRAME:
+      cpi->refresh_fb_idx = cpi->lst_fb_idxes[LAST2_FRAME - LAST_FRAME];
+      break;
+
+    case LAST3_FRAME:
+      cpi->refresh_fb_idx = cpi->lst_fb_idxes[LAST3_FRAME - LAST_FRAME];
+      break;
+
+    case GOLDEN_FRAME: cpi->refresh_fb_idx = cpi->gld_fb_idx; break;
+
+    case BWDREF_FRAME: cpi->refresh_fb_idx = cpi->bwd_fb_idx; break;
+
+    case ALTREF2_FRAME: cpi->refresh_fb_idx = cpi->alt2_fb_idx; break;
+
+    case ALTREF_FRAME: cpi->refresh_fb_idx = cpi->alt_fb_idx; break;
+
+    case REF_FRAMES: cpi->refresh_fb_idx = cpi->ext_fb_idx; break;
+
+    default: assert(0); break;
+  }
+
+  // Update refresh flags
+  switch (gf_group->refresh_flag[gf_group->index]) {
+    case LAST_FRAME:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+
+    case GOLDEN_FRAME:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+
+    case BWDREF_FRAME:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 1;
+      cpi->refresh_alt2_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+
+    case ALTREF2_FRAME:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 1;
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+
+    case ALTREF_FRAME:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 1;
+      break;
+
+    default: assert(0); break;
+  }
+
+  switch (gf_group->update_type[gf_group->index]) {
+    case BRF_UPDATE: cpi->rc.is_bwd_ref_frame = 1; break;
+
+    case LAST_BIPRED_UPDATE: cpi->rc.is_last_bipred_frame = 1; break;
+
+    case BIPRED_UPDATE: cpi->rc.is_bipred_frame = 1; break;
+
+    case INTNL_OVERLAY_UPDATE: cpi->rc.is_src_frame_ext_arf = 1;  // fallthrough
+    case OVERLAY_UPDATE: cpi->rc.is_src_frame_alt_ref = 1; break;
+
+    default: break;
+  }
+}
+#endif  // USE_GF16_MULTI_LAYER
+
 // Define the reference buffers that will be updated post encode.
 static void configure_buffer_updates(AV1_COMP *cpi) {
   TWO_PASS *const twopass = &cpi->twopass;
 
-  // Wei-Ting: Should we define another function to take care of
+  // NOTE(weitinglin): Should we define another function to take care of
   // cpi->rc.is_$Source_Type to make this function as it is in the comment?
 
   cpi->rc.is_src_frame_alt_ref = 0;
@@ -2711,45 +3584,42 @@ static void configure_buffer_updates(AV1_COMP *cpi) {
   cpi->rc.is_last_bipred_frame = 0;
   cpi->rc.is_bipred_frame = 0;
   cpi->rc.is_src_frame_ext_arf = 0;
+
+#if USE_GF16_MULTI_LAYER
+  RATE_CONTROL *const rc = &cpi->rc;
+  if (rc->baseline_gf_interval == 16) {
+    configure_buffer_updates_16(cpi);
+    return;
+  }
+#endif  // USE_GF16_MULTI_LAYER
 #endif  // CONFIG_EXT_REFS
 
   switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
-    case KF_UPDATE:
+    case KF_UPDATE: cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 1;
 #if CONFIG_EXT_REFS
       cpi->refresh_bwd_ref_frame = 1;
+      cpi->refresh_alt2_ref_frame = 1;
 #endif  // CONFIG_EXT_REFS
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 1;
       cpi->refresh_alt_ref_frame = 1;
       break;
 
-    case LF_UPDATE:
+    case LF_UPDATE: cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 0;
 #if CONFIG_EXT_REFS
-      // If we have extra ALT_REFs, we can use the farthest ALT (ALT0) as
-      // the BWD_REF.
-      if (cpi->num_extra_arfs) {
-        int tmp = cpi->bwd_fb_idx;
-
-        cpi->bwd_fb_idx = cpi->alt_fb_idx;
-        cpi->alt_fb_idx = cpi->arf_map[0];
-        cpi->arf_map[0] = tmp;
-
-        cpi->rc.is_bwd_ref_frame = 1;
-      } else {
-        cpi->rc.is_bwd_ref_frame = 0;
-      }
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
 #endif  // CONFIG_EXT_REFS
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 0;
       cpi->refresh_alt_ref_frame = 0;
       break;
 
     case GF_UPDATE:
+      // TODO(zoeliu): To further investigate whether 'refresh_last_frame' is
+      //               needed.
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 1;
 #if CONFIG_EXT_REFS
       cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
 #endif  // CONFIG_EXT_REFS
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 1;
       cpi->refresh_alt_ref_frame = 0;
       break;
 
@@ -2758,17 +3628,19 @@ static void configure_buffer_updates(AV1_COMP *cpi) {
       cpi->refresh_golden_frame = 1;
 #if CONFIG_EXT_REFS
       cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
 #endif  // CONFIG_EXT_REFS
       cpi->refresh_alt_ref_frame = 0;
+
       cpi->rc.is_src_frame_alt_ref = 1;
       break;
 
-    case ARF_UPDATE:
+    case ARF_UPDATE: cpi->refresh_last_frame = 0; cpi->refresh_golden_frame = 0;
 #if CONFIG_EXT_REFS
-      cpi->refresh_bwd_ref_frame = 1;
+      // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
 #endif  // CONFIG_EXT_REFS
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
       cpi->refresh_alt_ref_frame = 1;
       break;
 
@@ -2777,26 +3649,19 @@ static void configure_buffer_updates(AV1_COMP *cpi) {
       cpi->refresh_last_frame = 0;
       cpi->refresh_golden_frame = 0;
       cpi->refresh_bwd_ref_frame = 1;
+      cpi->refresh_alt2_ref_frame = 0;
       cpi->refresh_alt_ref_frame = 0;
+
       cpi->rc.is_bwd_ref_frame = 1;
-      if (cpi->num_extra_arfs) {
-        // Allow BRF use the farthest ALT_REF (ALT0) as BWD_REF by swapping
-        // the virtual indices.
-        // NOTE: The indices will be swapped back after this frame is encoded
-        //       (in av1_update_reference_frames()).
-        int tmp = cpi->bwd_fb_idx;
-
-        cpi->bwd_fb_idx = cpi->alt_fb_idx;
-        cpi->alt_fb_idx = cpi->arf_map[0];
-        cpi->arf_map[0] = tmp;
-      }
       break;
 
     case LAST_BIPRED_UPDATE:
       cpi->refresh_last_frame = 1;
       cpi->refresh_golden_frame = 0;
       cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
       cpi->refresh_alt_ref_frame = 0;
+
       cpi->rc.is_last_bipred_frame = 1;
       break;
 
@@ -2804,7 +3669,9 @@ static void configure_buffer_updates(AV1_COMP *cpi) {
       cpi->refresh_last_frame = 1;
       cpi->refresh_golden_frame = 0;
       cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
       cpi->refresh_alt_ref_frame = 0;
+
       cpi->rc.is_bipred_frame = 1;
       break;
 
@@ -2812,10 +3679,20 @@ static void configure_buffer_updates(AV1_COMP *cpi) {
       cpi->refresh_last_frame = 1;
       cpi->refresh_golden_frame = 0;
       cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 0;
       cpi->refresh_alt_ref_frame = 0;
+
       cpi->rc.is_src_frame_alt_ref = 1;
       cpi->rc.is_src_frame_ext_arf = 1;
       break;
+
+    case INTNL_ARF_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt2_ref_frame = 1;
+      cpi->refresh_alt_ref_frame = 0;
+      break;
 #endif  // CONFIG_EXT_REFS
 
     default: assert(0); break;
@@ -2857,7 +3734,11 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
 
   // If this is an arf frame then we dont want to read the stats file or
   // advance the input pointer as we already have what we need.
-  if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+  if (gf_group->update_type[gf_group->index] == ARF_UPDATE
+#if CONFIG_EXT_REFS
+      || gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
+#endif  // CONFIG_EXT_REFS
+      ) {
     configure_buffer_updates(cpi);
     target_rate = gf_group->bit_allocation[gf_group->index];
     target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
@@ -2935,7 +3816,7 @@ void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
       FILE *fpfile;
       fpfile = fopen("arf.stt", "a");
       ++arf_count;
-      fprintf(fpfile, "%10d %10ld %10d %10d %10ld\n", cm->current_video_frame,
+      fprintf(fpfile, "%10d %10d %10d %10d %10d\n", cm->current_video_frame,
               rc->frames_till_gf_update_due, rc->kf_boost, arf_count,
               rc->gfu_boost);
 
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h
index 266766d99..9ac542bf3 100644
--- a/third_party/aom/av1/encoder/firstpass.h
+++ b/third_party/aom/av1/encoder/firstpass.h
@@ -12,6 +12,8 @@
 #ifndef AV1_ENCODER_FIRSTPASS_H_
 #define AV1_ENCODER_FIRSTPASS_H_
 
+#include "av1/common/enums.h"
+#include "av1/common/onyxc_int.h"
 #include "av1/encoder/lookahead.h"
 #include "av1/encoder/ratectrl.h"
 
@@ -45,19 +47,24 @@ typedef struct {
 // NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain
 //       number of bi-predictive frames.
 #define BFG_INTERVAL 2
-// The maximum number of extra ALT_REF's
-// NOTE: This number cannot be greater than 2 or the reference frame buffer will
-//       overflow.
-#define MAX_EXT_ARFS 2
+// The maximum number of extra ALTREF's except ALTREF_FRAME
+// NOTE: REF_FRAMES indicates the maximum number of frames that may be buffered
+//       to serve as references. Currently REF_FRAMES == 8.
+#define USE_GF16_MULTI_LAYER 0
+
+#if USE_GF16_MULTI_LAYER
+#define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME)
+#else  // !USE_GF16_MULTI_LAYER
+#define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
+#endif  // USE_GF16_MULTI_LAYER
+
 #define MIN_EXT_ARF_INTERVAL 4
-#endif  // CONFIG_EXT_REFS
 
-#if CONFIG_FLEX_REFS
 #define MIN_ZERO_MOTION 0.95
 #define MAX_SR_CODED_ERROR 40
 #define MAX_RAW_ERR_VAR 2000
 #define MIN_MV_IN_OUT 0.4
-#endif  // CONFIG_FLEX_REFS
+#endif  // CONFIG_EXT_REFS
 
 #define VLOW_MOTION_THRESHOLD 950
 
@@ -84,10 +91,10 @@ typedef struct {
   double new_mv_count;
   double duration;
   double count;
-#if CONFIG_FLEX_REFS
+#if CONFIG_EXT_REFS || CONFIG_BGSPRITE
   // standard deviation for (0, 0) motion prediction error
   double raw_error_stdev;
-#endif  // CONFIG_FLEX_REFS
+#endif  // CONFIG_EXT_REFS || CONFIG_BGSPRITE
 } FIRSTPASS_STATS;
 
 typedef enum {
@@ -101,8 +108,9 @@ typedef enum {
   LAST_BIPRED_UPDATE = 6,    // Last Bi-predictive Frame
   BIPRED_UPDATE = 7,         // Bi-predictive Frame, but not the last one
   INTNL_OVERLAY_UPDATE = 8,  // Internal Overlay Frame
-  FRAME_UPDATE_TYPES = 9
-#else
+  INTNL_ARF_UPDATE = 9,      // Internal Altref Frame (candidate for ALTREF2)
+  FRAME_UPDATE_TYPES = 10
+#else   // !CONFIG_EXT_REFS
   FRAME_UPDATE_TYPES = 5
 #endif  // CONFIG_EXT_REFS
 } FRAME_UPDATE_TYPE;
@@ -124,6 +132,9 @@ typedef struct {
 #if CONFIG_EXT_REFS
   unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char ref_fb_idx_map[(MAX_LAG_BUFFERS * 2) + 1][REF_FRAMES];
+  unsigned char refresh_idx[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char refresh_flag[(MAX_LAG_BUFFERS * 2) + 1];
 #endif  // CONFIG_EXT_REFS
   int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
 } GF_GROUP;
@@ -183,12 +194,15 @@ void av1_end_first_pass(struct AV1_COMP *cpi);
 
 void av1_init_second_pass(struct AV1_COMP *cpi);
 void av1_rc_get_second_pass_params(struct AV1_COMP *cpi);
-void av1_twopass_postencode_update(struct AV1_COMP *cpi);
 
 // Post encode update of the rate control parameters for 2-pass
 void av1_twopass_postencode_update(struct AV1_COMP *cpi);
 
 #if CONFIG_EXT_REFS
+#if USE_GF16_MULTI_LAYER
+void av1_ref_frame_map_idx_updates(struct AV1_COMP *cpi, int gf_frame_index);
+#endif  // USE_GF16_MULTI_LAYER
+
 static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
   if (arf_pending && MAX_EXT_ARFS > 0)
     return interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1)
diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c
index 661a1feb4..4d44e9a6f 100644
--- a/third_party/aom/av1/encoder/global_motion.c
+++ b/third_party/aom/av1/encoder/global_motion.c
@@ -244,14 +244,18 @@ static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm,
                                         int bit_depth) {
   int i, j;
   uint16_t *orig_buf = CONVERT_TO_SHORTPTR(frm->y_buffer);
-  uint8_t *buf = malloc(frm->y_height * frm->y_stride * sizeof(*buf));
-
-  for (i = 0; i < frm->y_height; ++i)
-    for (j = 0; j < frm->y_width; ++j)
-      buf[i * frm->y_stride + j] =
-          orig_buf[i * frm->y_stride + j] >> (bit_depth - 8);
-
-  return buf;
+  uint8_t *buf_8bit = frm->y_buffer_8bit;
+  assert(buf_8bit);
+  if (!frm->buf_8bit_valid) {
+    for (i = 0; i < frm->y_height; ++i) {
+      for (j = 0; j < frm->y_width; ++j) {
+        buf_8bit[i * frm->y_stride + j] =
+            orig_buf[i * frm->y_stride + j] >> (bit_depth - 8);
+      }
+    }
+    frm->buf_8bit_valid = 1;
+  }
+  return buf_8bit;
 }
 #endif
 
@@ -274,16 +278,10 @@ int compute_global_motion_feature_based(
   if (frm->flags & YV12_FLAG_HIGHBITDEPTH) {
     // The frame buffer is 16-bit, so we need to convert to 8 bits for the
     // following code. We cache the result until the frame is released.
-    if (frm->y_buffer_8bit)
-      frm_buffer = frm->y_buffer_8bit;
-    else
-      frm_buffer = frm->y_buffer_8bit = downconvert_frame(frm, bit_depth);
+    frm_buffer = downconvert_frame(frm, bit_depth);
   }
   if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
-    if (ref->y_buffer_8bit)
-      ref_buffer = ref->y_buffer_8bit;
-    else
-      ref_buffer = ref->y_buffer_8bit = downconvert_frame(ref, bit_depth);
+    ref_buffer = downconvert_frame(ref, bit_depth);
   }
 #endif
 
diff --git a/third_party/aom/av1/encoder/hash.c b/third_party/aom/av1/encoder/hash.c
new file mode 100644
index 000000000..89c5bd8a3
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/hash.h"
+
+static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator,
+                                        uint8_t *pData, uint32_t dataLength) {
+  for (uint32_t i = 0; i < dataLength; i++) {
+    const uint8_t index =
+        (p_crc_calculator->remainder >> (p_crc_calculator->bits - 8)) ^
+        pData[i];
+    p_crc_calculator->remainder <<= 8;
+    p_crc_calculator->remainder ^= p_crc_calculator->table[index];
+  }
+}
+
+void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
+  p_crc_calculator->remainder = 0;
+}
+
+static uint32_t crc_calculator_get_crc(CRC_CALCULATOR *p_crc_calculator) {
+  return p_crc_calculator->remainder & p_crc_calculator->final_result_mask;
+}
+
+// Precomputes table[v]: the remainder left after feeding byte v, MSB first,
+// through trunc_poly. Valid for 1 <= bits <= 32; bits must already be set.
+static void crc_calculator_init_table(CRC_CALCULATOR *p_crc_calculator) {
+  // Unsigned operands: a signed `1 << 31` would be undefined for bits == 32.
+  const uint32_t high_bit = (uint32_t)1 << (p_crc_calculator->bits - 1);
+  const uint32_t byte_high_bit = (uint32_t)1 << (8 - 1);
+
+  for (uint32_t value = 0; value < 256; value++) {
+    uint32_t remainder = 0;
+    for (uint8_t mask = byte_high_bit; mask != 0; mask >>= 1) {
+      if (value & mask) remainder ^= high_bit;
+      // Shift out the top bit; fold in the polynomial when that bit was set.
+      const int top_bit_set = (remainder & high_bit) != 0;
+      remainder <<= 1;
+      if (top_bit_set) remainder ^= p_crc_calculator->trunc_poly;
+    }
+    p_crc_calculator->table[value] = remainder;
+  }
+}
+
+// Sets up a `bits`-wide CRC with polynomial truncPoly and builds its table.
+void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
+                             uint32_t truncPoly) {
+  p_crc_calculator->remainder = 0;
+  p_crc_calculator->bits = bits;
+  p_crc_calculator->trunc_poly = truncPoly;
+  // 64-bit shift keeps the mask well defined for every width up to 32.
+  p_crc_calculator->final_result_mask = (uint32_t)(((uint64_t)1 << bits) - 1);
+  crc_calculator_init_table(p_crc_calculator);
+}
+
+uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
+                           int length) {
+  crc_calculator_reset(p_crc_calculator);
+  crc_calculator_process_data(p_crc_calculator, p, length);
+  return crc_calculator_get_crc(p_crc_calculator);
+}
diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h
new file mode 100644
index 000000000..a0fd54fb6
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_HASH_H_
+#define AV1_ENCODER_HASH_H_
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _crc_calculator {
+  uint32_t remainder;
+  uint32_t trunc_poly;
+  uint32_t bits;
+  uint32_t table[256];
+  uint32_t final_result_mask;
+} CRC_CALCULATOR;
+
+// Initialize the crc calculator. It must be executed at least once before
+// calling av1_get_crc_value().
+void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
+                             uint32_t truncPoly);
+
+uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
+                           int length);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AV1_ENCODER_HASH_H_
diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c
new file mode 100644
index 000000000..2378597ad
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash_motion.c
@@ -0,0 +1,380 @@
+#include <assert.h>
+#include "av1/encoder/hash.h"
+#include "av1/encoder/hash_motion.h"
+#include "./av1_rtcd.h"
+
+static const int crc_bits = 16;
+static const int block_size_bits = 3;
+static CRC_CALCULATOR crc_calculator1;
+static CRC_CALCULATOR crc_calculator2;
+static int g_crc_initialized = 0;
+
+// Destroys and frees every bucket vector, leaving all table slots NULL.
+// The slot array itself is kept; a table with no slot array is left untouched.
+static void hash_table_clear_all(hash_table *p_hash_table) {
+  Vector **const buckets = p_hash_table->p_lookup_table;
+  if (buckets == NULL) return;
+  const int num_buckets = 1 << (crc_bits + block_size_bits);
+  for (int idx = 0; idx < num_buckets; ++idx) {
+    if (buckets[idx] == NULL) continue;
+    vector_destroy(buckets[idx]);
+    aom_free(buckets[idx]);
+    buckets[idx] = NULL;
+  }
+}
+
+// TODO(youzhou@microsoft.com): is higher than 8 bits screen content supported?
+// If yes, fix this function
+static void get_pixels_in_1D_char_array_by_block_2x2(uint8_t *y_src, int stride,
+                                                     uint8_t *p_pixels_in1D) {
+  uint8_t *p_pel = y_src;
+  int index = 0;
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < 2; j++) {
+      p_pixels_in1D[index++] = p_pel[j];
+    }
+    p_pel += stride;
+  }
+}
+
+// Returns 1 when each row of the 2x2 block holds a single value, else 0.
+// p holds the four pixels in raster order: p[0] p[1] / p[2] p[3].
+static int is_block_2x2_row_same_value(uint8_t *p) {
+  const int top_row_flat = (p[0] == p[1]);
+  const int bottom_row_flat = (p[2] == p[3]);
+  return top_row_flat && bottom_row_flat;
+}
+
+// Returns 1 when each column of the 2x2 block holds a single value, else 0.
+// p holds the four pixels in raster order: p[0] p[1] / p[2] p[3].
+static int is_block_2x2_col_same_value(uint8_t *p) {
+  const int left_col_flat = (p[0] == p[2]);
+  const int right_col_flat = (p[1] == p[3]);
+  return left_col_flat && right_col_flat;
+}
+
+// The hash value (hash_value1) consists of two parts: the upper 3 bits encode
+// the block size and the lower 16 bits are the crc value. This function is
+// used to get the upper 3 bits.
+static int hash_block_size_to_index(int block_size) {
+  switch (block_size) {
+    case 4: return 0;
+    case 8: return 1;
+    case 16: return 2;
+    case 32: return 3;
+    case 64: return 4;
+    default: return -1;
+  }
+}
+
+void av1_hash_table_init(hash_table *p_hash_table) {
+  if (g_crc_initialized == 0) {
+    av1_crc_calculator_init(&crc_calculator1, 24, 0x5D6DCB);
+    av1_crc_calculator_init(&crc_calculator2, 24, 0x864CFB);
+    g_crc_initialized = 1;
+  }
+  p_hash_table->p_lookup_table = NULL;
+}
+
+void av1_hash_table_destroy(hash_table *p_hash_table) {
+  hash_table_clear_all(p_hash_table);
+  aom_free(p_hash_table->p_lookup_table);
+  p_hash_table->p_lookup_table = NULL;
+}
+
+// Allocates a zeroed lookup table, or empties the existing one for reuse.
+// p_lookup_table is NULL afterwards if the allocation fails (no NULL write).
+void av1_hash_table_create(hash_table *p_hash_table) {
+  if (p_hash_table->p_lookup_table != NULL) {
+    hash_table_clear_all(p_hash_table);
+    return;
+  }
+  const int max_addr = 1 << (crc_bits + block_size_bits);
+  p_hash_table->p_lookup_table = (Vector **)aom_calloc(
+      max_addr, sizeof(p_hash_table->p_lookup_table[0]));
+}
+
+// Pushes *curr_block_hash onto the bucket vector selected by hash_value,
+// allocating and setting up that vector the first time the bucket is used.
+static void hash_table_add_to_table(hash_table *p_hash_table,
+                                    uint32_t hash_value,
+                                    block_hash *curr_block_hash) {
+  Vector **const bucket = &p_hash_table->p_lookup_table[hash_value];
+  if (*bucket == NULL) {
+    // First entry for this hash value: create the vector before pushing.
+    *bucket = aom_malloc(sizeof(**bucket));
+    vector_setup(*bucket, 10, sizeof(curr_block_hash[0]));
+  }
+  vector_push_back(*bucket, curr_block_hash);
+}
+
+// Returns how many block_hash entries are stored under hash_value; a bucket
+// that was never created counts as empty. The table must have been created.
+int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value) {
+  const Vector *const bucket = p_hash_table->p_lookup_table[hash_value];
+  if (bucket == NULL) return 0;
+  return (int32_t)bucket->size;
+}
+
+Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
+                                     uint32_t hash_value) {
+  assert(av1_hash_table_count(p_hash_table, hash_value) > 0);
+  return vector_begin(p_hash_table->p_lookup_table[hash_value]);
+}
+
+int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
+                            uint32_t hash_value2) {
+  if (p_hash_table->p_lookup_table[hash_value1] == NULL) {
+    return 0;
+  }
+  Iterator iterator = vector_begin(p_hash_table->p_lookup_table[hash_value1]);
+  Iterator last = vector_end(p_hash_table->p_lookup_table[hash_value1]);
+  for (; !iterator_equals(&iterator, &last); iterator_increment(&iterator)) {
+    if ((*(block_hash *)iterator_get(&iterator)).hash_value2 == hash_value2) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
+                                       uint32_t *pic_block_hash[2],
+                                       int8_t *pic_block_same_info[3]) {
+  const int width = 2;
+  const int height = 2;
+  const int x_end = picture->y_crop_width - width + 1;
+  const int y_end = picture->y_crop_height - height + 1;
+
+  const int length = width * 2;
+  uint8_t p[4];
+
+  int pos = 0;
+  for (int y_pos = 0; y_pos < y_end; y_pos++) {
+    for (int x_pos = 0; x_pos < x_end; x_pos++) {
+      get_pixels_in_1D_char_array_by_block_2x2(
+          picture->y_buffer + y_pos * picture->y_stride + x_pos,
+          picture->y_stride, p);
+      pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p);
+      pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p);
+
+      pic_block_hash[0][pos] =
+          av1_get_crc_value(&crc_calculator1, p, length * sizeof(p[0]));
+      pic_block_hash[1][pos] =
+          av1_get_crc_value(&crc_calculator2, p, length * sizeof(p[0]));
+
+      pos++;
+    }
+    pos += width - 1;
+  }
+}
+
+void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
+                                   int block_size,
+                                   uint32_t *src_pic_block_hash[2],
+                                   uint32_t *dst_pic_block_hash[2],
+                                   int8_t *src_pic_block_same_info[3],
+                                   int8_t *dst_pic_block_same_info[3]) {
+  const int pic_width = picture->y_crop_width;
+  const int x_end = picture->y_crop_width - block_size + 1;
+  const int y_end = picture->y_crop_height - block_size + 1;
+
+  const int src_size = block_size >> 1;
+  const int quad_size = block_size >> 2;
+
+  uint32_t p[4];
+  const int length = sizeof(p);
+
+  int pos = 0;
+  for (int y_pos = 0; y_pos < y_end; y_pos++) {
+    for (int x_pos = 0; x_pos < x_end; x_pos++) {
+      p[0] = src_pic_block_hash[0][pos];
+      p[1] = src_pic_block_hash[0][pos + src_size];
+      p[2] = src_pic_block_hash[0][pos + src_size * pic_width];
+      p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size];
+      dst_pic_block_hash[0][pos] =
+          av1_get_crc_value(&crc_calculator1, (uint8_t *)p, length);
+
+      p[0] = src_pic_block_hash[1][pos];
+      p[1] = src_pic_block_hash[1][pos + src_size];
+      p[2] = src_pic_block_hash[1][pos + src_size * pic_width];
+      p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size];
+      dst_pic_block_hash[1][pos] =
+          av1_get_crc_value(&crc_calculator2, (uint8_t *)p, length);
+
+      dst_pic_block_same_info[0][pos] =
+          src_pic_block_same_info[0][pos] &&
+          src_pic_block_same_info[0][pos + quad_size] &&
+          src_pic_block_same_info[0][pos + src_size] &&
+          src_pic_block_same_info[0][pos + src_size * pic_width] &&
+          src_pic_block_same_info[0][pos + src_size * pic_width + quad_size] &&
+          src_pic_block_same_info[0][pos + src_size * pic_width + src_size];
+
+      dst_pic_block_same_info[1][pos] =
+          src_pic_block_same_info[1][pos] &&
+          src_pic_block_same_info[1][pos + src_size] &&
+          src_pic_block_same_info[1][pos + quad_size * pic_width] &&
+          src_pic_block_same_info[1][pos + quad_size * pic_width + src_size] &&
+          src_pic_block_same_info[1][pos + src_size * pic_width] &&
+          src_pic_block_same_info[1][pos + src_size * pic_width + src_size];
+      pos++;
+    }
+    pos += block_size - 1;
+  }
+
+  if (block_size >= 4) {
+    const int size_minus1 = block_size - 1;
+    pos = 0;
+    for (int y_pos = 0; y_pos < y_end; y_pos++) {
+      for (int x_pos = 0; x_pos < x_end; x_pos++) {
+        dst_pic_block_same_info[2][pos] =
+            (!dst_pic_block_same_info[0][pos] &&
+             !dst_pic_block_same_info[1][pos]) ||
+            (((x_pos & size_minus1) == 0) && ((y_pos & size_minus1) == 0));
+        pos++;
+      }
+      pos += block_size - 1;
+    }
+  }
+}
+
+void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
+                                                 uint32_t *pic_hash[2],
+                                                 int8_t *pic_is_same,
+                                                 int pic_width, int pic_height,
+                                                 int block_size) {
+  const int x_end = pic_width - block_size + 1;
+  const int y_end = pic_height - block_size + 1;
+
+  const int8_t *src_is_added = pic_is_same;
+  const uint32_t *src_hash[2] = { pic_hash[0], pic_hash[1] };
+
+  int add_value = hash_block_size_to_index(block_size);
+  assert(add_value >= 0);
+  add_value <<= crc_bits;
+  const int crc_mask = (1 << crc_bits) - 1;
+
+  for (int x_pos = 0; x_pos < x_end; x_pos++) {
+    for (int y_pos = 0; y_pos < y_end; y_pos++) {
+      const int pos = y_pos * pic_width + x_pos;
+      // valid data
+      if (src_is_added[pos]) {
+        block_hash curr_block_hash;
+        curr_block_hash.x = x_pos;
+        curr_block_hash.y = y_pos;
+
+        const uint32_t hash_value1 = (src_hash[0][pos] & crc_mask) + add_value;
+        curr_block_hash.hash_value2 = src_hash[1][pos];
+
+        hash_table_add_to_table(p_hash_table, hash_value1, &curr_block_hash);
+      }
+    }
+  }
+}
+
+int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
+                                   int block_size, int x_start, int y_start) {
+  const int stride = picture->y_stride;
+  const uint8_t *p = picture->y_buffer + y_start * stride + x_start;
+
+  for (int i = 0; i < block_size; i++) {
+    for (int j = 1; j < block_size; j++) {
+      if (p[j] != p[0]) {
+        return 0;
+      }
+    }
+    p += stride;
+  }
+
+  return 1;
+}
+
+int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
+                                 int block_size, int x_start, int y_start) {
+  const int stride = picture->y_stride;
+  const uint8_t *p = picture->y_buffer + y_start * stride + x_start;
+
+  for (int i = 0; i < block_size; i++) {
+    for (int j = 1; j < block_size; j++) {
+      if (p[j * stride + i] != p[i]) {
+        return 0;
+      }
+    }
+  }
+
+  return 1;
+}
+
+// global buffer for hash value calculation of a block
+// used only in av1_get_block_hash_value()
+static uint32_t hash_value_buffer[2][2][1024];  // [first hash/second hash]
+                                                // [two buffers used ping-pong]
+                                                // [num of 2x2 blocks in 64x64]
+
+// Hashes the block_size x block_size block at y_src: 2x2 sub-blocks first,
+// then groups of four hashes are re-hashed until one value is left per CRC.
+// *hash_value1 = block-size index above the low crc_bits of the first hash;
+// *hash_value2 = the second hash. Not reentrant (static hash_value_buffer).
+void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
+                              uint32_t *hash_value1, uint32_t *hash_value2) {
+  uint8_t pixel_to_hash[4];
+  uint32_t to_hash[4];
+  const int size_index = hash_block_size_to_index(block_size);
+  assert(size_index >= 0);  // check before shifting; negative << is UB
+  const int add_value = size_index << crc_bits;
+  const int crc_mask = (1 << crc_bits) - 1;
+
+  // 2x2 subblock hash values in current CU
+  int sub_block_in_width = (block_size >> 1);
+  for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
+    for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
+      int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
+      get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos,
+                                               stride, pixel_to_hash);
+      hash_value_buffer[0][0][pos] = av1_get_crc_value(
+          &crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash));
+      hash_value_buffer[1][0][pos] = av1_get_crc_value(
+          &crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash));
+    }
+  }
+
+  int src_sub_block_in_width = sub_block_in_width;
+  sub_block_in_width >>= 1;
+  int src_idx = 1;
+  int dst_idx = 0;
+
+  // 4x4 subblock hash values to current block hash values
+  for (int sub_width = 4; sub_width <= block_size; sub_width *= 2) {
+    src_idx = 1 - src_idx;
+    dst_idx = 1 - dst_idx;
+    int dst_pos = 0;
+    for (int y_pos = 0; y_pos < sub_block_in_width; y_pos++) {
+      for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) {
+        int srcPos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1);
+        to_hash[0] = hash_value_buffer[0][src_idx][srcPos];
+        to_hash[1] = hash_value_buffer[0][src_idx][srcPos + 1];
+        to_hash[2] =
+            hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width];
+        to_hash[3] =
+            hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width + 1];
+        hash_value_buffer[0][dst_idx][dst_pos] = av1_get_crc_value(
+            &crc_calculator1, (uint8_t *)to_hash, sizeof(to_hash));
+
+        to_hash[0] = hash_value_buffer[1][src_idx][srcPos];
+        to_hash[1] = hash_value_buffer[1][src_idx][srcPos + 1];
+        to_hash[2] =
+            hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width];
+        to_hash[3] =
+            hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width + 1];
+        hash_value_buffer[1][dst_idx][dst_pos] = av1_get_crc_value(
+            &crc_calculator2, (uint8_t *)to_hash, sizeof(to_hash));
+        dst_pos++;
+      }
+    }
+
+    src_sub_block_in_width = sub_block_in_width;
+    sub_block_in_width >>= 1;
+  }
+
+  *hash_value1 = (hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value;
+  *hash_value2 = hash_value_buffer[1][dst_idx][0];
+}
diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h
new file mode 100644
index 000000000..26e1ac46e
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash_motion.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_HASH_MOTION_H_
+#define AV1_ENCODER_HASH_MOTION_H_
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_scale/yv12config.h"
+#include "third_party/vector/vector.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// block_hash stores one block's hash info: x and y give the block's position
+// from the top left of the picture; hash_value2 holds the second hash value.
+// hash_table (below) wraps the lookup table in p_lookup_table.
+typedef struct _block_hash {
+  int16_t x;
+  int16_t y;
+  uint32_t hash_value2;
+} block_hash;
+
+typedef struct _hash_table { Vector **p_lookup_table; } hash_table;
+
+void av1_hash_table_init(hash_table *p_hash_table);
+void av1_hash_table_destroy(hash_table *p_hash_table);
+void av1_hash_table_create(hash_table *p_hash_table);
+int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value);
+Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
+                                     uint32_t hash_value);
+int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
+                            uint32_t hash_value2);
+void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
+                                       uint32_t *pic_block_hash[2],
+                                       int8_t *pic_block_same_info[3]);
+void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
+                                   int block_size,
+                                   uint32_t *src_pic_block_hash[2],
+                                   uint32_t *dst_pic_block_hash[2],
+                                   int8_t *src_pic_block_same_info[3],
+                                   int8_t *dst_pic_block_same_info[3]);
+void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
+                                                 uint32_t *pic_hash[2],
+                                                 int8_t *pic_is_same,
+                                                 int pic_width, int pic_height,
+                                                 int block_size);
+
+// Checks whether the block_size x block_size block whose top-left corner is
+// at (x_start, y_start) has the same color in all rows.
+int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
+                                   int block_size, int x_start, int y_start);
+// Checks whether the block_size x block_size block whose top-left corner is
+// at (x_start, y_start) has the same color in all columns.
+int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
+                                 int block_size, int x_start, int y_start);
+void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
+                              uint32_t *hash_value1, uint32_t *hash_value2);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AV1_ENCODER_HASH_MOTION_H_
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
index 85f4b7d9b..6ddeb2b77 100644
--- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
@@ -51,7 +51,7 @@ static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
     return;
   }
 
-#if CONFIG_LGT
+#if CONFIG_LGT || CONFIG_DAALA_DCT4
   // only C version has LGTs
   av1_fht4x4_c(src_diff, coeff, diff_stride, txfm_param);
 #else
@@ -107,7 +107,7 @@ static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
 
 static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                          int diff_stride, TxfmParam *txfm_param) {
-#if CONFIG_LGT
+#if CONFIG_LGT || CONFIG_DAALA_DCT8
   av1_fht8x8_c(src_diff, coeff, diff_stride, txfm_param);
 #else
   av1_fht8x8(src_diff, coeff, diff_stride, txfm_param);
@@ -116,7 +116,11 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
 
 static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
                            int diff_stride, TxfmParam *txfm_param) {
+#if CONFIG_DAALA_DCT16  // call the _c function directly, as 4x4/8x8 do
+  av1_fht16x16_c(src_diff, coeff, diff_stride, txfm_param);
+#else
   av1_fht16x16(src_diff, coeff, diff_stride, txfm_param);
+#endif  // CONFIG_DAALA_DCT16
 }
 
 static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
@@ -136,11 +140,31 @@ static void fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
                            int diff_stride, TxfmParam *txfm_param) {
 #if CONFIG_EXT_TX
   if (txfm_param->tx_type == IDTX)
-    av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, txfm_param->tx_type);
+    av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, 64, txfm_param->tx_type);
   else
 #endif
     av1_fht64x64(src_diff, coeff, diff_stride, txfm_param);
 }
+
+static void fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff,
+                           int diff_stride, TxfmParam *txfm_param) {
+#if CONFIG_EXT_TX  // IDTX is only checked for when EXT_TX is enabled
+  if (txfm_param->tx_type == IDTX)
+    av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, 64, txfm_param->tx_type);
+  else  // binds to the av1_fht32x64 call that follows the #endif
+#endif
+    av1_fht32x64(src_diff, coeff, diff_stride, txfm_param);
+}
+
+static void fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff,
+                           int diff_stride, TxfmParam *txfm_param) {
+#if CONFIG_EXT_TX  // IDTX is only checked for when EXT_TX is enabled
+  if (txfm_param->tx_type == IDTX)
+    av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, 32, txfm_param->tx_type);
+  else  // binds to the av1_fht64x32 call that follows the #endif
+#endif
+    av1_fht64x32(src_diff, coeff, diff_stride, txfm_param);
+}
 #endif  // CONFIG_TX64X64
 
 #if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
@@ -211,7 +235,7 @@ static void highbd_fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff,
 static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
                                 int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
-  const int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
   if (txfm_param->lossless) {
     assert(tx_type == DCT_DCT);
@@ -296,7 +320,7 @@ static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
 static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                                 int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
-  const int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
   switch (tx_type) {
     case DCT_DCT:
@@ -334,7 +358,7 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
 static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
-  const int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
   switch (tx_type) {
     case DCT_DCT:
@@ -372,7 +396,7 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
 static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
-  const int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
   switch (tx_type) {
     case DCT_DCT:
@@ -408,10 +432,89 @@ static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
 }
 
 #if CONFIG_TX64X64
+// 32x64 forward transform for the high-bitdepth path; dispatches on tx_type.
+static void highbd_fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  int32_t *dst_coeff = (int32_t *)coeff;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  switch (tx_type) {
+    case DCT_DCT:
+      av1_fwd_txfm2d_32x64_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+      break;
+#if CONFIG_EXT_TX
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      // TODO(sarahparker): the adst, flipadst and identity implementations
+      // were removed for simplicity and will return in a later change, so
+      // these types fall back to DCT_DCT. The note was written for 64x64,
+      // where DCT_DCT is the only extended type allowed by
+      // get_ext_tx_set_type in blockd.h; confirm the same holds for 32x64.
+      av1_fwd_txfm2d_32x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
+      break;
+    case IDTX:
+      av1_fwd_idtx_c(src_diff, dst_coeff, diff_stride, 32, 64, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0); break;
+  }
+}
+
+// 64x32 forward transform for the high-bitdepth path; dispatches on tx_type.
+static void highbd_fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  int32_t *dst_coeff = (int32_t *)coeff;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  switch (tx_type) {
+    case DCT_DCT:
+      av1_fwd_txfm2d_64x32_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+      break;
+#if CONFIG_EXT_TX
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      // TODO(sarahparker): the adst, flipadst and identity implementations
+      // were removed for simplicity and will return in a later change, so
+      // these types fall back to DCT_DCT. The note was written for 64x64,
+      // where DCT_DCT is the only extended type allowed by
+      // get_ext_tx_set_type in blockd.h; confirm the same holds for 64x32.
+      av1_fwd_txfm2d_64x32_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
+      break;
+    case IDTX:
+      av1_fwd_idtx_c(src_diff, dst_coeff, diff_stride, 64, 32, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0); break;
+  }
+}
 static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
-  const int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
   switch (tx_type) {
     case DCT_DCT:
@@ -441,7 +544,7 @@ static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
       av1_fwd_txfm2d_64x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
       break;
     case IDTX:
-      av1_fwd_idtx_c(src_diff, dst_coeff, diff_stride, 64, tx_type);
+      av1_fwd_idtx_c(src_diff, dst_coeff, diff_stride, 64, 64, tx_type);
       break;
 #endif  // CONFIG_EXT_TX
     default: assert(0); break;
@@ -452,11 +555,25 @@ static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
 void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
                   TxfmParam *txfm_param) {
   const TX_SIZE tx_size = txfm_param->tx_size;
+#if CONFIG_LGT_FROM_PRED
+  if (txfm_param->use_lgt) {
+    // if use_lgt is 1, it will override tx_type
+    assert(is_lgt_allowed(txfm_param->mode, tx_size));
+    flgt2d_from_pred_c(src_diff, coeff, diff_stride, txfm_param);
+    return;
+  }
+#endif  // CONFIG_LGT_FROM_PRED
   switch (tx_size) {
 #if CONFIG_TX64X64
     case TX_64X64:
       fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param);
       break;
+    case TX_32X64:
+      fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param);
+      break;
+    case TX_64X32:
+      fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param);
+      break;
 #endif  // CONFIG_TX64X64
     case TX_32X32:
       fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param);
@@ -509,6 +626,12 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
     case TX_64X64:
       highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param);
       break;
+    case TX_32X64:
+      highbd_fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param);
+      break;
+    case TX_64X32:
+      highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param);
+      break;
 #endif  // CONFIG_TX64X64
     case TX_32X32:
       highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param);
diff --git a/third_party/aom/av1/encoder/k_means_template.h b/third_party/aom/av1/encoder/k_means_template.h
new file mode 100644
index 000000000..3a433d9b5
--- /dev/null
+++ b/third_party/aom/av1/encoder/k_means_template.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+
+#ifndef AV1_K_MEANS_DIM
+#error "This template requires AV1_K_MEANS_DIM to be defined"
+#endif
+
+#define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y)
+#define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM)
+
+// Squared Euclidean distance between two AV1_K_MEANS_DIM-dimensional points.
+static float RENAME(calc_dist)(const float *p1, const float *p2) {
+  float sum_sq = 0;
+  for (int d = 0; d < AV1_K_MEANS_DIM; ++d) {
+    const float delta = p1[d] - p2[d];
+    sum_sq += delta * delta;
+  }
+  return sum_sq;
+}
+
+// Assigns each point the index of its nearest centroid (ties: lowest index).
+void RENAME(av1_calc_indices)(const float *data, const float *centroids,
+                              uint8_t *indices, int n, int k) {
+  for (int pt = 0; pt < n; ++pt) {
+    const float *const point = data + pt * AV1_K_MEANS_DIM;
+    float best_dist = RENAME(calc_dist)(point, centroids);
+    indices[pt] = 0;
+    for (int c = 1; c < k; ++c) {
+      const float d = RENAME(calc_dist)(point, centroids + c * AV1_K_MEANS_DIM);
+      if (d < best_dist) {
+        best_dist = d;
+        indices[pt] = (uint8_t)c;
+      }
+    }
+  }
+}
+
+// Recomputes the k centroids as the rounded mean of the points assigned to
+// each; a centroid with no points is copied from an lcg_rand16-chosen sample.
+static void RENAME(calc_centroids)(const float *data, float *centroids,
+                                   const uint8_t *indices, int n, int k) {
+  int count[PALETTE_MAX_SIZE];
+  // The generator is seeded from the first sample value.
+  unsigned int rand_state = (unsigned int)data[0];
+  assert(n <= 32768);
+  memset(count, 0, sizeof(count[0]) * k);
+  memset(centroids, 0, sizeof(centroids[0]) * k * AV1_K_MEANS_DIM);
+
+  // Accumulate per-centroid sums and membership counts.
+  for (int pt = 0; pt < n; ++pt) {
+    const int index = indices[pt];
+    assert(index < k);
+    ++count[index];
+    float *const sum = centroids + index * AV1_K_MEANS_DIM;
+    for (int d = 0; d < AV1_K_MEANS_DIM; ++d)
+      sum[d] += data[pt * AV1_K_MEANS_DIM + d];
+  }
+
+  for (int c = 0; c < k; ++c) {
+    float *const centroid = centroids + c * AV1_K_MEANS_DIM;
+    if (count[c] != 0) {
+      const float norm = 1.0f / count[c];
+      for (int d = 0; d < AV1_K_MEANS_DIM; ++d) centroid[d] *= norm;
+    } else {
+      const float *const sample =
+          data + (lcg_rand16(&rand_state) % n) * AV1_K_MEANS_DIM;
+      memcpy(centroid, sample, sizeof(centroids[0]) * AV1_K_MEANS_DIM);
+    }
+  }
+
+  // Round to nearest integers.
+  for (int v = 0; v < k * AV1_K_MEANS_DIM; ++v)
+    centroids[v] = roundf(centroids[v]);
+}
+
+// Sum over all n points of the squared distance to their assigned centroid.
+static float RENAME(calc_total_dist)(const float *data, const float *centroids,
+                                     const uint8_t *indices, int n, int k) {
+  float total = 0;
+  (void)k;  // k is not used by this function
+  for (int pt = 0; pt < n; ++pt) {
+    const float *const point = data + pt * AV1_K_MEANS_DIM;
+    const float *const centroid = centroids + indices[pt] * AV1_K_MEANS_DIM;
+    total += RENAME(calc_dist)(point, centroid);
+  }
+  return total;
+}
+
+// Refines the caller-initialized centroids with up to max_itr k-means passes
+// over n points; indices receives each point's nearest-centroid index.
+void RENAME(av1_k_means)(const float *data, float *centroids, uint8_t *indices,
+                         int n, int k, int max_itr) {
+  // One saved centroid set: AV1_K_MEANS_DIM floats per palette entry.
+  float pre_centroids[AV1_K_MEANS_DIM * PALETTE_MAX_SIZE];
+  uint8_t pre_indices[MAX_SB_SQUARE];
+  assert(n <= MAX_SB_SQUARE && k <= PALETTE_MAX_SIZE);  // scratch must fit
+  RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+  float this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
+  for (int i = 0; i < max_itr; ++i) {
+    const float pre_dist = this_dist;
+    memcpy(pre_centroids, centroids,
+           sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM);
+    memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
+
+    RENAME(calc_centroids)(data, centroids, indices, n, k);
+    RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+    this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
+
+    if (this_dist > pre_dist) {  // got worse: restore previous state, stop
+      memcpy(centroids, pre_centroids,
+             sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM);
+      memcpy(indices, pre_indices, sizeof(pre_indices[0]) * n);
+      break;
+    }
+    if (!memcmp(centroids, pre_centroids,
+                sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM))
+      break;  // centroids unchanged: converged
+  }
+}
+
+#undef RENAME_
+#undef RENAME
diff --git a/third_party/aom/av1/encoder/mbgraph.c b/third_party/aom/av1/encoder/mbgraph.c
index 3f5daebcc..7d2510af9 100644
--- a/third_party/aom/av1/encoder/mbgraph.c
+++ b/third_party/aom/av1/encoder/mbgraph.c
@@ -47,32 +47,32 @@ static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv,
   av1_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
                  cond_cost_list(cpi, cost_list), &v_fn_ptr, 0, ref_mv);
 
-  // Try sub-pixel MC
-  // if (bestsme > error_thresh && bestsme < INT_MAX)
+// Try sub-pixel MC
+// if (bestsme > error_thresh && bestsme < INT_MAX)
+#if CONFIG_AMVR
+  if (cpi->common.cur_frame_mv_precision_level == 1) {
+    x->best_mv.as_mv.row *= 8;
+    x->best_mv.as_mv.col *= 8;
+  } else {
+#else
   {
+#endif
     int distortion;
     unsigned int sse;
-    cpi->find_fractional_mv_step(
-        x, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
-        &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
-        cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL,
-#if CONFIG_EXT_INTER
-        NULL, 0, 0,
-#endif
-        0, 0, 0);
+    cpi->find_fractional_mv_step(x, ref_mv, cpi->common.allow_high_precision_mv,
+                                 x->errorperbit, &v_fn_ptr, 0,
+                                 mv_sf->subpel_iters_per_step,
+                                 cond_cost_list(cpi, cost_list), NULL, NULL,
+                                 &distortion, &sse, NULL, NULL, 0, 0, 0, 0, 0);
   }
 
-#if CONFIG_EXT_INTER
   if (has_second_ref(&xd->mi[0]->mbmi))
     xd->mi[0]->mbmi.mode = NEW_NEWMV;
   else
-#endif  // CONFIG_EXT_INTER
     xd->mi[0]->mbmi.mode = NEWMV;
 
   xd->mi[0]->mbmi.mv[0] = x->best_mv;
-#if CONFIG_EXT_INTER
   xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME;
-#endif  // CONFIG_EXT_INTER
 
   av1_build_inter_predictors_sby(&cpi->common, xd, mb_row, mb_col, NULL,
                                  BLOCK_16X16);
@@ -136,6 +136,7 @@ static int do_16x16_zerozero_search(AV1_COMP *cpi, int_mv *dst_mv) {
   return err;
 }
 static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) {
+  const AV1_COMMON *cm = &cpi->common;
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   PREDICTION_MODE best_mode = -1, mode;
@@ -147,9 +148,10 @@ static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) {
     unsigned int err;
 
     xd->mi[0]->mbmi.mode = mode;
-    av1_predict_intra_block(xd, 16, 16, BLOCK_16X16, mode, x->plane[0].src.buf,
-                            x->plane[0].src.stride, xd->plane[0].dst.buf,
-                            xd->plane[0].dst.stride, 0, 0, 0);
+    av1_predict_intra_block(cm, xd, 16, 16, BLOCK_16X16, mode,
+                            x->plane[0].src.buf, x->plane[0].src.stride,
+                            xd->plane[0].dst.buf, xd->plane[0].dst.stride, 0, 0,
+                            0);
     err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
                        xd->plane[0].dst.buf, xd->plane[0].dst.stride);
 
diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c
index 4efadff1b..6c8503da0 100644
--- a/third_party/aom/av1/encoder/mcomp.c
+++ b/third_party/aom/av1/encoder/mcomp.c
@@ -176,7 +176,6 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
 }
 
 /* checks if (r, c) has better score than previous best */
-#if CONFIG_EXT_INTER
 #define CHECK_BETTER(v, r, c)                                             \
   if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                 \
     MV this_mv = { r, c };                                                \
@@ -202,34 +201,10 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
   } else {                                                                \
     v = INT_MAX;                                                          \
   }
-#else
-#define CHECK_BETTER(v, r, c)                                             \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                 \
-    MV this_mv = { r, c };                                                \
-    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);    \
-    if (second_pred == NULL)                                              \
-      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r),  \
-                         src_address, src_stride, &sse);                  \
-    else                                                                  \
-      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
-                          src_address, src_stride, &sse, second_pred);    \
-    v += thismse;                                                         \
-    if (v < besterr) {                                                    \
-      besterr = v;                                                        \
-      br = r;                                                             \
-      bc = c;                                                             \
-      *distortion = thismse;                                              \
-      *sse1 = sse;                                                        \
-    }                                                                     \
-  } else {                                                                \
-    v = INT_MAX;                                                          \
-  }
-#endif  // CONFIG_EXT_INTER
 
 #define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
 
 /* checks if (r, c) has better score than previous best */
-#if CONFIG_EXT_INTER
 #define CHECK_BETTER1(v, r, c)                                              \
   if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                   \
     MV this_mv = { r, c };                                                  \
@@ -249,26 +224,6 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
   } else {                                                                  \
     v = INT_MAX;                                                            \
   }
-#else
-#define CHECK_BETTER1(v, r, c)                                              \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                   \
-    MV this_mv = { r, c };                                                  \
-    thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,        \
-                                   pre(y, y_stride, r, c), y_stride, sp(c), \
-                                   sp(r), second_pred, w, h, &sse);         \
-    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);      \
-    v += thismse;                                                           \
-    if (v < besterr) {                                                      \
-      besterr = v;                                                          \
-      br = r;                                                               \
-      bc = c;                                                               \
-      *distortion = thismse;                                                \
-      *sse1 = sse;                                                          \
-    }                                                                       \
-  } else {                                                                  \
-    v = INT_MAX;                                                            \
-  }
-#endif  // CONFIG_EXT_INTER
 
 #define FIRST_LEVEL_CHECKS                                       \
   {                                                              \
@@ -372,35 +327,28 @@ static unsigned int setup_center_error(
     const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
     int error_per_bit, const aom_variance_fn_ptr_t *vfp,
     const uint8_t *const src, const int src_stride, const uint8_t *const y,
-    int y_stride, const uint8_t *second_pred,
-#if CONFIG_EXT_INTER
-    const uint8_t *mask, int mask_stride, int invert_mask,
-#endif
-    int w, int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
-    int *distortion) {
+    int y_stride, const uint8_t *second_pred, const uint8_t *mask,
+    int mask_stride, int invert_mask, int w, int h, int offset, int *mvjcost,
+    int *mvcost[2], unsigned int *sse1, int *distortion) {
   unsigned int besterr;
 #if CONFIG_HIGHBITDEPTH
   if (second_pred != NULL) {
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
-#if CONFIG_EXT_INTER
       if (mask)
         aom_highbd_comp_mask_pred(comp_pred16, second_pred, w, h, y + offset,
                                   y_stride, mask, mask_stride, invert_mask);
       else
-#endif
         aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
                                  y_stride);
       besterr =
           vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1);
     } else {
       DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
-#if CONFIG_EXT_INTER
       if (mask)
         aom_comp_mask_pred(comp_pred, second_pred, w, h, y + offset, y_stride,
                            mask, mask_stride, invert_mask);
       else
-#endif
         aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
       besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
     }
@@ -413,12 +361,10 @@ static unsigned int setup_center_error(
   (void)xd;
   if (second_pred != NULL) {
     DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
-#if CONFIG_EXT_INTER
     if (mask)
       aom_comp_mask_pred(comp_pred, second_pred, w, h, y + offset, y_stride,
                          mask, mask_stride, invert_mask);
     else
-#endif
       aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
     besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
   } else {
@@ -458,19 +404,13 @@ int av1_find_best_sub_pixel_tree_pruned_evenmore(
     MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
     const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
     int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, const uint8_t *second_pred,
-#if CONFIG_EXT_INTER
-    const uint8_t *mask, int mask_stride, int invert_mask,
-#endif
-    int w, int h, int use_upsampled_ref) {
+    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+    int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) {
   SETUP_SUBPEL_SEARCH;
-  besterr =
-      setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address,
-                         src_stride, y, y_stride, second_pred,
-#if CONFIG_EXT_INTER
-                         mask, mask_stride, invert_mask,
-#endif
-                         w, h, offset, mvjcost, mvcost, sse1, distortion);
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               src_address, src_stride, y, y_stride,
+                               second_pred, mask, mask_stride, invert_mask, w,
+                               h, offset, mvjcost, mvcost, sse1, distortion);
   (void)halfiters;
   (void)quarteriters;
   (void)eighthiters;
@@ -531,21 +471,15 @@ int av1_find_best_sub_pixel_tree_pruned_more(
     MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
     const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
     int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, const uint8_t *second_pred,
-#if CONFIG_EXT_INTER
-    const uint8_t *mask, int mask_stride, int invert_mask,
-#endif
-    int w, int h, int use_upsampled_ref) {
+    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+    int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) {
   SETUP_SUBPEL_SEARCH;
   (void)use_upsampled_ref;
 
-  besterr =
-      setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address,
-                         src_stride, y, y_stride, second_pred,
-#if CONFIG_EXT_INTER
-                         mask, mask_stride, invert_mask,
-#endif
-                         w, h, offset, mvjcost, mvcost, sse1, distortion);
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               src_address, src_stride, y, y_stride,
+                               second_pred, mask, mask_stride, invert_mask, w,
+                               h, offset, mvjcost, mvcost, sse1, distortion);
   if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
       cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
       cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
@@ -600,21 +534,15 @@ int av1_find_best_sub_pixel_tree_pruned(
     MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
     const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
     int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, const uint8_t *second_pred,
-#if CONFIG_EXT_INTER
-    const uint8_t *mask, int mask_stride, int invert_mask,
-#endif
-    int w, int h, int use_upsampled_ref) {
+    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+    int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) {
   SETUP_SUBPEL_SEARCH;
   (void)use_upsampled_ref;
 
-  besterr =
-      setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address,
-                         src_stride, y, y_stride, second_pred,
-#if CONFIG_EXT_INTER
-                         mask, mask_stride, invert_mask,
-#endif
-                         w, h, offset, mvjcost, mvcost, sse1, distortion);
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               src_address, src_stride, y, y_stride,
+                               second_pred, mask, mask_stride, invert_mask, w,
+                               h, offset, mvjcost, mvcost, sse1, distortion);
   if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
       cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
       cost_list[4] != INT_MAX) {
@@ -696,26 +624,24 @@ static const MV search_step_table[12] = {
 };
 /* clang-format on */
 
-static int upsampled_pref_error(
-    const MACROBLOCKD *xd, const aom_variance_fn_ptr_t *vfp,
-    const uint8_t *const src, const int src_stride, const uint8_t *const y,
-    int y_stride, int subpel_x_q3, int subpel_y_q3, const uint8_t *second_pred,
-#if CONFIG_EXT_INTER
-    const uint8_t *mask, int mask_stride, int invert_mask,
-#endif
-    int w, int h, unsigned int *sse) {
+static int upsampled_pref_error(const MACROBLOCKD *xd,
+                                const aom_variance_fn_ptr_t *vfp,
+                                const uint8_t *const src, const int src_stride,
+                                const uint8_t *const y, int y_stride,
+                                int subpel_x_q3, int subpel_y_q3,
+                                const uint8_t *second_pred, const uint8_t *mask,
+                                int mask_stride, int invert_mask, int w, int h,
+                                unsigned int *sse) {
   unsigned int besterr;
 #if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
     if (second_pred != NULL) {
-#if CONFIG_EXT_INTER
       if (mask)
         aom_highbd_comp_mask_upsampled_pred(
             pred16, second_pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride,
             mask, mask_stride, invert_mask, xd->bd);
       else
-#endif
         aom_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h,
                                            subpel_x_q3, subpel_y_q3, y,
                                            y_stride, xd->bd);
@@ -732,13 +658,11 @@ static int upsampled_pref_error(
   (void)xd;
 #endif  // CONFIG_HIGHBITDEPTH
     if (second_pred != NULL) {
-#if CONFIG_EXT_INTER
       if (mask)
         aom_comp_mask_upsampled_pred(pred, second_pred, w, h, subpel_x_q3,
                                      subpel_y_q3, y, y_stride, mask,
                                      mask_stride, invert_mask);
       else
-#endif
         aom_comp_avg_upsampled_pred(pred, second_pred, w, h, subpel_x_q3,
                                     subpel_y_q3, y, y_stride);
     } else {
@@ -756,18 +680,12 @@ static unsigned int upsampled_setup_center_error(
     const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
     int error_per_bit, const aom_variance_fn_ptr_t *vfp,
     const uint8_t *const src, const int src_stride, const uint8_t *const y,
-    int y_stride, const uint8_t *second_pred,
-#if CONFIG_EXT_INTER
-    const uint8_t *mask, int mask_stride, int invert_mask,
-#endif
-    int w, int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
-    int *distortion) {
+    int y_stride, const uint8_t *second_pred, const uint8_t *mask,
+    int mask_stride, int invert_mask, int w, int h, int offset, int *mvjcost,
+    int *mvcost[2], unsigned int *sse1, int *distortion) {
   unsigned int besterr = upsampled_pref_error(
-      xd, vfp, src, src_stride, y + offset, y_stride, 0, 0, second_pred,
-#if CONFIG_EXT_INTER
-      mask, mask_stride, invert_mask,
-#endif
-      w, h, sse1);
+      xd, vfp, src, src_stride, y + offset, y_stride, 0, 0, second_pred, mask,
+      mask_stride, invert_mask, w, h, sse1);
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
   return besterr;
@@ -777,11 +695,8 @@ int av1_find_best_sub_pixel_tree(
     MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
     const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
     int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, const uint8_t *second_pred,
-#if CONFIG_EXT_INTER
-    const uint8_t *mask, int mask_stride, int invert_mask,
-#endif
-    int w, int h, int use_upsampled_ref) {
+    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+    int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) {
   const uint8_t *const src_address = x->plane[0].src.buf;
   const int src_stride = x->plane[0].src.stride;
   const MACROBLOCKD *xd = &x->e_mbd;
@@ -818,19 +733,13 @@ int av1_find_best_sub_pixel_tree(
   if (use_upsampled_ref)
     besterr = upsampled_setup_center_error(
         xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
-        y_stride, second_pred,
-#if CONFIG_EXT_INTER
-        mask, mask_stride, invert_mask,
-#endif
-        w, h, offset, mvjcost, mvcost, sse1, distortion);
+        y_stride, second_pred, mask, mask_stride, invert_mask, w, h, offset,
+        mvjcost, mvcost, sse1, distortion);
   else
-    besterr =
-        setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address,
-                           src_stride, y, y_stride, second_pred,
-#if CONFIG_EXT_INTER
-                           mask, mask_stride, invert_mask,
-#endif
-                           w, h, offset, mvjcost, mvcost, sse1, distortion);
+    besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                                 src_address, src_stride, y, y_stride,
+                                 second_pred, mask, mask_stride, invert_mask, w,
+                                 h, offset, mvjcost, mvcost, sse1, distortion);
 
   (void)cost_list;  // to silence compiler warning
 
@@ -845,22 +754,17 @@ int av1_find_best_sub_pixel_tree(
         if (use_upsampled_ref) {
           thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,
                                          pre(y, y_stride, tr, tc), y_stride,
-                                         sp(tc), sp(tr), second_pred,
-#if CONFIG_EXT_INTER
-                                         mask, mask_stride, invert_mask,
-#endif
-                                         w, h, &sse);
+                                         sp(tc), sp(tr), second_pred, mask,
+                                         mask_stride, invert_mask, w, h, &sse);
         } else {
           const uint8_t *const pre_address = pre(y, y_stride, tr, tc);
           if (second_pred == NULL)
             thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
                                src_address, src_stride, &sse);
-#if CONFIG_EXT_INTER
           else if (mask)
             thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr),
                                 src_address, src_stride, second_pred, mask,
                                 mask_stride, invert_mask, &sse);
-#endif
           else
             thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
                                 src_address, src_stride, &sse, second_pred);
@@ -892,23 +796,18 @@ int av1_find_best_sub_pixel_tree(
       if (use_upsampled_ref) {
         thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,
                                        pre(y, y_stride, tr, tc), y_stride,
-                                       sp(tc), sp(tr), second_pred,
-#if CONFIG_EXT_INTER
-                                       mask, mask_stride, invert_mask,
-#endif
-                                       w, h, &sse);
+                                       sp(tc), sp(tr), second_pred, mask,
+                                       mask_stride, invert_mask, w, h, &sse);
       } else {
         const uint8_t *const pre_address = pre(y, y_stride, tr, tc);
 
         if (second_pred == NULL)
           thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address,
                              src_stride, &sse);
-#if CONFIG_EXT_INTER
         else if (mask)
           thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr),
                               src_address, src_stride, second_pred, mask,
                               mask_stride, invert_mask, &sse);
-#endif
         else
           thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
                               src_address, src_stride, &sse, second_pred);
@@ -1225,6 +1124,7 @@ static int pattern_search(
   int thissad;
   int k = -1;
   const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+  assert(search_param < MAX_MVSEARCH_STEPS);
   int best_init_s = search_param_to_steps[search_param];
   // adjust ref_mv to make sure it is within MV range
   clamp_mv(start_mv, x->mv_limits.col_min, x->mv_limits.col_max,
@@ -1493,7 +1393,6 @@ int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
                      : 0);
 }
 
-#if CONFIG_EXT_INTER
 int av1_get_mvpred_mask_var(const MACROBLOCK *x, const MV *best_mv,
                             const MV *center_mv, const uint8_t *second_pred,
                             const uint8_t *mask, int mask_stride,
@@ -1512,7 +1411,6 @@ int av1_get_mvpred_mask_var(const MACROBLOCK *x, const MV *best_mv,
                                    x->errorperbit)
                      : 0);
 }
-#endif
 
 int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param,
                    int sad_per_bit, int do_init_search, int *cost_list,
@@ -2481,11 +2379,9 @@ int av1_refining_search_sad(MACROBLOCK *x, MV *ref_mv, int error_per_bit,
 // mode, or when searching for one component of an ext-inter compound mode.
 int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
                              const aom_variance_fn_ptr_t *fn_ptr,
-#if CONFIG_EXT_INTER
                              const uint8_t *mask, int mask_stride,
-                             int invert_mask,
-#endif
-                             const MV *center_mv, const uint8_t *second_pred) {
+                             int invert_mask, const MV *center_mv,
+                             const uint8_t *second_pred) {
   const MV neighbors[8] = { { -1, 0 },  { 0, -1 }, { 0, 1 },  { 1, 0 },
                             { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } };
   const MACROBLOCKD *const xd = &x->e_mbd;
@@ -2498,14 +2394,12 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
 
   clamp_mv(best_mv, x->mv_limits.col_min, x->mv_limits.col_max,
            x->mv_limits.row_min, x->mv_limits.row_max);
-#if CONFIG_EXT_INTER
   if (mask)
     best_sad = fn_ptr->msdf(what->buf, what->stride,
                             get_buf_from_mv(in_what, best_mv), in_what->stride,
                             second_pred, mask, mask_stride, invert_mask) +
                mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
   else
-#endif
     best_sad =
         fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
                      in_what->stride, second_pred) +
@@ -2520,13 +2414,11 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
 
       if (is_mv_in(&x->mv_limits, &mv)) {
         unsigned int sad;
-#if CONFIG_EXT_INTER
         if (mask)
           sad = fn_ptr->msdf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &mv), in_what->stride,
                              second_pred, mask, mask_stride, invert_mask);
         else
-#endif
           sad = fn_ptr->sdaf(what->buf, what->stride,
                              get_buf_from_mv(in_what, &mv), in_what->stride,
                              second_pred);
@@ -2562,10 +2454,45 @@ static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) {
          (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref;
 }
 
+#if CONFIG_HASH_ME
+#define MAX_HASH_MV_TABLE_SIZE 5  // capacity of the hash-ME candidate table
+static void add_to_sort_table(block_hash block_hashes[MAX_HASH_MV_TABLE_SIZE],
+                              int costs[MAX_HASH_MV_TABLE_SIZE], int *existing,
+                              int max_size, block_hash curr_block,
+                              int curr_cost) {
+  if (*existing < max_size) {  // room left: append and bump the count
+    block_hashes[*existing] = curr_block;
+    costs[*existing] = curr_cost;
+    (*existing)++;
+  } else {  // full: despite the name, entries are kept unsorted
+    int max_cost = 0;  // assumes costs >= 0 (visible caller passes |dx|+|dy|)
+    int max_cost_idx = 0;
+    for (int i = 0; i < max_size; i++) {
+      if (costs[i] > max_cost) {
+        max_cost = costs[i];
+        max_cost_idx = i;
+      }
+    }
+    // Evict the costliest entry only when the newcomer is strictly cheaper.
+    if (curr_cost < max_cost) {
+      block_hashes[max_cost_idx] = curr_block;
+      costs[max_cost_idx] = curr_cost;
+    }
+  }
+}
+#endif
+
+#if CONFIG_HASH_ME
+int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                          MV *mvp_full, int step_param, int error_per_bit,
+                          int *cost_list, const MV *ref_mv, int var_max, int rd,
+                          int x_pos, int y_pos, int intra) {
+#else
 int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                           MV *mvp_full, int step_param, int error_per_bit,
                           int *cost_list, const MV *ref_mv, int var_max,
                           int rd) {
+#endif
   const SPEED_FEATURES *const sf = &cpi->sf;
   const SEARCH_METHODS method = sf->mv.search_method;
   const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
@@ -2637,6 +2564,93 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
   if (method != NSTEP && rd && var < var_max)
     var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1);
 
+#if CONFIG_HASH_ME
+  do {
+    if (!cpi->common.allow_screen_content_tools) {
+      break;
+    }
+    // already single ME
+    // get block size and original buffer of current block
+    const int block_height = block_size_high[bsize];
+    const int block_width = block_size_wide[bsize];
+    if (block_height == block_width && x_pos >= 0 && y_pos >= 0) {
+      if (block_width == 4 || block_width == 8 || block_width == 16 ||
+          block_width == 32 || block_width == 64) {
+        uint8_t *what = x->plane[0].src.buf;
+        const int what_stride = x->plane[0].src.stride;
+        block_hash block_hashes[MAX_HASH_MV_TABLE_SIZE];
+        int costs[MAX_HASH_MV_TABLE_SIZE];
+        int existing = 0;
+        int i;
+        uint32_t hash_value1, hash_value2;
+        MV best_hash_mv;
+        int best_hash_cost = INT_MAX;
+
+        // for the hashMap
+        hash_table *ref_frame_hash =
+            intra ? &cpi->common.cur_frame->hash_table
+                  : get_ref_frame_hash_map(cpi,
+                                           x->e_mbd.mi[0]->mbmi.ref_frame[0]);
+
+        av1_get_block_hash_value(what, what_stride, block_width, &hash_value1,
+                                 &hash_value2);
+
+        const int count = av1_hash_table_count(ref_frame_hash, hash_value1);
+        // for intra, at least one match can be found: the block itself.
+        if (count <= (intra ? 1 : 0)) {
+          break;
+        }
+
+        Iterator iterator =
+            av1_hash_get_first_iterator(ref_frame_hash, hash_value1);
+        for (i = 0; i < count; i++, iterator_increment(&iterator)) {
+          block_hash ref_block_hash = *(block_hash *)(iterator_get(&iterator));
+          if (hash_value2 == ref_block_hash.hash_value2) {
+            // For intra, make sure the prediction comes from a valid area
+            // and does not predict from the current block.
+            // TODO(roger): check if the constraint is necessary
+            if (intra &&
+                ref_block_hash.y + block_height >
+                    ((y_pos >> MAX_SB_SIZE_LOG2) << MAX_SB_SIZE_LOG2) &&
+                ref_block_hash.x + block_width >
+                    ((x_pos >> MAX_SB_SIZE_LOG2) << MAX_SB_SIZE_LOG2)) {
+              continue;
+            }
+            int refCost =
+                abs(ref_block_hash.x - x_pos) + abs(ref_block_hash.y - y_pos);
+            add_to_sort_table(block_hashes, costs, &existing,
+                              MAX_HASH_MV_TABLE_SIZE, ref_block_hash, refCost);
+          }
+        }
+
+        if (existing == 0) {
+          break;
+        }
+
+        for (i = 0; i < existing; i++) {
+          MV hash_mv;
+          hash_mv.col = block_hashes[i].x - x_pos;
+          hash_mv.row = block_hashes[i].y - y_pos;
+          if (!is_mv_in(&x->mv_limits, &hash_mv)) {
+            continue;
+          }
+          int currHashCost = av1_get_mvpred_var(x, &hash_mv, ref_mv, fn_ptr, 1);
+          if (currHashCost < best_hash_cost) {
+            best_hash_cost = currHashCost;
+            best_hash_mv = hash_mv;
+          }
+        }
+
+        if (best_hash_cost < var) {
+          x->second_best_mv = x->best_mv;
+          x->best_mv.as_mv = best_hash_mv;
+          var = best_hash_cost;
+        }
+      }
+    }
+  } while (0);
+#endif
+
   return var;
 }
 
@@ -3150,25 +3164,24 @@ int av1_return_max_sub_pixel_mv(
     MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
     const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
     int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, const uint8_t *second_pred,
-#if CONFIG_EXT_INTER
-    const uint8_t *mask, int mask_stride, int invert_mask,
-#endif
-    int w, int h, int use_upsampled_ref) {
+    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+    int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) {
   COMMON_MV_TEST;
-#if CONFIG_EXT_INTER
   (void)mask;
   (void)mask_stride;
   (void)invert_mask;
-#endif
   (void)minr;
   (void)minc;
   bestmv->row = maxr;
   bestmv->col = maxc;
   besterr = 0;
-  // In the sub-pel motion search, if hp is not used, then the last bit of mv
-  // has to be 0.
+// In the sub-pel motion search, if hp is not used, then the last bit of mv
+// has to be 0.
+#if CONFIG_AMVR
+  lower_mv_precision(bestmv, allow_hp, 0);
+#else
   lower_mv_precision(bestmv, allow_hp);
+#endif
   return besterr;
 }
 // Return the minimum MV.
@@ -3176,24 +3189,23 @@ int av1_return_min_sub_pixel_mv(
     MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
     const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
     int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
-    unsigned int *sse1, const uint8_t *second_pred,
-#if CONFIG_EXT_INTER
-    const uint8_t *mask, int mask_stride, int invert_mask,
-#endif
-    int w, int h, int use_upsampled_ref) {
+    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+    int mask_stride, int invert_mask, int w, int h, int use_upsampled_ref) {
   COMMON_MV_TEST;
   (void)maxr;
   (void)maxc;
-#if CONFIG_EXT_INTER
   (void)mask;
   (void)mask_stride;
   (void)invert_mask;
-#endif
   bestmv->row = minr;
   bestmv->col = minc;
   besterr = 0;
-  // In the sub-pel motion search, if hp is not used, then the last bit of mv
-  // has to be 0.
+// In the sub-pel motion search, if hp is not used, then the last bit of mv
+// has to be 0.
+#if CONFIG_AMVR
+  lower_mv_precision(bestmv, allow_hp, 0);
+#else
   lower_mv_precision(bestmv, allow_hp);
+#endif
   return besterr;
 }
diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h
index 733e415ce..2c53075cc 100644
--- a/third_party/aom/av1/encoder/mcomp.h
+++ b/third_party/aom/av1/encoder/mcomp.h
@@ -58,13 +58,11 @@ int av1_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv,
 int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
                           const MV *center_mv, const uint8_t *second_pred,
                           const aom_variance_fn_ptr_t *vfp, int use_mvcost);
-#if CONFIG_EXT_INTER
 int av1_get_mvpred_mask_var(const MACROBLOCK *x, const MV *best_mv,
                             const MV *center_mv, const uint8_t *second_pred,
                             const uint8_t *mask, int mask_stride,
                             int invert_mask, const aom_variance_fn_ptr_t *vfp,
                             int use_mvcost);
-#endif
 
 struct AV1_COMP;
 struct SPEED_FEATURES;
@@ -99,10 +97,8 @@ typedef int(fractional_mv_step_fp)(
     int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
     int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
     int *distortion, unsigned int *sse1, const uint8_t *second_pred,
-#if CONFIG_EXT_INTER
-    const uint8_t *mask, int mask_stride, int invert_mask,
-#endif
-    int w, int h, int use_upsampled_ref);
+    const uint8_t *mask, int mask_stride, int invert_mask, int w, int h,
+    int use_upsampled_ref);
 
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree;
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned;
@@ -123,18 +119,23 @@ typedef int (*av1_diamond_search_fn_t)(
 
 int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
                              const aom_variance_fn_ptr_t *fn_ptr,
-#if CONFIG_EXT_INTER
                              const uint8_t *mask, int mask_stride,
-                             int invert_mask,
-#endif
-                             const MV *center_mv, const uint8_t *second_pred);
+                             int invert_mask, const MV *center_mv,
+                             const uint8_t *second_pred);
 
 struct AV1_COMP;
 
+#if CONFIG_HASH_ME
+int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
+                          BLOCK_SIZE bsize, MV *mvp_full, int step_param,
+                          int error_per_bit, int *cost_list, const MV *ref_mv,
+                          int var_max, int rd, int x_pos, int y_pos, int intra);
+#else
 int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full, int step_param,
                           int error_per_bit, int *cost_list, const MV *ref_mv,
                           int var_max, int rd);
+#endif
 
 #if CONFIG_MOTION_VAR
 int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
diff --git a/third_party/aom/av1/encoder/palette.c b/third_party/aom/av1/encoder/palette.c
index bac06cd17..f34b82544 100644
--- a/third_party/aom/av1/encoder/palette.c
+++ b/third_party/aom/av1/encoder/palette.c
@@ -14,116 +14,14 @@
 
 #include "av1/encoder/cost.h"
 #include "av1/encoder/palette.h"
-
-static float calc_dist(const float *p1, const float *p2, int dim) {
-  float dist = 0;
-  int i;
-  for (i = 0; i < dim; ++i) {
-    const float diff = p1[i] - p2[i];
-    dist += diff * diff;
-  }
-  return dist;
-}
-
-void av1_calc_indices(const float *data, const float *centroids,
-                      uint8_t *indices, int n, int k, int dim) {
-  int i, j;
-  for (i = 0; i < n; ++i) {
-    float min_dist = calc_dist(data + i * dim, centroids, dim);
-    indices[i] = 0;
-    for (j = 1; j < k; ++j) {
-      const float this_dist =
-          calc_dist(data + i * dim, centroids + j * dim, dim);
-      if (this_dist < min_dist) {
-        min_dist = this_dist;
-        indices[i] = j;
-      }
-    }
-  }
-}
-
-// Generate a random number in the range [0, 32768).
-static unsigned int lcg_rand16(unsigned int *state) {
-  *state = (unsigned int)(*state * 1103515245ULL + 12345);
-  return *state / 65536 % 32768;
-}
-
-static void calc_centroids(const float *data, float *centroids,
-                           const uint8_t *indices, int n, int k, int dim) {
-  int i, j, index;
-  int count[PALETTE_MAX_SIZE];
-  unsigned int rand_state = (unsigned int)data[0];
-
-  assert(n <= 32768);
-
-  memset(count, 0, sizeof(count[0]) * k);
-  memset(centroids, 0, sizeof(centroids[0]) * k * dim);
-
-  for (i = 0; i < n; ++i) {
-    index = indices[i];
-    assert(index < k);
-    ++count[index];
-    for (j = 0; j < dim; ++j) {
-      centroids[index * dim + j] += data[i * dim + j];
-    }
-  }
-
-  for (i = 0; i < k; ++i) {
-    if (count[i] == 0) {
-      memcpy(centroids + i * dim, data + (lcg_rand16(&rand_state) % n) * dim,
-             sizeof(centroids[0]) * dim);
-    } else {
-      const float norm = 1.0f / count[i];
-      for (j = 0; j < dim; ++j) centroids[i * dim + j] *= norm;
-    }
-  }
-
-  // Round to nearest integers.
-  for (i = 0; i < k * dim; ++i) {
-    centroids[i] = roundf(centroids[i]);
-  }
-}
-
-static float calc_total_dist(const float *data, const float *centroids,
-                             const uint8_t *indices, int n, int k, int dim) {
-  float dist = 0;
-  int i;
-  (void)k;
-
-  for (i = 0; i < n; ++i)
-    dist += calc_dist(data + i * dim, centroids + indices[i] * dim, dim);
-
-  return dist;
-}
-
-void av1_k_means(const float *data, float *centroids, uint8_t *indices, int n,
-                 int k, int dim, int max_itr) {
-  int i;
-  float this_dist;
-  float pre_centroids[2 * PALETTE_MAX_SIZE];
-  uint8_t pre_indices[MAX_SB_SQUARE];
-
-  av1_calc_indices(data, centroids, indices, n, k, dim);
-  this_dist = calc_total_dist(data, centroids, indices, n, k, dim);
-
-  for (i = 0; i < max_itr; ++i) {
-    const float pre_dist = this_dist;
-    memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * dim);
-    memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
-
-    calc_centroids(data, centroids, indices, n, k, dim);
-    av1_calc_indices(data, centroids, indices, n, k, dim);
-    this_dist = calc_total_dist(data, centroids, indices, n, k, dim);
-
-    if (this_dist > pre_dist) {
-      memcpy(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim);
-      memcpy(indices, pre_indices, sizeof(pre_indices[0]) * n);
-      break;
-    }
-    if (!memcmp(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim))
-      break;
-  }
-}
+#include "av1/encoder/random.h"
+
+#define AV1_K_MEANS_DIM 1
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+#define AV1_K_MEANS_DIM 2
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
 
 static int float_comparer(const void *a, const void *b) {
   const float fa = *(const float *)a;
diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h
index 8afe5a782..efd89f66f 100644
--- a/third_party/aom/av1/encoder/palette.h
+++ b/third_party/aom/av1/encoder/palette.h
@@ -18,17 +18,49 @@
 extern "C" {
 #endif
 
+#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim
+
+void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const float *data,
+                                             const float *centroids,
+                                             uint8_t *indices, int n, int k);
+void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const float *data,
+                                             const float *centroids,
+                                             uint8_t *indices, int n, int k);
+void AV1_K_MEANS_RENAME(av1_k_means, 1)(const float *data, float *centroids,
+                                        uint8_t *indices, int n, int k,
+                                        int max_itr);
+void AV1_K_MEANS_RENAME(av1_k_means, 2)(const float *data, float *centroids,
+                                        uint8_t *indices, int n, int k,
+                                        int max_itr);
+
 // Given 'n' 'data' points and 'k' 'centroids' each of dimension 'dim',
 // calculate the centroid 'indices' for the data points.
-void av1_calc_indices(const float *data, const float *centroids,
-                      uint8_t *indices, int n, int k, int dim);
+static INLINE void av1_calc_indices(const float *data, const float *centroids,
+                                    uint8_t *indices, int n, int k, int dim) {
+  if (dim == 1) {  // dispatch on 'dim' to the matching fixed-dimension variant
+    AV1_K_MEANS_RENAME(av1_calc_indices, 1)(data, centroids, indices, n, k);
+  } else if (dim == 2) {
+    AV1_K_MEANS_RENAME(av1_calc_indices, 2)(data, centroids, indices, n, k);
+  } else {  // only _dim1/_dim2 are declared; a no-op when assert is disabled
+    assert(0 && "Untemplated k means dimension");
+  }
+}
 
 // Given 'n' 'data' points and an initial guess of 'k' 'centroids' each of
 // dimension 'dim', runs up to 'max_itr' iterations of k-means algorithm to get
 // updated 'centroids' and the centroid 'indices' for elements in 'data'.
 // Note: the output centroids are rounded off to nearest integers.
-void av1_k_means(const float *data, float *centroids, uint8_t *indices, int n,
-                 int k, int dim, int max_itr);
+static INLINE void av1_k_means(const float *data, float *centroids,
+                               uint8_t *indices, int n, int k, int dim,
+                               int max_itr) {
+  if (dim == 1) {  // dispatch on 'dim' to the matching fixed-dimension variant
+    AV1_K_MEANS_RENAME(av1_k_means, 1)(data, centroids, indices, n, k, max_itr);
+  } else if (dim == 2) {
+    AV1_K_MEANS_RENAME(av1_k_means, 2)(data, centroids, indices, n, k, max_itr);
+  } else {  // only _dim1/_dim2 are declared; a no-op when assert is disabled
+    assert(0 && "Untemplated k means dimension");
+  }
+}
 
 // Given a list of centroids, returns the unique number of centroids 'k', and
 // puts these unique centroids in first 'k' indices of 'centroids' array.
diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c
index e4ec38826..accc97e57 100644
--- a/third_party/aom/av1/encoder/pickcdef.c
+++ b/third_party/aom/av1/encoder/pickcdef.c
@@ -19,11 +19,11 @@
 #include "av1/common/reconinter.h"
 #include "av1/encoder/encoder.h"
 
-#define REDUCED_STRENGTHS 8
-#define REDUCED_TOTAL_STRENGTHS (REDUCED_STRENGTHS * CLPF_STRENGTHS)
-#define TOTAL_STRENGTHS (DERING_STRENGTHS * CLPF_STRENGTHS)
+#define REDUCED_PRI_STRENGTHS 8
+#define REDUCED_TOTAL_STRENGTHS (REDUCED_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
+#define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
 
-static int priconv[REDUCED_STRENGTHS] = { 0, 1, 2, 3, 4, 7, 12, 25 };
+static int priconv[REDUCED_PRI_STRENGTHS] = { 0, 1, 2, 3, 4, 7, 12, 25 };
 
 /* Search for the best strength to add as an option, knowing we
    already selected nb_strengths options. */
@@ -68,11 +68,16 @@ static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths,
                                 uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count,
                                 int fast) {
   uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS];
+#if !CONFIG_CDEF_SINGLEPASS
   const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
+#endif
   int i, j;
   uint64_t best_tot_mse = (uint64_t)1 << 63;
   int best_id0 = 0;
   int best_id1 = 0;
+#if CONFIG_CDEF_SINGLEPASS
+  const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
+#endif
   memset(tot_mse, 0, sizeof(tot_mse));
   for (i = 0; i < sb_count; i++) {
     int gi;
@@ -232,13 +237,13 @@ static INLINE uint64_t mse_4x4_16bit(uint16_t *dst, int dstride, uint16_t *src,
 }
 
 /* Compute MSE only on the blocks we filtered. */
-uint64_t compute_dering_dist(uint16_t *dst, int dstride, uint16_t *src,
-                             dering_list *dlist, int dering_count,
-                             BLOCK_SIZE bsize, int coeff_shift, int pli) {
+uint64_t compute_cdef_dist(uint16_t *dst, int dstride, uint16_t *src,
+                           cdef_list *dlist, int cdef_count, BLOCK_SIZE bsize,
+                           int coeff_shift, int pli) {
   uint64_t sum = 0;
   int bi, bx, by;
   if (bsize == BLOCK_8X8) {
-    for (bi = 0; bi < dering_count; bi++) {
+    for (bi = 0; bi < cdef_count; bi++) {
       by = dlist[bi].by;
       bx = dlist[bi].bx;
       if (pli == 0) {
@@ -250,7 +255,7 @@ uint64_t compute_dering_dist(uint16_t *dst, int dstride, uint16_t *src,
       }
     }
   } else if (bsize == BLOCK_4X8) {
-    for (bi = 0; bi < dering_count; bi++) {
+    for (bi = 0; bi < cdef_count; bi++) {
       by = dlist[bi].by;
       bx = dlist[bi].bx;
       sum += mse_4x4_16bit(&dst[(by << 3) * dstride + (bx << 2)], dstride,
@@ -259,7 +264,7 @@ uint64_t compute_dering_dist(uint16_t *dst, int dstride, uint16_t *src,
                            &src[(bi << (3 + 2)) + 4 * 4], 4);
     }
   } else if (bsize == BLOCK_8X4) {
-    for (bi = 0; bi < dering_count; bi++) {
+    for (bi = 0; bi < cdef_count; bi++) {
       by = dlist[bi].by;
       bx = dlist[bi].bx;
       sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 3)], dstride,
@@ -269,7 +274,7 @@ uint64_t compute_dering_dist(uint16_t *dst, int dstride, uint16_t *src,
     }
   } else {
     assert(bsize == BLOCK_4X4);
-    for (bi = 0; bi < dering_count; bi++) {
+    for (bi = 0; bi < cdef_count; bi++) {
       by = dlist[bi].by;
       bx = dlist[bi].bx;
       sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
@@ -282,12 +287,12 @@ uint64_t compute_dering_dist(uint16_t *dst, int dstride, uint16_t *src,
 void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
                      AV1_COMMON *cm, MACROBLOCKD *xd, int fast) {
   int r, c;
-  int sbr, sbc;
+  int fbr, fbc;
   uint16_t *src[3];
   uint16_t *ref_coeff[3];
-  dering_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64];
-  int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
-  int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
+  cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64];
+  int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+  int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
   int stride[3];
   int bsize[3];
   int mi_wide_l2[3];
@@ -295,18 +300,22 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
   int xdec[3];
   int ydec[3];
   int pli;
-  int dering_count;
+  int cdef_count;
   int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
   uint64_t best_tot_mse = (uint64_t)1 << 63;
   uint64_t tot_mse;
   int sb_count;
-  int nvsb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  int nhsb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  int *sb_index = aom_malloc(nvsb * nhsb * sizeof(*sb_index));
-  int *selected_strength = aom_malloc(nvsb * nhsb * sizeof(*sb_index));
+  int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  int *sb_index = aom_malloc(nvfb * nhfb * sizeof(*sb_index));
+  int *selected_strength = aom_malloc(nvfb * nhfb * sizeof(*sb_index));
   uint64_t(*mse[2])[TOTAL_STRENGTHS];
-  int clpf_damping = 3 + (cm->base_qindex >> 6);
-  int dering_damping = 6;
+#if CONFIG_CDEF_SINGLEPASS
+  int pri_damping = 3 + (cm->base_qindex >> 6);
+#else
+  int pri_damping = 6;
+#endif
+  int sec_damping = 3 + (cm->base_qindex >> 6);
   int i;
   int nb_strengths;
   int nb_strength_bits;
@@ -314,19 +323,18 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
   double lambda;
   int nplanes = 3;
   const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
-  DECLARE_ALIGNED(32, uint16_t, inbuf[OD_DERING_INBUF_SIZE]);
+  DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
   uint16_t *in;
-  DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SQUARE]);
-  int chroma_dering =
-      xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
-      xd->plane[2].subsampling_x == xd->plane[2].subsampling_y;
+  DECLARE_ALIGNED(32, uint16_t, tmp_dst[CDEF_BLOCKSIZE * CDEF_BLOCKSIZE]);
+  int chroma_cdef = xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
+                    xd->plane[2].subsampling_x == xd->plane[2].subsampling_y;
   quantizer =
       av1_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8);
   lambda = .12 * quantizer * quantizer / 256.;
 
   av1_setup_dst_planes(xd->plane, cm->sb_size, frame, 0, 0);
-  mse[0] = aom_malloc(sizeof(**mse) * nvsb * nhsb);
-  mse[1] = aom_malloc(sizeof(**mse) * nvsb * nhsb);
+  mse[0] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
+  mse[1] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
   for (pli = 0; pli < nplanes; pli++) {
     uint8_t *ref_buffer;
     int ref_stride;
@@ -380,65 +388,76 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
       }
     }
   }
-  in = inbuf + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER;
+  in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER;
   sb_count = 0;
-  for (sbr = 0; sbr < nvsb; ++sbr) {
-    for (sbc = 0; sbc < nhsb; ++sbc) {
+  for (fbr = 0; fbr < nvfb; ++fbr) {
+    for (fbc = 0; fbc < nhfb; ++fbc) {
       int nvb, nhb;
       int gi;
       int dirinit = 0;
-      nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * sbc);
-      nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * sbr);
-      cm->mi_grid_visible[MI_SIZE_64X64 * sbr * cm->mi_stride +
-                          MI_SIZE_64X64 * sbc]
+      nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc);
+      nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr);
+      cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
+                          MI_SIZE_64X64 * fbc]
           ->mbmi.cdef_strength = -1;
-      if (sb_all_skip(cm, sbr * MI_SIZE_64X64, sbc * MI_SIZE_64X64)) continue;
-      dering_count = sb_compute_dering_list(cm, sbr * MI_SIZE_64X64,
-                                            sbc * MI_SIZE_64X64, dlist, 1);
+      if (sb_all_skip(cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) continue;
+      cdef_count = sb_compute_cdef_list(cm, fbr * MI_SIZE_64X64,
+                                        fbc * MI_SIZE_64X64, dlist, 1);
       for (pli = 0; pli < nplanes; pli++) {
-        for (i = 0; i < OD_DERING_INBUF_SIZE; i++)
-          inbuf[i] = OD_DERING_VERY_LARGE;
+        for (i = 0; i < CDEF_INBUF_SIZE; i++) inbuf[i] = CDEF_VERY_LARGE;
         for (gi = 0; gi < total_strengths; gi++) {
           int threshold;
           uint64_t curr_mse;
-          int clpf_strength;
-          threshold = gi / CLPF_STRENGTHS;
+          int sec_strength;
+          threshold = gi / CDEF_SEC_STRENGTHS;
           if (fast) threshold = priconv[threshold];
-          if (pli > 0 && !chroma_dering) threshold = 0;
+          if (pli > 0 && !chroma_cdef) threshold = 0;
           /* We avoid filtering the pixels for which some of the pixels to
              average
              are outside the frame. We could change the filter instead, but it
              would add special cases for any future vectorization. */
-          int yoff = OD_FILT_VBORDER * (sbr != 0);
-          int xoff = OD_FILT_HBORDER * (sbc != 0);
+          int yoff = CDEF_VBORDER * (fbr != 0);
+          int xoff = CDEF_HBORDER * (fbc != 0);
           int ysize = (nvb << mi_high_l2[pli]) +
-                      OD_FILT_VBORDER * (sbr != nvsb - 1) + yoff;
+                      CDEF_VBORDER * (fbr != nvfb - 1) + yoff;
           int xsize = (nhb << mi_wide_l2[pli]) +
-                      OD_FILT_HBORDER * (sbc != nhsb - 1) + xoff;
-          clpf_strength = gi % CLPF_STRENGTHS;
-          if (clpf_strength == 0)
-            copy_sb16_16(&in[(-yoff * OD_FILT_BSTRIDE - xoff)], OD_FILT_BSTRIDE,
+                      CDEF_HBORDER * (fbc != nhfb - 1) + xoff;
+          sec_strength = gi % CDEF_SEC_STRENGTHS;
+#if CONFIG_CDEF_SINGLEPASS
+          copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
+                       src[pli],
+                       (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
+                       (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff,
+                       stride[pli], ysize, xsize);
+          cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in, xdec[pli], ydec[pli],
+                         dir, &dirinit, var, pli, dlist, cdef_count, threshold,
+                         sec_strength + (sec_strength == 3), pri_damping,
+                         sec_damping, coeff_shift);
+#else
+          if (sec_strength == 0)
+            copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
                          src[pli],
-                         (sbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
-                         (sbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff,
+                         (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
+                         (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff,
                          stride[pli], ysize, xsize);
-          od_dering(clpf_strength ? NULL : (uint8_t *)in, OD_FILT_BSTRIDE,
-                    tmp_dst, in, xdec[pli], ydec[pli], dir, &dirinit, var, pli,
-                    dlist, dering_count, threshold,
-                    clpf_strength + (clpf_strength == 3), clpf_damping,
-                    dering_damping, coeff_shift, clpf_strength != 0, 1);
-          curr_mse = compute_dering_dist(
+          cdef_filter_fb(sec_strength ? NULL : (uint8_t *)in, CDEF_BSTRIDE,
+                         tmp_dst, in, xdec[pli], ydec[pli], dir, &dirinit, var,
+                         pli, dlist, cdef_count, threshold,
+                         sec_strength + (sec_strength == 3), sec_damping,
+                         pri_damping, coeff_shift, sec_strength != 0, 1);
+#endif
+          curr_mse = compute_cdef_dist(
               ref_coeff[pli] +
-                  (sbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] +
-                  (sbc * MI_SIZE_64X64 << mi_wide_l2[pli]),
-              stride[pli], tmp_dst, dlist, dering_count, bsize[pli],
-              coeff_shift, pli);
+                  (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] +
+                  (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]),
+              stride[pli], tmp_dst, dlist, cdef_count, bsize[pli], coeff_shift,
+              pli);
           if (pli < 2)
             mse[pli][sb_count][gi] = curr_mse;
           else
             mse[1][sb_count][gi] += curr_mse;
           sb_index[sb_count] =
-              MI_SIZE_64X64 * sbr * cm->mi_stride + MI_SIZE_64X64 * sbc;
+              MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc;
         }
       }
       sb_count++;
@@ -494,15 +513,17 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
   if (fast) {
     for (int j = 0; j < nb_strengths; j++) {
       cm->cdef_strengths[j] =
-          priconv[cm->cdef_strengths[j] / CLPF_STRENGTHS] * CLPF_STRENGTHS +
-          (cm->cdef_strengths[j] % CLPF_STRENGTHS);
+          priconv[cm->cdef_strengths[j] / CDEF_SEC_STRENGTHS] *
+              CDEF_SEC_STRENGTHS +
+          (cm->cdef_strengths[j] % CDEF_SEC_STRENGTHS);
       cm->cdef_uv_strengths[j] =
-          priconv[cm->cdef_uv_strengths[j] / CLPF_STRENGTHS] * CLPF_STRENGTHS +
-          (cm->cdef_uv_strengths[j] % CLPF_STRENGTHS);
+          priconv[cm->cdef_uv_strengths[j] / CDEF_SEC_STRENGTHS] *
+              CDEF_SEC_STRENGTHS +
+          (cm->cdef_uv_strengths[j] % CDEF_SEC_STRENGTHS);
     }
   }
-  cm->cdef_dering_damping = dering_damping;
-  cm->cdef_clpf_damping = clpf_damping;
+  cm->cdef_pri_damping = pri_damping;
+  cm->cdef_sec_damping = sec_damping;
   aom_free(mse[0]);
   aom_free(mse[1]);
   for (pli = 0; pli < nplanes; pli++) {
diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c
index 26fd55ef0..d8b6f9074 100644
--- a/third_party/aom/av1/encoder/picklpf.c
+++ b/third_party/aom/av1/encoder/picklpf.c
@@ -14,8 +14,8 @@
 
 #include "./aom_scale_rtcd.h"
 
-#include "aom_dsp/psnr.h"
 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/psnr.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 
@@ -27,6 +27,85 @@
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/picklpf.h"
 
+#if CONFIG_LPF_SB
+#if CONFIG_HIGHBITDEPTH
+static int compute_sb_y_sse_highbd(const YV12_BUFFER_CONFIG *src,
+                                   const YV12_BUFFER_CONFIG *frame,
+                                   AV1_COMMON *const cm, int mi_row,
+                                   int mi_col) {
+  int sse = 0;
+  const int mi_row_start = AOMMAX(0, mi_row - FILT_BOUNDARY_MI_OFFSET);
+  const int mi_col_start = AOMMAX(0, mi_col - FILT_BOUNDARY_MI_OFFSET);
+  const int mi_row_range = mi_row - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE;
+  const int mi_col_range = mi_col - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE;
+  const int mi_row_end = AOMMIN(mi_row_range, cm->mi_rows);
+  const int mi_col_end = AOMMIN(mi_col_range, cm->mi_cols);
+
+  const int row = mi_row_start * MI_SIZE;
+  const int col = mi_col_start * MI_SIZE;
+  const uint16_t *src_y =
+      CONVERT_TO_SHORTPTR(src->y_buffer) + row * src->y_stride + col;
+  const uint16_t *frame_y =
+      CONVERT_TO_SHORTPTR(frame->y_buffer) + row * frame->y_stride + col;
+  const int row_end = (mi_row_end - mi_row_start) * MI_SIZE;
+  const int col_end = (mi_col_end - mi_col_start) * MI_SIZE;
+
+  int x, y;
+  for (y = 0; y < row_end; ++y) {
+    for (x = 0; x < col_end; ++x) {
+      const int diff = src_y[x] - frame_y[x];
+      sse += diff * diff;
+    }
+    src_y += src->y_stride;
+    frame_y += frame->y_stride;
+  }
+  return sse;
+}
+#endif
+
+static int compute_sb_y_sse(const YV12_BUFFER_CONFIG *src,
+                            const YV12_BUFFER_CONFIG *frame,
+                            AV1_COMMON *const cm, int mi_row, int mi_col) {
+  int sse = 0;
+  const int mi_row_start = AOMMAX(0, mi_row - FILT_BOUNDARY_MI_OFFSET);
+  const int mi_col_start = AOMMAX(0, mi_col - FILT_BOUNDARY_MI_OFFSET);
+  const int mi_row_range = mi_row - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE;
+  const int mi_col_range = mi_col - FILT_BOUNDARY_MI_OFFSET + MAX_MIB_SIZE;
+  const int mi_row_end = AOMMIN(mi_row_range, cm->mi_rows);
+  const int mi_col_end = AOMMIN(mi_col_range, cm->mi_cols);
+
+  const int row = mi_row_start * MI_SIZE;
+  const int col = mi_col_start * MI_SIZE;
+  const uint8_t *src_y = src->y_buffer + row * src->y_stride + col;
+  const uint8_t *frame_y = frame->y_buffer + row * frame->y_stride + col;
+  const int row_end = (mi_row_end - mi_row_start) * MI_SIZE;
+  const int col_end = (mi_col_end - mi_col_start) * MI_SIZE;
+
+  int x, y;
+  for (y = 0; y < row_end; ++y) {
+    for (x = 0; x < col_end; ++x) {
+      const int diff = src_y[x] - frame_y[x];
+      sse += diff * diff;
+    }
+    src_y += src->y_stride;
+    frame_y += frame->y_stride;
+  }
+  return sse;
+}
+#endif  // CONFIG_LPF_SB
+
+#if !CONFIG_LPF_SB
+static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc,
+                            YV12_BUFFER_CONFIG *dst_bc, int plane) {
+  switch (plane) {
+    case 0: aom_yv12_copy_y(src_bc, dst_bc); break;
+    case 1: aom_yv12_copy_u(src_bc, dst_bc); break;
+    case 2: aom_yv12_copy_v(src_bc, dst_bc); break;
+    default: assert(plane >= 0 && plane <= 2); break;
+  }
+}
+#endif  // CONFIG_LPF_SB
+
 int av1_get_max_filter_level(const AV1_COMP *cpi) {
   if (cpi->oxcf.pass == 2) {
     return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
@@ -36,25 +115,156 @@ int av1_get_max_filter_level(const AV1_COMP *cpi) {
   }
 }
 
+#if CONFIG_LPF_SB
+// TODO(chengchen): reduce memory usage by copying a superblock, not the frame
+static int try_filter_superblock(const YV12_BUFFER_CONFIG *sd,
+                                 AV1_COMP *const cpi, int filt_level,
+                                 int partial_frame, int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  int filt_err;
+
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4
+  av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1,
+                        partial_frame, mi_row, mi_col);
+#else
+  if (cpi->num_workers > 1)
+    av1_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
+                             filt_level, 1, partial_frame, cpi->workers,
+                             cpi->num_workers, &cpi->lf_row_sync);
+  else
+    av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
+                          1, partial_frame);
+#endif
+
+#if CONFIG_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    filt_err =
+        compute_sb_y_sse_highbd(sd, cm->frame_to_show, cm, mi_row, mi_col);
+  } else {
+    filt_err = compute_sb_y_sse(sd, cm->frame_to_show, cm, mi_row, mi_col);
+  }
+#else
+  filt_err = compute_sb_y_sse(sd, cm->frame_to_show, cm, mi_row, mi_col);
+#endif  // CONFIG_HIGHBITDEPTH
+
+  // TODO(chengchen): Copy the superblock only
+  // Re-instate the unfiltered frame
+  aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+
+  return filt_err;
+}
+
+static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+                               int partial_frame, double *best_cost_ret,
+                               int mi_row, int mi_col, int last_lvl) {
+  assert(partial_frame == 1);
+  assert(last_lvl >= 0);
+
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *x = &cpi->td.mb;
+
+  int min_filter_level = AOMMAX(0, last_lvl - MAX_LPF_OFFSET);
+  int max_filter_level =
+      AOMMIN(av1_get_max_filter_level(cpi), last_lvl + MAX_LPF_OFFSET);
+
+  // search a larger range for the start superblock
+  if (mi_row == 0 && mi_col == 0) {
+    min_filter_level = 0;
+    max_filter_level = av1_get_max_filter_level(cpi);
+  }
+
+  // TODO(chengchen): Copy for superblock only
+  // Make a copy of the unfiltered / processed recon buffer
+  aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+
+  int estimate_err =
+      try_filter_superblock(sd, cpi, last_lvl, partial_frame, mi_row, mi_col);
+
+  int best_err = estimate_err;
+  int filt_best = last_lvl;
+
+  int i;
+  for (i = min_filter_level; i <= max_filter_level; i += LPF_STEP) {
+    if (i == last_lvl) continue;
+
+    int filt_err =
+        try_filter_superblock(sd, cpi, i, partial_frame, mi_row, mi_col);
+
+    if (filt_err < best_err) {
+      best_err = filt_err;
+      filt_best = i;
+    }
+  }
+
+  // If the previous superblock's filter level performs about as well as the
+  // current best level, reuse the previous level so that only one bit is
+  // needed to signal that the current filter level is unchanged.
+  int threshold = 400;
+
+  // ratio = the filtering area / a superblock size
+  int ratio = 1;
+  if (mi_row + MAX_MIB_SIZE > cm->mi_rows) {
+    ratio *= (cm->mi_rows - mi_row);
+  } else {
+    if (mi_row == 0) {
+      ratio *= (MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET);
+    } else {
+      ratio *= MAX_MIB_SIZE;
+    }
+  }
+  if (mi_col + MAX_MIB_SIZE > cm->mi_cols) {
+    ratio *= (cm->mi_cols - mi_col);
+  } else {
+    if (mi_col == 0) {
+      ratio *= (MAX_MIB_SIZE - FILT_BOUNDARY_MI_OFFSET);
+    } else {
+      ratio *= MAX_MIB_SIZE;
+    }
+  }
+  threshold = threshold * ratio / (MAX_MIB_SIZE * MAX_MIB_SIZE);
+
+  const int diff = abs(estimate_err - best_err);
+
+  const int percent_thresh = (int)((double)estimate_err * 0.01);
+  threshold = AOMMAX(threshold, percent_thresh);
+  if (diff < threshold) {
+    best_err = estimate_err;
+    filt_best = last_lvl;
+  }
+
+  // Compute rdcost to determine whether to reuse previous filter lvl
+  if (filt_best != last_lvl) {
+  }
+
+  if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err);
+  return filt_best;
+}
+
+#else  // CONFIG_LPF_SB
 static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
                                 AV1_COMP *const cpi, int filt_level,
                                 int partial_frame
-#if CONFIG_UV_LVL
+#if CONFIG_LOOPFILTER_LEVEL
                                 ,
-                                int plane
+                                int plane, int dir
 #endif
                                 ) {
   AV1_COMMON *const cm = &cpi->common;
   int64_t filt_err;
 
 #if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4
-#if CONFIG_UV_LVL
-  av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
-                        plane, partial_frame);
+#if CONFIG_LOOPFILTER_LEVEL
+  assert(plane >= 0 && plane <= 2);
+  int filter_level[2] = { filt_level, filt_level };
+  if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1];
+  if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0];
+
+  av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd,
+                        filter_level[0], filter_level[1], plane, partial_frame);
 #else
   av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1,
                         partial_frame);
-#endif  // CONFIG_UV_LVL
+#endif  // CONFIG_LOOPFILTER_LEVEL
 #else
   if (cpi->num_workers > 1)
     av1_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
@@ -65,64 +275,33 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
                           1, partial_frame);
 #endif
 
-#if CONFIG_UV_LVL
+  int highbd = 0;
 #if CONFIG_HIGHBITDEPTH
-  if (cm->use_highbitdepth) {
-    if (plane == 0)
-      filt_err = aom_highbd_get_y_sse(sd, cm->frame_to_show);
-    else if (plane == 1)
-      filt_err = aom_highbd_get_u_sse(sd, cm->frame_to_show);
-    else
-      filt_err = aom_highbd_get_v_sse(sd, cm->frame_to_show);
-  } else {
-    if (plane == 0)
-      filt_err = aom_get_y_sse(sd, cm->frame_to_show);
-    else if (plane == 1)
-      filt_err = aom_get_u_sse(sd, cm->frame_to_show);
-    else
-      filt_err = aom_get_v_sse(sd, cm->frame_to_show);
-  }
-#else
-  if (plane == 0)
-    filt_err = aom_get_y_sse(sd, cm->frame_to_show);
-  else if (plane == 1)
-    filt_err = aom_get_u_sse(sd, cm->frame_to_show);
-  else
-    filt_err = aom_get_v_sse(sd, cm->frame_to_show);
+  highbd = cm->use_highbitdepth;
 #endif  // CONFIG_HIGHBITDEPTH
 
+#if CONFIG_LOOPFILTER_LEVEL
+  filt_err = aom_get_sse_plane(sd, cm->frame_to_show, plane, highbd);
+
   // Re-instate the unfiltered frame
-  if (plane == 0)
-    aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
-  else if (plane == 1)
-    aom_yv12_copy_u(&cpi->last_frame_uf, cm->frame_to_show);
-  else
-    aom_yv12_copy_v(&cpi->last_frame_uf, cm->frame_to_show);
-#else
-#if CONFIG_HIGHBITDEPTH
-  if (cm->use_highbitdepth) {
-    filt_err = aom_highbd_get_y_sse(sd, cm->frame_to_show);
-  } else {
-    filt_err = aom_get_y_sse(sd, cm->frame_to_show);
-  }
+  yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, plane);
 #else
-  filt_err = aom_get_y_sse(sd, cm->frame_to_show);
-#endif  // CONFIG_HIGHBITDEPTH
+  filt_err = aom_get_sse_plane(sd, cm->frame_to_show, 0, highbd);
 
   // Re-instate the unfiltered frame
-  aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
-#endif  // CONFIG_UV_LVL
+  yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, 0);
+#endif  // CONFIG_LOOPFILTER_LEVEL
 
   return filt_err;
 }
 
-int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
-                            int partial_frame, double *best_cost_ret
-#if CONFIG_UV_LVL
-                            ,
-                            int plane
+static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+                               int partial_frame, double *best_cost_ret
+#if CONFIG_LOOPFILTER_LEVEL
+                               ,
+                               int plane, int dir
 #endif
-                            ) {
+                               ) {
   const AV1_COMMON *const cm = &cpi->common;
   const struct loopfilter *const lf = &cm->lf;
   const int min_filter_level = 0;
@@ -134,18 +313,18 @@ int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
 
 // Start the search at the previous frame filter level unless it is now out of
 // range.
-#if CONFIG_UV_LVL
+#if CONFIG_LOOPFILTER_LEVEL
   int lvl;
   switch (plane) {
-    case 0: lvl = lf->filter_level; break;
+    case 0: lvl = (dir == 1) ? lf->filter_level[1] : lf->filter_level[0]; break;
     case 1: lvl = lf->filter_level_u; break;
     case 2: lvl = lf->filter_level_v; break;
-    default: lvl = lf->filter_level; break;
+    default: assert(plane >= 0 && plane <= 2); return 0;
   }
   int filt_mid = clamp(lvl, min_filter_level, max_filter_level);
 #else
   int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
-#endif  // CONFIG_UV_LVL
+#endif  // CONFIG_LOOPFILTER_LEVEL
   int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
   // Sum squared error at each filter level
   int64_t ss_err[MAX_LOOP_FILTER + 1];
@@ -153,23 +332,18 @@ int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
   // Set each entry to -1
   memset(ss_err, 0xFF, sizeof(ss_err));
 
-#if CONFIG_UV_LVL
-  if (plane == 0)
-    aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
-  else if (plane == 1)
-    aom_yv12_copy_u(cm->frame_to_show, &cpi->last_frame_uf);
-  else if (plane == 2)
-    aom_yv12_copy_v(cm->frame_to_show, &cpi->last_frame_uf);
+#if CONFIG_LOOPFILTER_LEVEL
+  yv12_copy_plane(cm->frame_to_show, &cpi->last_frame_uf, plane);
 #else
   //  Make a copy of the unfiltered / processed recon buffer
   aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
-#endif  // CONFIG_UV_LVL
+#endif  // CONFIG_LOOPFILTER_LEVEL
 
-#if CONFIG_UV_LVL
-  best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane);
+#if CONFIG_LOOPFILTER_LEVEL
+  best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir);
 #else
   best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame);
-#endif  // CONFIG_UV_LVL
+#endif  // CONFIG_LOOPFILTER_LEVEL
   filt_best = filt_mid;
   ss_err[filt_mid] = best_err;
 
@@ -189,12 +363,12 @@ int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
     if (filt_direction <= 0 && filt_low != filt_mid) {
       // Get Low filter error score
       if (ss_err[filt_low] < 0) {
-#if CONFIG_UV_LVL
+#if CONFIG_LOOPFILTER_LEVEL
         ss_err[filt_low] =
-            try_filter_frame(sd, cpi, filt_low, partial_frame, plane);
+            try_filter_frame(sd, cpi, filt_low, partial_frame, plane, dir);
 #else
         ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame);
-#endif  // CONFIG_UV_LVL
+#endif  // CONFIG_LOOPFILTER_LEVEL
       }
       // If value is close to the best so far then bias towards a lower loop
       // filter value.
@@ -210,12 +384,12 @@ int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
     // Now look at filt_high
     if (filt_direction >= 0 && filt_high != filt_mid) {
       if (ss_err[filt_high] < 0) {
-#if CONFIG_UV_LVL
+#if CONFIG_LOOPFILTER_LEVEL
         ss_err[filt_high] =
-            try_filter_frame(sd, cpi, filt_high, partial_frame, plane);
+            try_filter_frame(sd, cpi, filt_high, partial_frame, plane, dir);
 #else
         ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame);
-#endif  // CONFIG_UV_LVL
+#endif  // CONFIG_LOOPFILTER_LEVEL
       }
       // If value is significantly better than previous best, bias added against
       // raising filter value
@@ -241,6 +415,7 @@ int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
   if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err);
   return filt_best;
 }
+#endif  // CONFIG_LPF_SB
 
 void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
                            LPF_PICK_METHOD method) {
@@ -249,8 +424,13 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
 
   lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0 : cpi->oxcf.sharpness;
 
-  if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) {
+  if (method == LPF_PICK_MINIMAL_LPF) {
+#if CONFIG_LOOPFILTER_LEVEL
+    lf->filter_level[0] = 0;
+    lf->filter_level[1] = 0;
+#else
     lf->filter_level = 0;
+#endif
   } else if (method >= LPF_PICK_FROM_Q) {
     const int min_filter_level = 0;
     const int max_filter_level = av1_get_max_filter_level(cpi);
@@ -279,18 +459,54 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
     int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
 #endif  // CONFIG_HIGHBITDEPTH
     if (cm->frame_type == KEY_FRAME) filt_guess -= 4;
+#if CONFIG_LOOPFILTER_LEVEL
+    lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level);
+    lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level);
+#else
     lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
+#endif
   } else {
-#if CONFIG_UV_LVL
-    lf->filter_level = av1_search_filter_level(
-        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 0);
-    lf->filter_level_u = av1_search_filter_level(
-        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 1);
-    lf->filter_level_v = av1_search_filter_level(
-        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 2);
+#if CONFIG_LPF_SB
+    int mi_row, mi_col;
+    // TODO(chengchen): init last_lvl using previous frame's info?
+    int last_lvl = 0;
+    // TODO(chengchen): if the frame size makes the last superblock very small,
+    // consider merging it into the previous superblock to save bits.
+    // For example, with a 1080x720 frame, the last row of superblocks
+    // contains (FILT_BOUNDARY_OFFSET + 16) pixels.
+    for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MAX_MIB_SIZE) {
+      for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+        int lvl =
+            search_filter_level(sd, cpi, 1, NULL, mi_row, mi_col, last_lvl);
+
+        av1_loop_filter_sb_level_init(cm, mi_row, mi_col, lvl);
+
+        // For the superblock at row start, its previous filter level should be
+        // the one above it, not the one at the end of last row
+        if (mi_col + MAX_MIB_SIZE >= cm->mi_cols) {
+          last_lvl = cm->mi_grid_visible[mi_row * cm->mi_stride]->mbmi.filt_lvl;
+        } else {
+          last_lvl = lvl;
+        }
+      }
+    }
+#else  // CONFIG_LPF_SB
+#if CONFIG_LOOPFILTER_LEVEL
+    lf->filter_level[0] = lf->filter_level[1] = search_filter_level(
+        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 0, 2);
+    lf->filter_level[0] = search_filter_level(
+        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 0, 0);
+    lf->filter_level[1] = search_filter_level(
+        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 0, 1);
+
+    lf->filter_level_u = search_filter_level(
+        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 1, 0);
+    lf->filter_level_v = search_filter_level(
+        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 2, 0);
 #else
-    lf->filter_level = av1_search_filter_level(
-        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL);
-#endif  // CONFIG_UV_LVL
+    lf->filter_level =
+        search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL);
+#endif  // CONFIG_LOOPFILTER_LEVEL
+#endif  // CONFIG_LPF_SB
   }
 }
diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h
index bd248d114..2a168358e 100644
--- a/third_party/aom/av1/encoder/picklpf.h
+++ b/third_party/aom/av1/encoder/picklpf.h
@@ -21,13 +21,6 @@ extern "C" {
 struct yv12_buffer_config;
 struct AV1_COMP;
 int av1_get_max_filter_level(const AV1_COMP *cpi);
-#if CONFIG_UV_LVL
-int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
-                            int partial_frame, double *err, int plane);
-#else
-int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
-                            int partial_frame, double *err);
-#endif
 void av1_pick_filter_level(const struct yv12_buffer_config *sd,
                            struct AV1_COMP *cpi, LPF_PICK_METHOD method);
 #ifdef __cplusplus
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
index fec68377a..a2262b6fc 100644
--- a/third_party/aom/av1/encoder/pickrst.c
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -29,13 +29,13 @@
 
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/mathutils.h"
 #include "av1/encoder/picklpf.h"
 #include "av1/encoder/pickrst.h"
-#include "av1/encoder/mathutils.h"
 
 // When set to RESTORE_WIENER or RESTORE_SGRPROJ only those are allowed.
-// When set to RESTORE_NONE (0) we allow switchable.
-const RestorationType force_restore_type = RESTORE_NONE;
+// When set to RESTORE_TYPES we allow switchable.
+static const RestorationType force_restore_type = RESTORE_TYPES;
 
 // Number of Wiener iterations
 #define NUM_WIENER_ITERS 5
@@ -44,7 +44,7 @@ typedef double (*search_restore_type)(const YV12_BUFFER_CONFIG *src,
                                       AV1_COMP *cpi, int partial_frame,
                                       int plane, RestorationInfo *info,
                                       RestorationType *rest_level,
-                                      double *best_tile_cost,
+                                      int64_t *best_tile_cost,
                                       YV12_BUFFER_CONFIG *dst_frame);
 
 const int frame_level_restore_bits[RESTORE_TYPES] = { 2, 2, 2, 2 };
@@ -124,13 +124,11 @@ static int64_t sse_restoration_frame(AV1_COMMON *const cm,
 static int64_t try_restoration_tile(const YV12_BUFFER_CONFIG *src,
                                     AV1_COMP *const cpi, RestorationInfo *rsi,
                                     int components_pattern, int partial_frame,
-                                    int tile_idx, int subtile_idx,
-                                    int subtile_bits,
+                                    int tile_idx,
                                     YV12_BUFFER_CONFIG *dst_frame) {
   AV1_COMMON *const cm = &cpi->common;
   int64_t filt_err;
   int tile_width, tile_height, nhtiles, nvtiles;
-  int h_start, h_end, v_start, v_end;
   int ntiles, width, height;
 
   // Y and UV components cannot be mixed
@@ -151,11 +149,16 @@ static int64_t try_restoration_tile(const YV12_BUFFER_CONFIG *src,
 
   av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, components_pattern,
                              partial_frame, dst_frame);
-  av1_get_rest_tile_limits(tile_idx, subtile_idx, subtile_bits, nhtiles,
-                           nvtiles, tile_width, tile_height, width, height, 0,
-                           0, &h_start, &h_end, &v_start, &v_end);
-  filt_err = sse_restoration_tile(src, dst_frame, cm, h_start, h_end - h_start,
-                                  v_start, v_end - v_start, components_pattern);
+  RestorationTileLimits limits = av1_get_rest_tile_limits(
+      tile_idx, nhtiles, nvtiles, tile_width, tile_height, width,
+#if CONFIG_STRIPED_LOOP_RESTORATION
+      height, components_pattern > 1 ? cm->subsampling_y : 0);
+#else
+      height);
+#endif
+  filt_err = sse_restoration_tile(
+      src, dst_frame, cm, limits.h_start, limits.h_end - limits.h_start,
+      limits.v_start, limits.v_end - limits.v_start, components_pattern);
 
   return filt_err;
 }
@@ -172,16 +175,16 @@ static int64_t try_restoration_frame(const YV12_BUFFER_CONFIG *src,
   return filt_err;
 }
 
-static int64_t get_pixel_proj_error(uint8_t *src8, int width, int height,
-                                    int src_stride, uint8_t *dat8,
-                                    int dat_stride, int bit_depth,
+static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
+                                    int src_stride, const uint8_t *dat8,
+                                    int dat_stride, int use_highbitdepth,
                                     int32_t *flt1, int flt1_stride,
                                     int32_t *flt2, int flt2_stride, int *xqd) {
   int i, j;
   int64_t err = 0;
   int xq[2];
   decode_xq(xqd, xq);
-  if (bit_depth == 8) {
+  if (!use_highbitdepth) {
     const uint8_t *src = src8;
     const uint8_t *dat = dat8;
     for (i = 0; i < height; ++i) {
@@ -219,12 +222,12 @@ static int64_t get_pixel_proj_error(uint8_t *src8, int width, int height,
 
 #define USE_SGRPROJ_REFINEMENT_SEARCH 1
 static int64_t finer_search_pixel_proj_error(
-    uint8_t *src8, int width, int height, int src_stride, uint8_t *dat8,
-    int dat_stride, int bit_depth, int32_t *flt1, int flt1_stride,
-    int32_t *flt2, int flt2_stride, int start_step, int *xqd) {
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt1,
+    int flt1_stride, int32_t *flt2, int flt2_stride, int start_step, int *xqd) {
   int64_t err = get_pixel_proj_error(src8, width, height, src_stride, dat8,
-                                     dat_stride, bit_depth, flt1, flt1_stride,
-                                     flt2, flt2_stride, xqd);
+                                     dat_stride, use_highbitdepth, flt1,
+                                     flt1_stride, flt2, flt2_stride, xqd);
   (void)start_step;
 #if USE_SGRPROJ_REFINEMENT_SEARCH
   int64_t err2;
@@ -237,8 +240,8 @@ static int64_t finer_search_pixel_proj_error(
         if (xqd[p] - s >= tap_min[p]) {
           xqd[p] -= s;
           err2 = get_pixel_proj_error(src8, width, height, src_stride, dat8,
-                                      dat_stride, bit_depth, flt1, flt1_stride,
-                                      flt2, flt2_stride, xqd);
+                                      dat_stride, use_highbitdepth, flt1,
+                                      flt1_stride, flt2, flt2_stride, xqd);
           if (err2 > err) {
             xqd[p] += s;
           } else {
@@ -255,8 +258,8 @@ static int64_t finer_search_pixel_proj_error(
         if (xqd[p] + s <= tap_max[p]) {
           xqd[p] += s;
           err2 = get_pixel_proj_error(src8, width, height, src_stride, dat8,
-                                      dat_stride, bit_depth, flt1, flt1_stride,
-                                      flt2, flt2_stride, xqd);
+                                      dat_stride, use_highbitdepth, flt1,
+                                      flt1_stride, flt2, flt2_stride, xqd);
           if (err2 > err) {
             xqd[p] -= s;
           } else {
@@ -273,10 +276,11 @@ static int64_t finer_search_pixel_proj_error(
   return err;
 }
 
-static void get_proj_subspace(uint8_t *src8, int width, int height,
+static void get_proj_subspace(const uint8_t *src8, int width, int height,
                               int src_stride, uint8_t *dat8, int dat_stride,
-                              int bit_depth, int32_t *flt1, int flt1_stride,
-                              int32_t *flt2, int flt2_stride, int *xq) {
+                              int use_highbitdepth, int32_t *flt1,
+                              int flt1_stride, int32_t *flt2, int flt2_stride,
+                              int *xq) {
   int i, j;
   double H[2][2] = { { 0, 0 }, { 0, 0 } };
   double C[2] = { 0, 0 };
@@ -289,7 +293,7 @@ static void get_proj_subspace(uint8_t *src8, int width, int height,
   // Default
   xq[0] = 0;
   xq[1] = 0;
-  if (bit_depth == 8) {
+  if (!use_highbitdepth) {
     const uint8_t *src = src8;
     const uint8_t *dat = dat8;
     for (i = 0; i < height; ++i) {
@@ -346,54 +350,83 @@ void encode_xq(int *xq, int *xqd) {
 }
 
 static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
-                                          int dat_stride, uint8_t *src8,
-                                          int src_stride, int bit_depth,
-                                          int *eps, int *xqd, int32_t *rstbuf) {
+                                          int dat_stride, const uint8_t *src8,
+                                          int src_stride, int use_highbitdepth,
+                                          int bit_depth, int pu_width,
+                                          int pu_height, int *eps, int *xqd,
+                                          int32_t *rstbuf) {
   int32_t *flt1 = rstbuf;
   int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
-  int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
   int ep, bestep = 0;
   int64_t err, besterr = -1;
   int exqd[2], bestxqd[2] = { 0, 0 };
+  int flt1_stride = ((width + 7) & ~7) + 8;
+  int flt2_stride = ((width + 7) & ~7) + 8;
+  assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
+         pu_width == RESTORATION_PROC_UNIT_SIZE);
+  assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
+         pu_height == RESTORATION_PROC_UNIT_SIZE);
+#if !CONFIG_HIGHBITDEPTH
+  (void)bit_depth;
+#endif
 
   for (ep = 0; ep < SGRPROJ_PARAMS; ep++) {
     int exq[2];
 #if CONFIG_HIGHBITDEPTH
-    if (bit_depth > 8) {
+    if (use_highbitdepth) {
       uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+      for (int i = 0; i < height; i += pu_height)
+        for (int j = 0; j < width; j += pu_width) {
+          const int w = AOMMIN(pu_width, width - j);
+          const int h = AOMMIN(pu_height, height - i);
+          uint16_t *dat_p = dat + i * dat_stride + j;
+          int32_t *flt1_p = flt1 + i * flt1_stride + j;
+          int32_t *flt2_p = flt2 + i * flt2_stride + j;
 #if USE_HIGHPASS_IN_SGRPROJ
-      av1_highpass_filter_highbd(dat, width, height, dat_stride, flt1, width,
-                                 sgr_params[ep].corner, sgr_params[ep].edge);
+          av1_highpass_filter_highbd(dat_p, w, h, dat_stride, flt1_p,
+                                     flt1_stride, sgr_params[ep].corner,
+                                     sgr_params[ep].edge);
 #else
-      av1_selfguided_restoration_highbd(dat, width, height, dat_stride, flt1,
-                                        width, bit_depth, sgr_params[ep].r1,
-                                        sgr_params[ep].e1, tmpbuf2);
+          av1_selfguided_restoration_highbd(
+              dat_p, w, h, dat_stride, flt1_p, flt1_stride, bit_depth,
+              sgr_params[ep].r1, sgr_params[ep].e1);
 #endif  // USE_HIGHPASS_IN_SGRPROJ
-      av1_selfguided_restoration_highbd(dat, width, height, dat_stride, flt2,
-                                        width, bit_depth, sgr_params[ep].r2,
-                                        sgr_params[ep].e2, tmpbuf2);
+          av1_selfguided_restoration_highbd(
+              dat_p, w, h, dat_stride, flt2_p, flt2_stride, bit_depth,
+              sgr_params[ep].r2, sgr_params[ep].e2);
+        }
     } else {
 #endif
+      for (int i = 0; i < height; i += pu_height)
+        for (int j = 0; j < width; j += pu_width) {
+          const int w = AOMMIN(pu_width, width - j);
+          const int h = AOMMIN(pu_height, height - i);
+          uint8_t *dat_p = dat8 + i * dat_stride + j;
+          int32_t *flt1_p = flt1 + i * flt1_stride + j;
+          int32_t *flt2_p = flt2 + i * flt2_stride + j;
 #if USE_HIGHPASS_IN_SGRPROJ
-      av1_highpass_filter(dat8, width, height, dat_stride, flt1, width,
-                          sgr_params[ep].corner, sgr_params[ep].edge);
+          av1_highpass_filter(dat_p, w, h, dat_stride, flt1_p, flt1_stride,
+                              sgr_params[ep].corner, sgr_params[ep].edge);
 #else
-    av1_selfguided_restoration(dat8, width, height, dat_stride, flt1, width,
-                               sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2);
+        av1_selfguided_restoration(dat_p, w, h, dat_stride, flt1_p, flt1_stride,
+                                   sgr_params[ep].r1, sgr_params[ep].e1);
 #endif  // USE_HIGHPASS_IN_SGRPROJ
-      av1_selfguided_restoration(dat8, width, height, dat_stride, flt2, width,
-                                 sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2);
+          av1_selfguided_restoration(dat_p, w, h, dat_stride, flt2_p,
+                                     flt2_stride, sgr_params[ep].r2,
+                                     sgr_params[ep].e2);
+        }
 #if CONFIG_HIGHBITDEPTH
     }
 #endif
     aom_clear_system_state();
     get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
-                      bit_depth, flt1, width, flt2, width, exq);
+                      use_highbitdepth, flt1, flt1_stride, flt2, flt2_stride,
+                      exq);
     aom_clear_system_state();
     encode_xq(exq, exqd);
-    err = finer_search_pixel_proj_error(src8, width, height, src_stride, dat8,
-                                        dat_stride, bit_depth, flt1, width,
-                                        flt2, width, 2, exqd);
+    err = finer_search_pixel_proj_error(
+        src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth,
+        flt1, flt1_stride, flt2, flt2_stride, 2, exqd);
     if (besterr == -1 || err < besterr) {
       bestep = ep;
       besterr = err;
@@ -420,124 +453,258 @@ static int count_sgrproj_bits(SgrprojInfo *sgrproj_info,
   return bits;
 }
 
-static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
-                             int partial_frame, int plane,
-                             RestorationInfo *info, RestorationType *type,
-                             double *best_tile_cost,
-                             YV12_BUFFER_CONFIG *dst_frame) {
-  SgrprojInfo *sgrproj_info = info->sgrproj_info;
-  double err, cost_norestore, cost_sgrproj;
-  int bits;
-  MACROBLOCK *x = &cpi->td.mb;
+struct rest_search_ctxt {
+  const YV12_BUFFER_CONFIG *src;
+  AV1_COMP *cpi;
+  uint8_t *dgd_buffer;
+  const uint8_t *src_buffer;
+  int dgd_stride;
+  int src_stride;
+  int partial_frame;
+  RestorationInfo *info;
+  RestorationType *type;
+  int64_t *best_tile_cost;
+  int plane;
+  int plane_width;
+  int plane_height;
+  int nrtiles_x;
+  int nrtiles_y;
+  YV12_BUFFER_CONFIG *dst_frame;
+};
+
+// Fill in ctxt. Returns the number of restoration tiles for this plane
+static INLINE int init_rest_search_ctxt(
+    const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, int partial_frame, int plane,
+    RestorationInfo *info, RestorationType *type, int64_t *best_tile_cost,
+    YV12_BUFFER_CONFIG *dst_frame, struct rest_search_ctxt *ctxt) {
   AV1_COMMON *const cm = &cpi->common;
+  ctxt->src = src;
+  ctxt->cpi = cpi;
+  ctxt->partial_frame = partial_frame;
+  ctxt->info = info;
+  ctxt->type = type;
+  ctxt->best_tile_cost = best_tile_cost;
+  ctxt->plane = plane;
+  ctxt->dst_frame = dst_frame;
+
   const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
-  RestorationInfo *rsi = &cpi->rst_search[0];
-  int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
-  int h_start, h_end, v_start, v_end;
-  int width, height, src_stride, dgd_stride;
-  uint8_t *dgd_buffer, *src_buffer;
   if (plane == AOM_PLANE_Y) {
-    width = src->y_crop_width;
-    height = src->y_crop_height;
-    src_buffer = src->y_buffer;
-    src_stride = src->y_stride;
-    dgd_buffer = dgd->y_buffer;
-    dgd_stride = dgd->y_stride;
-    assert(width == dgd->y_crop_width);
-    assert(height == dgd->y_crop_height);
-    assert(width == src->y_crop_width);
-    assert(height == src->y_crop_height);
+    ctxt->plane_width = src->y_crop_width;
+    ctxt->plane_height = src->y_crop_height;
+    ctxt->src_buffer = src->y_buffer;
+    ctxt->src_stride = src->y_stride;
+    ctxt->dgd_buffer = dgd->y_buffer;
+    ctxt->dgd_stride = dgd->y_stride;
+    assert(ctxt->plane_width == dgd->y_crop_width);
+    assert(ctxt->plane_height == dgd->y_crop_height);
+    assert(ctxt->plane_width == src->y_crop_width);
+    assert(ctxt->plane_height == src->y_crop_height);
   } else {
-    width = src->uv_crop_width;
-    height = src->uv_crop_height;
-    src_stride = src->uv_stride;
-    dgd_stride = dgd->uv_stride;
-    src_buffer = plane == AOM_PLANE_U ? src->u_buffer : src->v_buffer;
-    dgd_buffer = plane == AOM_PLANE_U ? dgd->u_buffer : dgd->v_buffer;
-    assert(width == dgd->uv_crop_width);
-    assert(height == dgd->uv_crop_height);
+    ctxt->plane_width = src->uv_crop_width;
+    ctxt->plane_height = src->uv_crop_height;
+    ctxt->src_stride = src->uv_stride;
+    ctxt->dgd_stride = dgd->uv_stride;
+    ctxt->src_buffer = plane == AOM_PLANE_U ? src->u_buffer : src->v_buffer;
+    ctxt->dgd_buffer = plane == AOM_PLANE_U ? dgd->u_buffer : dgd->v_buffer;
+    assert(ctxt->plane_width == dgd->uv_crop_width);
+    assert(ctxt->plane_height == dgd->uv_crop_height);
   }
-  const int ntiles =
-      av1_get_rest_ntiles(width, height, cm->rst_info[0].restoration_tilesize,
-                          &tile_width, &tile_height, &nhtiles, &nvtiles);
-  SgrprojInfo ref_sgrproj_info;
-  set_default_sgrproj(&ref_sgrproj_info);
 
-  rsi[plane].frame_restoration_type = RESTORE_SGRPROJ;
+  return av1_get_rest_ntiles(ctxt->plane_width, ctxt->plane_height,
+                             cm->rst_info[plane].restoration_tilesize, NULL,
+                             NULL, &ctxt->nrtiles_x, &ctxt->nrtiles_y);
+}
 
-  for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
-    rsi[plane].restoration_type[tile_idx] = RESTORE_NONE;
+typedef void (*rtile_visitor_t)(const struct rest_search_ctxt *search_ctxt,
+                                int rtile_idx,
+                                const RestorationTileLimits *limits, void *arg);
+
+static void foreach_rtile_in_tile(const struct rest_search_ctxt *ctxt,
+                                  int tile_row, int tile_col,
+                                  rtile_visitor_t fun, void *arg) {
+  const AV1_COMMON *const cm = &ctxt->cpi->common;
+  const RestorationInfo *rsi = ctxt->cpi->rst_search;
+  TileInfo tile_info;
+
+  av1_tile_set_row(&tile_info, cm, tile_row);
+  av1_tile_set_col(&tile_info, cm, tile_col);
+
+  int tile_col_start = tile_info.mi_col_start * MI_SIZE;
+  int tile_col_end = tile_info.mi_col_end * MI_SIZE;
+  int tile_row_start = tile_info.mi_row_start * MI_SIZE;
+  int tile_row_end = tile_info.mi_row_end * MI_SIZE;
+  if (ctxt->plane > 0) {
+    tile_col_start = ROUND_POWER_OF_TWO(tile_col_start, cm->subsampling_x);
+    tile_col_end = ROUND_POWER_OF_TWO(tile_col_end, cm->subsampling_x);
+    tile_row_start = ROUND_POWER_OF_TWO(tile_row_start, cm->subsampling_y);
+    tile_row_end = ROUND_POWER_OF_TWO(tile_row_end, cm->subsampling_y);
   }
-  // Compute best Sgrproj filters for each tile
-  for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
-    av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
-                             tile_height, width, height, 0, 0, &h_start, &h_end,
-                             &v_start, &v_end);
-    err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start,
-                               h_end - h_start, v_start, v_end - v_start,
-                               (1 << plane));
-    // #bits when a tile is not restored
-    bits = av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 0);
-    cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err);
-    best_tile_cost[tile_idx] = DBL_MAX;
-    search_selfguided_restoration(
-        dgd_buffer + v_start * dgd_stride + h_start, h_end - h_start,
-        v_end - v_start, dgd_stride,
-        src_buffer + v_start * src_stride + h_start, src_stride,
+
+#if CONFIG_FRAME_SUPERRES
+  // If upscaling is enabled, the tile limits need scaling to match the
+  // upscaled frame where the restoration tiles live. To do this, scale up the
+  // top-left and bottom-right of the tile.
+  if (!av1_superres_unscaled(cm)) {
+    av1_calculate_unscaled_superres_size(&tile_col_start, &tile_row_start,
+                                         cm->superres_scale_denominator);
+    av1_calculate_unscaled_superres_size(&tile_col_end, &tile_row_end,
+                                         cm->superres_scale_denominator);
+    // Make sure we don't fall off the bottom-right of the frame.
+    tile_col_end = AOMMIN(tile_col_end, ctxt->plane_width);
+    tile_row_end = AOMMIN(tile_row_end, ctxt->plane_height);
+  }
+#endif  // CONFIG_FRAME_SUPERRES
+
+  const int rtile_size = rsi->restoration_tilesize;
+  const int rtile_col0 = (tile_col_start + rtile_size - 1) / rtile_size;
+  const int rtile_col1 =
+      AOMMIN((tile_col_end + rtile_size - 1) / rtile_size, ctxt->nrtiles_x);
+  const int rtile_row0 = (tile_row_start + rtile_size - 1) / rtile_size;
+  const int rtile_row1 =
+      AOMMIN((tile_row_end + rtile_size - 1) / rtile_size, ctxt->nrtiles_y);
+
+  const int rtile_width = AOMMIN(tile_col_end - tile_col_start, rtile_size);
+  const int rtile_height = AOMMIN(tile_row_end - tile_row_start, rtile_size);
+
+  for (int rtile_row = rtile_row0; rtile_row < rtile_row1; ++rtile_row) {
+    for (int rtile_col = rtile_col0; rtile_col < rtile_col1; ++rtile_col) {
+      const int rtile_idx = rtile_row * ctxt->nrtiles_x + rtile_col;
+      RestorationTileLimits limits = av1_get_rest_tile_limits(
+          rtile_idx, ctxt->nrtiles_x, ctxt->nrtiles_y, rtile_width,
+          rtile_height, ctxt->plane_width,
+#if CONFIG_STRIPED_LOOP_RESTORATION
+          ctxt->plane_height, ctxt->plane > 0 ? cm->subsampling_y : 0);
+#else
+          ctxt->plane_height);
+#endif
+      fun(ctxt, rtile_idx, &limits, arg);
+    }
+  }
+}
+
+static void search_sgrproj_for_rtile(const struct rest_search_ctxt *ctxt,
+                                     int rtile_idx,
+                                     const RestorationTileLimits *limits,
+                                     void *arg) {
+  const MACROBLOCK *const x = &ctxt->cpi->td.mb;
+  const AV1_COMMON *const cm = &ctxt->cpi->common;
+  RestorationInfo *rsi = ctxt->cpi->rst_search;
+  SgrprojInfo *sgrproj_info = ctxt->info->sgrproj_info;
+
+  SgrprojInfo *ref_sgrproj_info = (SgrprojInfo *)arg;
+
+  int64_t err =
+      sse_restoration_tile(ctxt->src, cm->frame_to_show, cm, limits->h_start,
+                           limits->h_end - limits->h_start, limits->v_start,
+                           limits->v_end - limits->v_start, (1 << ctxt->plane));
+  // #bits when a tile is not restored
+  int bits = av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 0);
+  double cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err);
+  ctxt->best_tile_cost[rtile_idx] = INT64_MAX;
+
+  RestorationInfo *plane_rsi = &rsi[ctxt->plane];
+  SgrprojInfo *rtile_sgrproj_info = &plane_rsi->sgrproj_info[rtile_idx];
+  uint8_t *dgd_start =
+      ctxt->dgd_buffer + limits->v_start * ctxt->dgd_stride + limits->h_start;
+  const uint8_t *src_start =
+      ctxt->src_buffer + limits->v_start * ctxt->src_stride + limits->h_start;
+
+  search_selfguided_restoration(
+      dgd_start, limits->h_end - limits->h_start,
+      limits->v_end - limits->v_start, ctxt->dgd_stride, src_start,
+      ctxt->src_stride,
 #if CONFIG_HIGHBITDEPTH
-        cm->bit_depth,
+      cm->use_highbitdepth, cm->bit_depth,
 #else
-        8,
+      0, 8,
 #endif  // CONFIG_HIGHBITDEPTH
-        &rsi[plane].sgrproj_info[tile_idx].ep,
-        rsi[plane].sgrproj_info[tile_idx].xqd, cm->rst_internal.tmpbuf);
-    rsi[plane].restoration_type[tile_idx] = RESTORE_SGRPROJ;
-    err = try_restoration_tile(src, cpi, rsi, (1 << plane), partial_frame,
-                               tile_idx, 0, 0, dst_frame);
-    bits = count_sgrproj_bits(&rsi[plane].sgrproj_info[tile_idx],
-                              &ref_sgrproj_info)
-           << AV1_PROB_COST_SHIFT;
-    bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 1);
-    cost_sgrproj = RDCOST_DBL(x->rdmult, (bits >> 4), err);
-    if (cost_sgrproj >= cost_norestore) {
-      type[tile_idx] = RESTORE_NONE;
-    } else {
-      type[tile_idx] = RESTORE_SGRPROJ;
-      memcpy(&sgrproj_info[tile_idx], &rsi[plane].sgrproj_info[tile_idx],
-             sizeof(sgrproj_info[tile_idx]));
-      memcpy(&ref_sgrproj_info, &sgrproj_info[tile_idx],
-             sizeof(ref_sgrproj_info));
-      best_tile_cost[tile_idx] = err;
+      rsi[ctxt->plane].procunit_width, rsi[ctxt->plane].procunit_height,
+      &rtile_sgrproj_info->ep, rtile_sgrproj_info->xqd,
+      cm->rst_internal.tmpbuf);
+  plane_rsi->restoration_type[rtile_idx] = RESTORE_SGRPROJ;
+  err = try_restoration_tile(ctxt->src, ctxt->cpi, rsi, (1 << ctxt->plane),
+                             ctxt->partial_frame, rtile_idx, ctxt->dst_frame);
+  bits =
+      count_sgrproj_bits(&plane_rsi->sgrproj_info[rtile_idx], ref_sgrproj_info)
+      << AV1_PROB_COST_SHIFT;
+  bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 1);
+  double cost_sgrproj = RDCOST_DBL(x->rdmult, (bits >> 4), err);
+  if (cost_sgrproj >= cost_norestore) {
+    ctxt->type[rtile_idx] = RESTORE_NONE;
+  } else {
+    ctxt->type[rtile_idx] = RESTORE_SGRPROJ;
+    *ref_sgrproj_info = sgrproj_info[rtile_idx] =
+        plane_rsi->sgrproj_info[rtile_idx];
+    ctxt->best_tile_cost[rtile_idx] = err;
+  }
+  plane_rsi->restoration_type[rtile_idx] = RESTORE_NONE;
+}
+
+static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+                             int partial_frame, int plane,
+                             RestorationInfo *info, RestorationType *type,
+                             int64_t *best_tile_cost,
+                             YV12_BUFFER_CONFIG *dst_frame) {
+  struct rest_search_ctxt ctxt;
+  const int nrtiles =
+      init_rest_search_ctxt(src, cpi, partial_frame, plane, info, type,
+                            best_tile_cost, dst_frame, &ctxt);
+
+  RestorationInfo *plane_rsi = &cpi->rst_search[plane];
+  plane_rsi->frame_restoration_type = RESTORE_SGRPROJ;
+  for (int rtile_idx = 0; rtile_idx < nrtiles; ++rtile_idx) {
+    plane_rsi->restoration_type[rtile_idx] = RESTORE_NONE;
+  }
+
+  // Compute best Sgrproj filters for each rtile, one (encoder/decoder)
+  // tile at a time.
+  const AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_HIGHBITDEPTH
+  if (cm->use_highbitdepth)
+    extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width,
+                        ctxt.plane_height, ctxt.dgd_stride, SGRPROJ_BORDER_HORZ,
+                        SGRPROJ_BORDER_VERT);
+  else
+#endif
+    extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height,
+                 ctxt.dgd_stride, SGRPROJ_BORDER_HORZ, SGRPROJ_BORDER_VERT);
+
+  for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
+    for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
+      SgrprojInfo ref_sgrproj_info;
+      set_default_sgrproj(&ref_sgrproj_info);
+      foreach_rtile_in_tile(&ctxt, tile_row, tile_col, search_sgrproj_for_rtile,
+                            &ref_sgrproj_info);
     }
-    rsi[plane].restoration_type[tile_idx] = RESTORE_NONE;
   }
+
   // Cost for Sgrproj filtering
+  SgrprojInfo ref_sgrproj_info;
   set_default_sgrproj(&ref_sgrproj_info);
-  bits = frame_level_restore_bits[rsi[plane].frame_restoration_type]
-         << AV1_PROB_COST_SHIFT;
-  for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
-    bits +=
-        av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, type[tile_idx] != RESTORE_NONE);
-    memcpy(&rsi[plane].sgrproj_info[tile_idx], &sgrproj_info[tile_idx],
-           sizeof(sgrproj_info[tile_idx]));
-    if (type[tile_idx] == RESTORE_SGRPROJ) {
-      bits += count_sgrproj_bits(&rsi[plane].sgrproj_info[tile_idx],
+  SgrprojInfo *sgrproj_info = info->sgrproj_info;
+
+  int bits = frame_level_restore_bits[plane_rsi->frame_restoration_type]
+             << AV1_PROB_COST_SHIFT;
+  for (int rtile_idx = 0; rtile_idx < nrtiles; ++rtile_idx) {
+    bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB,
+                         type[rtile_idx] != RESTORE_NONE);
+    plane_rsi->sgrproj_info[rtile_idx] = sgrproj_info[rtile_idx];
+    if (type[rtile_idx] == RESTORE_SGRPROJ) {
+      bits += count_sgrproj_bits(&plane_rsi->sgrproj_info[rtile_idx],
                                  &ref_sgrproj_info)
               << AV1_PROB_COST_SHIFT;
-      memcpy(&ref_sgrproj_info, &rsi[plane].sgrproj_info[tile_idx],
-             sizeof(ref_sgrproj_info));
+      ref_sgrproj_info = plane_rsi->sgrproj_info[rtile_idx];
     }
-    rsi[plane].restoration_type[tile_idx] = type[tile_idx];
+    plane_rsi->restoration_type[rtile_idx] = type[rtile_idx];
   }
-  err = try_restoration_frame(src, cpi, rsi, (1 << plane), partial_frame,
-                              dst_frame);
-  cost_sgrproj = RDCOST_DBL(x->rdmult, (bits >> 4), err);
-
+  int64_t err = try_restoration_frame(src, cpi, cpi->rst_search, (1 << plane),
+                                      partial_frame, dst_frame);
+  double cost_sgrproj = RDCOST_DBL(cpi->td.mb.rdmult, (bits >> 4), err);
   return cost_sgrproj;
 }
 
-static double find_average(uint8_t *src, int h_start, int h_end, int v_start,
-                           int v_end, int stride) {
+static double find_average(const uint8_t *src, int h_start, int h_end,
+                           int v_start, int v_end, int stride) {
   uint64_t sum = 0;
   double avg = 0;
   int i, j;
@@ -548,47 +715,51 @@ static double find_average(uint8_t *src, int h_start, int h_end, int v_start,
   return avg;
 }
 
-static void compute_stats(uint8_t *dgd, uint8_t *src, int h_start, int h_end,
+static void compute_stats(int wiener_win, const uint8_t *dgd,
+                          const uint8_t *src, int h_start, int h_end,
                           int v_start, int v_end, int dgd_stride,
                           int src_stride, double *M, double *H) {
   int i, j, k, l;
   double Y[WIENER_WIN2];
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
   const double avg =
       find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
 
-  memset(M, 0, sizeof(*M) * WIENER_WIN2);
-  memset(H, 0, sizeof(*H) * WIENER_WIN2 * WIENER_WIN2);
+  memset(M, 0, sizeof(*M) * wiener_win2);
+  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
   for (i = v_start; i < v_end; i++) {
     for (j = h_start; j < h_end; j++) {
       const double X = (double)src[i * src_stride + j] - avg;
       int idx = 0;
-      for (k = -WIENER_HALFWIN; k <= WIENER_HALFWIN; k++) {
-        for (l = -WIENER_HALFWIN; l <= WIENER_HALFWIN; l++) {
+      for (k = -wiener_halfwin; k <= wiener_halfwin; k++) {
+        for (l = -wiener_halfwin; l <= wiener_halfwin; l++) {
           Y[idx] = (double)dgd[(i + l) * dgd_stride + (j + k)] - avg;
           idx++;
         }
       }
-      for (k = 0; k < WIENER_WIN2; ++k) {
+      assert(idx == wiener_win2);
+      for (k = 0; k < wiener_win2; ++k) {
         M[k] += Y[k] * X;
-        H[k * WIENER_WIN2 + k] += Y[k] * Y[k];
-        for (l = k + 1; l < WIENER_WIN2; ++l) {
+        H[k * wiener_win2 + k] += Y[k] * Y[k];
+        for (l = k + 1; l < wiener_win2; ++l) {
           // H is a symmetric matrix, so we only need to fill out the upper
           // triangle here. We can copy it down to the lower triangle outside
           // the (i, j) loops.
-          H[k * WIENER_WIN2 + l] += Y[k] * Y[l];
+          H[k * wiener_win2 + l] += Y[k] * Y[l];
         }
       }
     }
   }
-  for (k = 0; k < WIENER_WIN2; ++k) {
-    for (l = k + 1; l < WIENER_WIN2; ++l) {
-      H[l * WIENER_WIN2 + k] = H[k * WIENER_WIN2 + l];
+  for (k = 0; k < wiener_win2; ++k) {
+    for (l = k + 1; l < wiener_win2; ++l) {
+      H[l * wiener_win2 + k] = H[k * wiener_win2 + l];
     }
   }
 }
 
 #if CONFIG_HIGHBITDEPTH
-static double find_average_highbd(uint16_t *src, int h_start, int h_end,
+static double find_average_highbd(const uint16_t *src, int h_start, int h_end,
                                   int v_start, int v_end, int stride) {
   uint64_t sum = 0;
   double avg = 0;
@@ -600,168 +771,184 @@ static double find_average_highbd(uint16_t *src, int h_start, int h_end,
   return avg;
 }
 
-static void compute_stats_highbd(uint8_t *dgd8, uint8_t *src8, int h_start,
-                                 int h_end, int v_start, int v_end,
-                                 int dgd_stride, int src_stride, double *M,
-                                 double *H) {
+static void compute_stats_highbd(int wiener_win, const uint8_t *dgd8,
+                                 const uint8_t *src8, int h_start, int h_end,
+                                 int v_start, int v_end, int dgd_stride,
+                                 int src_stride, double *M, double *H) {
   int i, j, k, l;
   double Y[WIENER_WIN2];
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
   const double avg =
       find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
 
-  memset(M, 0, sizeof(*M) * WIENER_WIN2);
-  memset(H, 0, sizeof(*H) * WIENER_WIN2 * WIENER_WIN2);
+  memset(M, 0, sizeof(*M) * wiener_win2);
+  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
   for (i = v_start; i < v_end; i++) {
     for (j = h_start; j < h_end; j++) {
       const double X = (double)src[i * src_stride + j] - avg;
       int idx = 0;
-      for (k = -WIENER_HALFWIN; k <= WIENER_HALFWIN; k++) {
-        for (l = -WIENER_HALFWIN; l <= WIENER_HALFWIN; l++) {
+      for (k = -wiener_halfwin; k <= wiener_halfwin; k++) {
+        for (l = -wiener_halfwin; l <= wiener_halfwin; l++) {
           Y[idx] = (double)dgd[(i + l) * dgd_stride + (j + k)] - avg;
           idx++;
         }
       }
-      for (k = 0; k < WIENER_WIN2; ++k) {
+      assert(idx == wiener_win2);
+      for (k = 0; k < wiener_win2; ++k) {
         M[k] += Y[k] * X;
-        H[k * WIENER_WIN2 + k] += Y[k] * Y[k];
-        for (l = k + 1; l < WIENER_WIN2; ++l) {
+        H[k * wiener_win2 + k] += Y[k] * Y[k];
+        for (l = k + 1; l < wiener_win2; ++l) {
           // H is a symmetric matrix, so we only need to fill out the upper
           // triangle here. We can copy it down to the lower triangle outside
           // the (i, j) loops.
-          H[k * WIENER_WIN2 + l] += Y[k] * Y[l];
+          H[k * wiener_win2 + l] += Y[k] * Y[l];
         }
       }
     }
   }
-  for (k = 0; k < WIENER_WIN2; ++k) {
-    for (l = k + 1; l < WIENER_WIN2; ++l) {
-      H[l * WIENER_WIN2 + k] = H[k * WIENER_WIN2 + l];
+  for (k = 0; k < wiener_win2; ++k) {
+    for (l = k + 1; l < wiener_win2; ++l) {
+      H[l * wiener_win2 + k] = H[k * wiener_win2 + l];
     }
   }
 }
 #endif  // CONFIG_HIGHBITDEPTH
 
-static INLINE int wrap_index(int i) {
-  return (i >= WIENER_HALFWIN1 ? WIENER_WIN - 1 - i : i);
+static INLINE int wrap_index(int i, int wiener_win) {
+  const int wiener_halfwin1 = (wiener_win >> 1) + 1;
+  return (i >= wiener_halfwin1 ? wiener_win - 1 - i : i);
 }
 
 // Fix vector b, update vector a
-static void update_a_sep_sym(double **Mc, double **Hc, double *a, double *b) {
+static void update_a_sep_sym(int wiener_win, double **Mc, double **Hc,
+                             double *a, double *b) {
   int i, j;
   double S[WIENER_WIN];
   double A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
-  int w, w2;
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin1 = (wiener_win >> 1) + 1;
   memset(A, 0, sizeof(A));
   memset(B, 0, sizeof(B));
-  for (i = 0; i < WIENER_WIN; i++) {
-    for (j = 0; j < WIENER_WIN; ++j) {
-      const int jj = wrap_index(j);
+  for (i = 0; i < wiener_win; i++) {
+    for (j = 0; j < wiener_win; ++j) {
+      const int jj = wrap_index(j, wiener_win);
       A[jj] += Mc[i][j] * b[i];
     }
   }
-  for (i = 0; i < WIENER_WIN; i++) {
-    for (j = 0; j < WIENER_WIN; j++) {
+  for (i = 0; i < wiener_win; i++) {
+    for (j = 0; j < wiener_win; j++) {
       int k, l;
-      for (k = 0; k < WIENER_WIN; ++k)
-        for (l = 0; l < WIENER_WIN; ++l) {
-          const int kk = wrap_index(k);
-          const int ll = wrap_index(l);
-          B[ll * WIENER_HALFWIN1 + kk] +=
-              Hc[j * WIENER_WIN + i][k * WIENER_WIN2 + l] * b[i] * b[j];
+      for (k = 0; k < wiener_win; ++k)
+        for (l = 0; l < wiener_win; ++l) {
+          const int kk = wrap_index(k, wiener_win);
+          const int ll = wrap_index(l, wiener_win);
+          B[ll * wiener_halfwin1 + kk] +=
+              Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] * b[j];
         }
     }
   }
   // Normalization enforcement in the system of equations itself
-  w = WIENER_WIN;
-  w2 = (w >> 1) + 1;
-  for (i = 0; i < w2 - 1; ++i)
+  for (i = 0; i < wiener_halfwin1 - 1; ++i)
     A[i] -=
-        A[w2 - 1] * 2 + B[i * w2 + w2 - 1] - 2 * B[(w2 - 1) * w2 + (w2 - 1)];
-  for (i = 0; i < w2 - 1; ++i)
-    for (j = 0; j < w2 - 1; ++j)
-      B[i * w2 + j] -= 2 * (B[i * w2 + (w2 - 1)] + B[(w2 - 1) * w2 + j] -
-                            2 * B[(w2 - 1) * w2 + (w2 - 1)]);
-  if (linsolve(w2 - 1, B, w2, A, S)) {
-    S[w2 - 1] = 1.0;
-    for (i = w2; i < w; ++i) {
-      S[i] = S[w - 1 - i];
-      S[w2 - 1] -= 2 * S[i];
+        A[wiener_halfwin1 - 1] * 2 +
+        B[i * wiener_halfwin1 + wiener_halfwin1 - 1] -
+        2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)];
+  for (i = 0; i < wiener_halfwin1 - 1; ++i)
+    for (j = 0; j < wiener_halfwin1 - 1; ++j)
+      B[i * wiener_halfwin1 + j] -=
+          2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] +
+               B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] -
+               2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 +
+                     (wiener_halfwin1 - 1)]);
+  if (linsolve(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) {
+    S[wiener_halfwin1 - 1] = 1.0;
+    for (i = wiener_halfwin1; i < wiener_win; ++i) {
+      S[i] = S[wiener_win - 1 - i];
+      S[wiener_halfwin1 - 1] -= 2 * S[i];
     }
-    memcpy(a, S, w * sizeof(*a));
+    memcpy(a, S, wiener_win * sizeof(*a));
   }
 }
 
 // Fix vector a, update vector b
-static void update_b_sep_sym(double **Mc, double **Hc, double *a, double *b) {
+static void update_b_sep_sym(int wiener_win, double **Mc, double **Hc,
+                             double *a, double *b) {
   int i, j;
   double S[WIENER_WIN];
   double A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
-  int w, w2;
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin1 = (wiener_win >> 1) + 1;
   memset(A, 0, sizeof(A));
   memset(B, 0, sizeof(B));
-  for (i = 0; i < WIENER_WIN; i++) {
-    const int ii = wrap_index(i);
-    for (j = 0; j < WIENER_WIN; j++) A[ii] += Mc[i][j] * a[j];
+  for (i = 0; i < wiener_win; i++) {
+    const int ii = wrap_index(i, wiener_win);
+    for (j = 0; j < wiener_win; j++) A[ii] += Mc[i][j] * a[j];
   }
 
-  for (i = 0; i < WIENER_WIN; i++) {
-    for (j = 0; j < WIENER_WIN; j++) {
-      const int ii = wrap_index(i);
-      const int jj = wrap_index(j);
+  for (i = 0; i < wiener_win; i++) {
+    for (j = 0; j < wiener_win; j++) {
+      const int ii = wrap_index(i, wiener_win);
+      const int jj = wrap_index(j, wiener_win);
       int k, l;
-      for (k = 0; k < WIENER_WIN; ++k)
-        for (l = 0; l < WIENER_WIN; ++l)
-          B[jj * WIENER_HALFWIN1 + ii] +=
-              Hc[i * WIENER_WIN + j][k * WIENER_WIN2 + l] * a[k] * a[l];
+      for (k = 0; k < wiener_win; ++k)
+        for (l = 0; l < wiener_win; ++l)
+          B[jj * wiener_halfwin1 + ii] +=
+              Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] * a[l];
     }
   }
   // Normalization enforcement in the system of equations itself
-  w = WIENER_WIN;
-  w2 = WIENER_HALFWIN1;
-  for (i = 0; i < w2 - 1; ++i)
+  for (i = 0; i < wiener_halfwin1 - 1; ++i)
     A[i] -=
-        A[w2 - 1] * 2 + B[i * w2 + w2 - 1] - 2 * B[(w2 - 1) * w2 + (w2 - 1)];
-  for (i = 0; i < w2 - 1; ++i)
-    for (j = 0; j < w2 - 1; ++j)
-      B[i * w2 + j] -= 2 * (B[i * w2 + (w2 - 1)] + B[(w2 - 1) * w2 + j] -
-                            2 * B[(w2 - 1) * w2 + (w2 - 1)]);
-  if (linsolve(w2 - 1, B, w2, A, S)) {
-    S[w2 - 1] = 1.0;
-    for (i = w2; i < w; ++i) {
-      S[i] = S[w - 1 - i];
-      S[w2 - 1] -= 2 * S[i];
+        A[wiener_halfwin1 - 1] * 2 +
+        B[i * wiener_halfwin1 + wiener_halfwin1 - 1] -
+        2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)];
+  for (i = 0; i < wiener_halfwin1 - 1; ++i)
+    for (j = 0; j < wiener_halfwin1 - 1; ++j)
+      B[i * wiener_halfwin1 + j] -=
+          2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] +
+               B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] -
+               2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 +
+                     (wiener_halfwin1 - 1)]);
+  if (linsolve(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) {
+    S[wiener_halfwin1 - 1] = 1.0;
+    for (i = wiener_halfwin1; i < wiener_win; ++i) {
+      S[i] = S[wiener_win - 1 - i];
+      S[wiener_halfwin1 - 1] -= 2 * S[i];
     }
-    memcpy(b, S, w * sizeof(*b));
+    memcpy(b, S, wiener_win * sizeof(*b));
   }
 }
 
-static int wiener_decompose_sep_sym(double *M, double *H, double *a,
-                                    double *b) {
+static int wiener_decompose_sep_sym(int wiener_win, double *M, double *H,
+                                    double *a, double *b) {
   static const int init_filt[WIENER_WIN] = {
     WIENER_FILT_TAP0_MIDV, WIENER_FILT_TAP1_MIDV, WIENER_FILT_TAP2_MIDV,
     WIENER_FILT_TAP3_MIDV, WIENER_FILT_TAP2_MIDV, WIENER_FILT_TAP1_MIDV,
     WIENER_FILT_TAP0_MIDV,
   };
-  int i, j, iter;
   double *Hc[WIENER_WIN2];
   double *Mc[WIENER_WIN];
-  for (i = 0; i < WIENER_WIN; i++) {
-    Mc[i] = M + i * WIENER_WIN;
-    for (j = 0; j < WIENER_WIN; j++) {
-      Hc[i * WIENER_WIN + j] =
-          H + i * WIENER_WIN * WIENER_WIN2 + j * WIENER_WIN;
-    }
+  int i, j, iter;
+  const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+  const int wiener_win2 = wiener_win * wiener_win;
+  for (i = 0; i < wiener_win; i++) {
+    a[i] = b[i] = (double)init_filt[i + plane_off] / WIENER_FILT_STEP;
   }
-  for (i = 0; i < WIENER_WIN; i++) {
-    a[i] = b[i] = (double)init_filt[i] / WIENER_FILT_STEP;
+  for (i = 0; i < wiener_win; i++) {
+    Mc[i] = M + i * wiener_win;
+    for (j = 0; j < wiener_win; j++) {
+      Hc[i * wiener_win + j] =
+          H + i * wiener_win * wiener_win2 + j * wiener_win;
+    }
   }
 
   iter = 1;
   while (iter < NUM_WIENER_ITERS) {
-    update_a_sep_sym(Mc, Hc, a, b);
-    update_b_sep_sym(Mc, Hc, a, b);
+    update_a_sep_sym(wiener_win, Mc, Hc, a, b);
+    update_b_sep_sym(wiener_win, Mc, Hc, a, b);
     iter++;
   }
   return 1;
@@ -770,14 +957,16 @@ static int wiener_decompose_sep_sym(double *M, double *H, double *a,
 // Computes the function x'*H*x - x'*M for the learned 2D filter x, and compares
 // against identity filters; Final score is defined as the difference between
 // the function values
-static double compute_score(double *M, double *H, InterpKernel vfilt,
-                            InterpKernel hfilt) {
+static double compute_score(int wiener_win, double *M, double *H,
+                            InterpKernel vfilt, InterpKernel hfilt) {
   double ab[WIENER_WIN * WIENER_WIN];
   int i, k, l;
   double P = 0, Q = 0;
   double iP = 0, iQ = 0;
   double Score, iScore;
   double a[WIENER_WIN], b[WIENER_WIN];
+  const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+  const int wiener_win2 = wiener_win * wiener_win;
 
   aom_clear_system_state();
 
@@ -788,32 +977,41 @@ static double compute_score(double *M, double *H, InterpKernel vfilt,
     a[WIENER_HALFWIN] -= 2 * a[i];
     b[WIENER_HALFWIN] -= 2 * b[i];
   }
-  for (k = 0; k < WIENER_WIN; ++k) {
-    for (l = 0; l < WIENER_WIN; ++l) ab[k * WIENER_WIN + l] = a[l] * b[k];
+  memset(ab, 0, sizeof(ab));
+  for (k = 0; k < wiener_win; ++k) {
+    for (l = 0; l < wiener_win; ++l)
+      ab[k * wiener_win + l] = a[l + plane_off] * b[k + plane_off];
   }
-  for (k = 0; k < WIENER_WIN2; ++k) {
+  for (k = 0; k < wiener_win2; ++k) {
     P += ab[k] * M[k];
-    for (l = 0; l < WIENER_WIN2; ++l)
-      Q += ab[k] * H[k * WIENER_WIN2 + l] * ab[l];
+    for (l = 0; l < wiener_win2; ++l)
+      Q += ab[k] * H[k * wiener_win2 + l] * ab[l];
   }
   Score = Q - 2 * P;
 
-  iP = M[WIENER_WIN2 >> 1];
-  iQ = H[(WIENER_WIN2 >> 1) * WIENER_WIN2 + (WIENER_WIN2 >> 1)];
+  iP = M[wiener_win2 >> 1];
+  iQ = H[(wiener_win2 >> 1) * wiener_win2 + (wiener_win2 >> 1)];
   iScore = iQ - 2 * iP;
 
   return Score - iScore;
 }
 
-static void quantize_sym_filter(double *f, InterpKernel fi) {
+static void quantize_sym_filter(int wiener_win, double *f, InterpKernel fi) {
   int i;
-  for (i = 0; i < WIENER_HALFWIN; ++i) {
+  const int wiener_halfwin = (wiener_win >> 1);
+  for (i = 0; i < wiener_halfwin; ++i) {
     fi[i] = RINT(f[i] * WIENER_FILT_STEP);
   }
   // Specialize for 7-tap filter
-  fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV);
-  fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
-  fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+  if (wiener_win == WIENER_WIN) {
+    fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV);
+    fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+    fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+  } else {
+    fi[2] = CLIP(fi[1], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+    fi[1] = CLIP(fi[0], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+    fi[0] = 0;
+  }
   // Satisfy filter constraints
   fi[WIENER_WIN - 1] = fi[0];
   fi[WIENER_WIN - 2] = fi[1];
@@ -822,14 +1020,15 @@ static void quantize_sym_filter(double *f, InterpKernel fi) {
   fi[3] = -2 * (fi[0] + fi[1] + fi[2]);
 }
 
-static int count_wiener_bits(WienerInfo *wiener_info,
+static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info,
                              WienerInfo *ref_wiener_info) {
   int bits = 0;
-  bits += aom_count_primitive_refsubexpfin(
-      WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
-      WIENER_FILT_TAP0_SUBEXP_K,
-      ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
-      wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+  if (wiener_win == WIENER_WIN)
+    bits += aom_count_primitive_refsubexpfin(
+        WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+        WIENER_FILT_TAP0_SUBEXP_K,
+        ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+        wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
   bits += aom_count_primitive_refsubexpfin(
       WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
       WIENER_FILT_TAP1_SUBEXP_K,
@@ -840,11 +1039,12 @@ static int count_wiener_bits(WienerInfo *wiener_info,
       WIENER_FILT_TAP2_SUBEXP_K,
       ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
       wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
-  bits += aom_count_primitive_refsubexpfin(
-      WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
-      WIENER_FILT_TAP0_SUBEXP_K,
-      ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
-      wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+  if (wiener_win == WIENER_WIN)
+    bits += aom_count_primitive_refsubexpfin(
+        WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+        WIENER_FILT_TAP0_SUBEXP_K,
+        ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+        wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
   bits += aom_count_primitive_refsubexpfin(
       WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
       WIENER_FILT_TAP1_SUBEXP_K,
@@ -861,11 +1061,13 @@ static int count_wiener_bits(WienerInfo *wiener_info,
 #define USE_WIENER_REFINEMENT_SEARCH 1
 static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src,
                                         AV1_COMP *cpi, RestorationInfo *rsi,
-                                        int start_step, int plane, int tile_idx,
+                                        int start_step, int plane,
+                                        int wiener_win, int tile_idx,
                                         int partial_frame,
                                         YV12_BUFFER_CONFIG *dst_frame) {
+  const int plane_off = (WIENER_WIN - wiener_win) >> 1;
   int64_t err = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame,
-                                     tile_idx, 0, 0, dst_frame);
+                                     tile_idx, dst_frame);
   (void)start_step;
 #if USE_WIENER_REFINEMENT_SEARCH
   int64_t err2;
@@ -875,7 +1077,7 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src,
                     WIENER_FILT_TAP2_MAXV };
   // printf("err  pre = %"PRId64"\n", err);
   for (int s = start_step; s >= 1; s >>= 1) {
-    for (int p = 0; p < WIENER_HALFWIN; ++p) {
+    for (int p = plane_off; p < WIENER_HALFWIN; ++p) {
       int skip = 0;
       do {
         if (rsi[plane].wiener_info[tile_idx].hfilter[p] - s >= tap_min[p]) {
@@ -883,7 +1085,7 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src,
           rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] -= s;
           rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] += 2 * s;
           err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame,
-                                      tile_idx, 0, 0, dst_frame);
+                                      tile_idx, dst_frame);
           if (err2 > err) {
             rsi[plane].wiener_info[tile_idx].hfilter[p] += s;
             rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] += s;
@@ -904,7 +1106,7 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src,
           rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] += s;
           rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] -= 2 * s;
           err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame,
-                                      tile_idx, 0, 0, dst_frame);
+                                      tile_idx, dst_frame);
           if (err2 > err) {
             rsi[plane].wiener_info[tile_idx].hfilter[p] -= s;
             rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] -= s;
@@ -918,7 +1120,7 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src,
         break;
       } while (1);
     }
-    for (int p = 0; p < WIENER_HALFWIN; ++p) {
+    for (int p = plane_off; p < WIENER_HALFWIN; ++p) {
       int skip = 0;
       do {
         if (rsi[plane].wiener_info[tile_idx].vfilter[p] - s >= tap_min[p]) {
@@ -926,7 +1128,7 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src,
           rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] -= s;
           rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] += 2 * s;
           err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame,
-                                      tile_idx, 0, 0, dst_frame);
+                                      tile_idx, dst_frame);
           if (err2 > err) {
             rsi[plane].wiener_info[tile_idx].vfilter[p] += s;
             rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] += s;
@@ -947,7 +1149,7 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src,
           rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] += s;
           rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] -= 2 * s;
           err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame,
-                                      tile_idx, 0, 0, dst_frame);
+                                      tile_idx, dst_frame);
           if (err2 > err) {
             rsi[plane].wiener_info[tile_idx].vfilter[p] -= s;
             rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] -= s;
@@ -967,154 +1169,157 @@ static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src,
   return err;
 }
 
-static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
-                            int partial_frame, int plane, RestorationInfo *info,
-                            RestorationType *type, double *best_tile_cost,
-                            YV12_BUFFER_CONFIG *dst_frame) {
-  WienerInfo *wiener_info = info->wiener_info;
-  AV1_COMMON *const cm = &cpi->common;
-  RestorationInfo *rsi = cpi->rst_search;
-  int64_t err;
-  int bits;
-  double cost_wiener, cost_norestore;
-  MACROBLOCK *x = &cpi->td.mb;
+static void search_wiener_for_rtile(const struct rest_search_ctxt *ctxt,
+                                    int rtile_idx,
+                                    const RestorationTileLimits *limits,
+                                    void *arg) {
+  const MACROBLOCK *const x = &ctxt->cpi->td.mb;
+  const AV1_COMMON *const cm = &ctxt->cpi->common;
+  RestorationInfo *rsi = ctxt->cpi->rst_search;
+
+  const int wiener_win =
+      (ctxt->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
+
   double M[WIENER_WIN2];
   double H[WIENER_WIN2 * WIENER_WIN2];
   double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN];
-  const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
-  int width, height, src_stride, dgd_stride;
-  uint8_t *dgd_buffer, *src_buffer;
-  if (plane == AOM_PLANE_Y) {
-    width = src->y_crop_width;
-    height = src->y_crop_height;
-    src_buffer = src->y_buffer;
-    src_stride = src->y_stride;
-    dgd_buffer = dgd->y_buffer;
-    dgd_stride = dgd->y_stride;
-    assert(width == dgd->y_crop_width);
-    assert(height == dgd->y_crop_height);
-    assert(width == src->y_crop_width);
-    assert(height == src->y_crop_height);
-  } else {
-    width = src->uv_crop_width;
-    height = src->uv_crop_height;
-    src_stride = src->uv_stride;
-    dgd_stride = dgd->uv_stride;
-    src_buffer = plane == AOM_PLANE_U ? src->u_buffer : src->v_buffer;
-    dgd_buffer = plane == AOM_PLANE_U ? dgd->u_buffer : dgd->v_buffer;
-    assert(width == dgd->uv_crop_width);
-    assert(height == dgd->uv_crop_height);
-  }
-  double score;
-  int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
-  int h_start, h_end, v_start, v_end;
-  const int ntiles = av1_get_rest_ntiles(
-      width, height, cm->rst_info[plane].restoration_tilesize, &tile_width,
-      &tile_height, &nhtiles, &nvtiles);
-  WienerInfo ref_wiener_info;
-  set_default_wiener(&ref_wiener_info);
 
-  rsi[plane].frame_restoration_type = RESTORE_WIENER;
+  WienerInfo *ref_wiener_info = (WienerInfo *)arg;
 
-  for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
-    rsi[plane].restoration_type[tile_idx] = RESTORE_NONE;
-  }
+  int64_t err =
+      sse_restoration_tile(ctxt->src, cm->frame_to_show, cm, limits->h_start,
+                           limits->h_end - limits->h_start, limits->v_start,
+                           limits->v_end - limits->v_start, (1 << ctxt->plane));
+  // #bits when a tile is not restored
+  int bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0);
+  double cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err);
+  ctxt->best_tile_cost[rtile_idx] = INT64_MAX;
 
-// Construct a (WIENER_HALFWIN)-pixel border around the frame
 #if CONFIG_HIGHBITDEPTH
   if (cm->use_highbitdepth)
-    extend_frame_highbd(CONVERT_TO_SHORTPTR(dgd_buffer), width, height,
-                        dgd_stride);
+    compute_stats_highbd(wiener_win, ctxt->dgd_buffer, ctxt->src_buffer,
+                         limits->h_start, limits->h_end, limits->v_start,
+                         limits->v_end, ctxt->dgd_stride, ctxt->src_stride, M,
+                         H);
   else
-#endif
-    extend_frame(dgd_buffer, width, height, dgd_stride);
-
-  // Compute best Wiener filters for each tile
-  for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
-    av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
-                             tile_height, width, height, 0, 0, &h_start, &h_end,
-                             &v_start, &v_end);
-    err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start,
-                               h_end - h_start, v_start, v_end - v_start,
-                               (1 << plane));
-    // #bits when a tile is not restored
-    bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0);
-    cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err);
-    best_tile_cost[tile_idx] = DBL_MAX;
-
-    av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
-                             tile_height, width, height, 0, 0, &h_start, &h_end,
-                             &v_start, &v_end);
-#if CONFIG_HIGHBITDEPTH
-    if (cm->use_highbitdepth)
-      compute_stats_highbd(dgd_buffer, src_buffer, h_start, h_end, v_start,
-                           v_end, dgd_stride, src_stride, M, H);
-    else
 #endif  // CONFIG_HIGHBITDEPTH
-      compute_stats(dgd_buffer, src_buffer, h_start, h_end, v_start, v_end,
-                    dgd_stride, src_stride, M, H);
+    compute_stats(wiener_win, ctxt->dgd_buffer, ctxt->src_buffer,
+                  limits->h_start, limits->h_end, limits->v_start,
+                  limits->v_end, ctxt->dgd_stride, ctxt->src_stride, M, H);
 
-    type[tile_idx] = RESTORE_WIENER;
+  ctxt->type[rtile_idx] = RESTORE_WIENER;
 
-    if (!wiener_decompose_sep_sym(M, H, vfilterd, hfilterd)) {
-      type[tile_idx] = RESTORE_NONE;
-      continue;
-    }
-    quantize_sym_filter(vfilterd, rsi[plane].wiener_info[tile_idx].vfilter);
-    quantize_sym_filter(hfilterd, rsi[plane].wiener_info[tile_idx].hfilter);
-
-    // Filter score computes the value of the function x'*A*x - x'*b for the
-    // learned filter and compares it against identity filer. If there is no
-    // reduction in the function, the filter is reverted back to identity
-    score = compute_score(M, H, rsi[plane].wiener_info[tile_idx].vfilter,
-                          rsi[plane].wiener_info[tile_idx].hfilter);
-    if (score > 0.0) {
-      type[tile_idx] = RESTORE_NONE;
-      continue;
-    }
-    aom_clear_system_state();
+  if (!wiener_decompose_sep_sym(wiener_win, M, H, vfilterd, hfilterd)) {
+    ctxt->type[rtile_idx] = RESTORE_NONE;
+    return;
+  }
 
-    rsi[plane].restoration_type[tile_idx] = RESTORE_WIENER;
-    err = finer_tile_search_wiener(src, cpi, rsi, 4, plane, tile_idx,
-                                   partial_frame, dst_frame);
-    bits =
-        count_wiener_bits(&rsi[plane].wiener_info[tile_idx], &ref_wiener_info)
-        << AV1_PROB_COST_SHIFT;
-    bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1);
-    cost_wiener = RDCOST_DBL(x->rdmult, (bits >> 4), err);
-    if (cost_wiener >= cost_norestore) {
-      type[tile_idx] = RESTORE_NONE;
-    } else {
-      type[tile_idx] = RESTORE_WIENER;
-      memcpy(&wiener_info[tile_idx], &rsi[plane].wiener_info[tile_idx],
-             sizeof(wiener_info[tile_idx]));
-      memcpy(&ref_wiener_info, &rsi[plane].wiener_info[tile_idx],
-             sizeof(ref_wiener_info));
-      best_tile_cost[tile_idx] = err;
+  RestorationInfo *plane_rsi = &rsi[ctxt->plane];
+  WienerInfo *rtile_wiener_info = &plane_rsi->wiener_info[rtile_idx];
+  quantize_sym_filter(wiener_win, vfilterd, rtile_wiener_info->vfilter);
+  quantize_sym_filter(wiener_win, hfilterd, rtile_wiener_info->hfilter);
+
+  // Filter score computes the value of the function x'*A*x - x'*b for the
+  // learned filter and compares it against identity filter. If there is no
+  // reduction in the function, the filter is reverted back to identity
+  double score = compute_score(wiener_win, M, H, rtile_wiener_info->vfilter,
+                               rtile_wiener_info->hfilter);
+  if (score > 0.0) {
+    ctxt->type[rtile_idx] = RESTORE_NONE;
+    return;
+  }
+  aom_clear_system_state();
+
+  plane_rsi->restoration_type[rtile_idx] = RESTORE_WIENER;
+  err = finer_tile_search_wiener(ctxt->src, ctxt->cpi, rsi, 4, ctxt->plane,
+                                 wiener_win, rtile_idx, ctxt->partial_frame,
+                                 ctxt->dst_frame);
+  if (wiener_win != WIENER_WIN) {
+    assert(rtile_wiener_info->vfilter[0] == 0 &&
+           rtile_wiener_info->vfilter[WIENER_WIN - 1] == 0);
+    assert(rtile_wiener_info->hfilter[0] == 0 &&
+           rtile_wiener_info->hfilter[WIENER_WIN - 1] == 0);
+  }
+  bits = count_wiener_bits(wiener_win, rtile_wiener_info, ref_wiener_info)
+         << AV1_PROB_COST_SHIFT;
+  bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1);
+  double cost_wiener = RDCOST_DBL(x->rdmult, (bits >> 4), err);
+  if (cost_wiener >= cost_norestore) {
+    ctxt->type[rtile_idx] = RESTORE_NONE;
+  } else {
+    ctxt->type[rtile_idx] = RESTORE_WIENER;
+    *ref_wiener_info = ctxt->info->wiener_info[rtile_idx] = *rtile_wiener_info;
+    ctxt->best_tile_cost[rtile_idx] = err;
+  }
+  plane_rsi->restoration_type[rtile_idx] = RESTORE_NONE;
+}
+
+static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+                            int partial_frame, int plane, RestorationInfo *info,
+                            RestorationType *type, int64_t *best_tile_cost,
+                            YV12_BUFFER_CONFIG *dst_frame) {
+  struct rest_search_ctxt ctxt;
+  const int nrtiles =
+      init_rest_search_ctxt(src, cpi, partial_frame, plane, info, type,
+                            best_tile_cost, dst_frame, &ctxt);
+
+  RestorationInfo *plane_rsi = &cpi->rst_search[plane];
+  plane_rsi->frame_restoration_type = RESTORE_WIENER;
+  for (int tile_idx = 0; tile_idx < nrtiles; ++tile_idx) {
+    plane_rsi->restoration_type[tile_idx] = RESTORE_NONE;
+  }
+
+  AV1_COMMON *const cm = &cpi->common;
+// Construct a (WIENER_HALFWIN)-pixel border around the frame
+// Note: we use this border to gather stats even though the actual filter
+// may use less border on the top/bottom of a processing unit.
+#if CONFIG_HIGHBITDEPTH
+  if (cm->use_highbitdepth)
+    extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width,
+                        ctxt.plane_height, ctxt.dgd_stride, WIENER_HALFWIN,
+                        WIENER_HALFWIN);
+  else
+#endif
+    extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height,
+                 ctxt.dgd_stride, WIENER_HALFWIN, WIENER_HALFWIN);
+
+  // Compute best Wiener filters for each rtile, one (encoder/decoder)
+  // tile at a time.
+  for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
+    for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
+      WienerInfo ref_wiener_info;
+      set_default_wiener(&ref_wiener_info);
+
+      foreach_rtile_in_tile(&ctxt, tile_row, tile_col, search_wiener_for_rtile,
+                            &ref_wiener_info);
     }
-    rsi[plane].restoration_type[tile_idx] = RESTORE_NONE;
   }
-  // Cost for Wiener filtering
+
+  // cost for Wiener filtering
+  WienerInfo ref_wiener_info;
   set_default_wiener(&ref_wiener_info);
-  bits = frame_level_restore_bits[rsi[plane].frame_restoration_type]
-         << AV1_PROB_COST_SHIFT;
-  for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+  int bits = frame_level_restore_bits[plane_rsi->frame_restoration_type]
+             << AV1_PROB_COST_SHIFT;
+  WienerInfo *wiener_info = info->wiener_info;
+  const int wiener_win =
+      (plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
+
+  for (int tile_idx = 0; tile_idx < nrtiles; ++tile_idx) {
     bits +=
         av1_cost_bit(RESTORE_NONE_WIENER_PROB, type[tile_idx] != RESTORE_NONE);
-    memcpy(&rsi[plane].wiener_info[tile_idx], &wiener_info[tile_idx],
-           sizeof(wiener_info[tile_idx]));
+    plane_rsi->wiener_info[tile_idx] = wiener_info[tile_idx];
+
     if (type[tile_idx] == RESTORE_WIENER) {
-      bits +=
-          count_wiener_bits(&rsi[plane].wiener_info[tile_idx], &ref_wiener_info)
-          << AV1_PROB_COST_SHIFT;
-      memcpy(&ref_wiener_info, &rsi[plane].wiener_info[tile_idx],
-             sizeof(ref_wiener_info));
+      bits += count_wiener_bits(wiener_win, &plane_rsi->wiener_info[tile_idx],
+                                &ref_wiener_info)
+              << AV1_PROB_COST_SHIFT;
+      ref_wiener_info = plane_rsi->wiener_info[tile_idx];
     }
-    rsi[plane].restoration_type[tile_idx] = type[tile_idx];
+    plane_rsi->restoration_type[tile_idx] = type[tile_idx];
   }
-  err = try_restoration_frame(src, cpi, rsi, 1 << plane, partial_frame,
-                              dst_frame);
-  cost_wiener = RDCOST_DBL(x->rdmult, (bits >> 4), err);
+  int64_t err = try_restoration_frame(src, cpi, cpi->rst_search, 1 << plane,
+                                      partial_frame, dst_frame);
+  double cost_wiener = RDCOST_DBL(cpi->td.mb.rdmult, (bits >> 4), err);
 
   return cost_wiener;
 }
@@ -1122,7 +1327,7 @@ static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
 static double search_norestore(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
                                int partial_frame, int plane,
                                RestorationInfo *info, RestorationType *type,
-                               double *best_tile_cost,
+                               int64_t *best_tile_cost,
                                YV12_BUFFER_CONFIG *dst_frame) {
   int64_t err;
   double cost_norestore;
@@ -1130,7 +1335,6 @@ static double search_norestore(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
   MACROBLOCK *x = &cpi->td.mb;
   AV1_COMMON *const cm = &cpi->common;
   int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
-  int h_start, h_end, v_start, v_end;
   int width, height;
   if (plane == AOM_PLANE_Y) {
     width = src->y_crop_width;
@@ -1148,12 +1352,16 @@ static double search_norestore(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
 
   info->frame_restoration_type = RESTORE_NONE;
   for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
-    av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
-                             tile_height, width, height, 0, 0, &h_start, &h_end,
-                             &v_start, &v_end);
-    err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start,
-                               h_end - h_start, v_start, v_end - v_start,
-                               1 << plane);
+    RestorationTileLimits limits = av1_get_rest_tile_limits(
+        tile_idx, nhtiles, nvtiles, tile_width, tile_height, width,
+#if CONFIG_STRIPED_LOOP_RESTORATION
+        height, plane != AOM_PLANE_Y ? cm->subsampling_y : 0);
+#else
+        height);
+#endif
+    err = sse_restoration_tile(src, cm->frame_to_show, cm, limits.h_start,
+                               limits.h_end - limits.h_start, limits.v_start,
+                               limits.v_end - limits.v_start, 1 << plane);
     type[tile_idx] = RESTORE_NONE;
     best_tile_cost[tile_idx] = err;
   }
@@ -1164,74 +1372,88 @@ static double search_norestore(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
   return cost_norestore;
 }
 
+struct switchable_rest_search_ctxt {
+  SgrprojInfo sgrproj_info;
+  WienerInfo wiener_info;
+  RestorationType *const *restore_types;
+  int64_t *const *tile_cost;
+  double cost_switchable;
+};
+
+static void search_switchable_for_rtile(const struct rest_search_ctxt *ctxt,
+                                        int rtile_idx,
+                                        const RestorationTileLimits *limits,
+                                        void *arg) {
+  const MACROBLOCK *x = &ctxt->cpi->td.mb;
+  RestorationInfo *rsi = &ctxt->cpi->common.rst_info[ctxt->plane];
+  struct switchable_rest_search_ctxt *swctxt =
+      (struct switchable_rest_search_ctxt *)arg;
+
+  (void)limits;
+
+  double best_cost =
+      RDCOST_DBL(x->rdmult, (x->switchable_restore_cost[RESTORE_NONE] >> 4),
+                 swctxt->tile_cost[RESTORE_NONE][rtile_idx]);
+  rsi->restoration_type[rtile_idx] = RESTORE_NONE;
+  for (RestorationType r = 1; r < RESTORE_SWITCHABLE_TYPES; r++) {
+    if (force_restore_type != RESTORE_TYPES)
+      if (r != force_restore_type) continue;
+    int tilebits = 0;
+    if (swctxt->restore_types[r][rtile_idx] != r) continue;
+    if (r == RESTORE_WIENER)
+      tilebits += count_wiener_bits(
+          (ctxt->plane == AOM_PLANE_Y ? WIENER_WIN : WIENER_WIN - 2),
+          &rsi->wiener_info[rtile_idx], &swctxt->wiener_info);
+    else if (r == RESTORE_SGRPROJ)
+      tilebits += count_sgrproj_bits(&rsi->sgrproj_info[rtile_idx],
+                                     &swctxt->sgrproj_info);
+    tilebits <<= AV1_PROB_COST_SHIFT;
+    tilebits += x->switchable_restore_cost[r];
+    double cost =
+        RDCOST_DBL(x->rdmult, tilebits >> 4, swctxt->tile_cost[r][rtile_idx]);
+
+    if (cost < best_cost) {
+      rsi->restoration_type[rtile_idx] = r;
+      best_cost = cost;
+    }
+  }
+  if (rsi->restoration_type[rtile_idx] == RESTORE_WIENER)
+    swctxt->wiener_info = rsi->wiener_info[rtile_idx];
+  else if (rsi->restoration_type[rtile_idx] == RESTORE_SGRPROJ)
+    swctxt->sgrproj_info = rsi->sgrproj_info[rtile_idx];
+  if (force_restore_type != RESTORE_TYPES)
+    assert(rsi->restoration_type[rtile_idx] == force_restore_type ||
+           rsi->restoration_type[rtile_idx] == RESTORE_NONE);
+  swctxt->cost_switchable += best_cost;
+}
+
 static double search_switchable_restoration(
     const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, int partial_frame, int plane,
     RestorationType *const restore_types[RESTORE_SWITCHABLE_TYPES],
-    double *const tile_cost[RESTORE_SWITCHABLE_TYPES], RestorationInfo *rsi) {
-  AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *x = &cpi->td.mb;
-  double cost_switchable = 0;
-  int bits, tile_idx;
-  RestorationType r;
-  int width, height;
-  if (plane == AOM_PLANE_Y) {
-    width = src->y_crop_width;
-    height = src->y_crop_height;
-  } else {
-    width = src->uv_crop_width;
-    height = src->uv_crop_height;
-  }
-  const int ntiles = av1_get_rest_ntiles(
-      width, height, cm->rst_info[plane].restoration_tilesize, NULL, NULL, NULL,
-      NULL);
-  SgrprojInfo ref_sgrproj_info;
-  set_default_sgrproj(&ref_sgrproj_info);
-  WienerInfo ref_wiener_info;
-  set_default_wiener(&ref_wiener_info);
-  (void)partial_frame;
+    int64_t *const tile_cost[RESTORE_SWITCHABLE_TYPES], RestorationInfo *rsi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  struct rest_search_ctxt ctxt;
+  init_rest_search_ctxt(src, cpi, partial_frame, plane, NULL, NULL, NULL, NULL,
+                        &ctxt);
+  struct switchable_rest_search_ctxt swctxt;
+  swctxt.restore_types = restore_types;
+  swctxt.tile_cost = tile_cost;
 
   rsi->frame_restoration_type = RESTORE_SWITCHABLE;
-  bits = frame_level_restore_bits[rsi->frame_restoration_type]
-         << AV1_PROB_COST_SHIFT;
-  cost_switchable = RDCOST_DBL(x->rdmult, bits >> 4, 0);
-  for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
-    double best_cost =
-        RDCOST_DBL(x->rdmult, (cpi->switchable_restore_cost[RESTORE_NONE] >> 4),
-                   tile_cost[RESTORE_NONE][tile_idx]);
-    rsi->restoration_type[tile_idx] = RESTORE_NONE;
-    for (r = 1; r < RESTORE_SWITCHABLE_TYPES; r++) {
-      if (force_restore_type != 0)
-        if (r != force_restore_type) continue;
-      int tilebits = 0;
-      if (restore_types[r][tile_idx] != r) continue;
-      if (r == RESTORE_WIENER)
-        tilebits +=
-            count_wiener_bits(&rsi->wiener_info[tile_idx], &ref_wiener_info);
-      else if (r == RESTORE_SGRPROJ)
-        tilebits +=
-            count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info);
-      tilebits <<= AV1_PROB_COST_SHIFT;
-      tilebits += cpi->switchable_restore_cost[r];
-      double cost =
-          RDCOST_DBL(x->rdmult, tilebits >> 4, tile_cost[r][tile_idx]);
-
-      if (cost < best_cost) {
-        rsi->restoration_type[tile_idx] = r;
-        best_cost = cost;
-      }
+  int bits = frame_level_restore_bits[rsi->frame_restoration_type]
+             << AV1_PROB_COST_SHIFT;
+  swctxt.cost_switchable = RDCOST_DBL(cpi->td.mb.rdmult, bits >> 4, 0);
+
+  for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
+    for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
+      set_default_sgrproj(&swctxt.sgrproj_info);
+      set_default_wiener(&swctxt.wiener_info);
+      foreach_rtile_in_tile(&ctxt, tile_row, tile_col,
+                            search_switchable_for_rtile, &swctxt);
     }
-    if (rsi->restoration_type[tile_idx] == RESTORE_WIENER)
-      memcpy(&ref_wiener_info, &rsi->wiener_info[tile_idx],
-             sizeof(ref_wiener_info));
-    else if (rsi->restoration_type[tile_idx] == RESTORE_SGRPROJ)
-      memcpy(&ref_sgrproj_info, &rsi->sgrproj_info[tile_idx],
-             sizeof(ref_sgrproj_info));
-    if (force_restore_type != 0)
-      assert(rsi->restoration_type[tile_idx] == force_restore_type ||
-             rsi->restoration_type[tile_idx] == RESTORE_NONE);
-    cost_switchable += best_cost;
   }
-  return cost_switchable;
+
+  return swctxt.cost_switchable;
 }
 
 void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
@@ -1241,7 +1463,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
   };
   AV1_COMMON *const cm = &cpi->common;
   double cost_restore[RESTORE_TYPES];
-  double *tile_cost[RESTORE_SWITCHABLE_TYPES];
+  int64_t *tile_cost[RESTORE_SWITCHABLE_TYPES];
   RestorationType *restore_types[RESTORE_SWITCHABLE_TYPES];
   double best_cost_restore;
   RestorationType r, best_restore;
@@ -1259,7 +1481,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
 
   // Assume ntiles_uv is never larger that ntiles_y and so the same arrays work.
   for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++) {
-    tile_cost[r] = (double *)aom_malloc(sizeof(*tile_cost[0]) * ntiles_y);
+    tile_cost[r] = (int64_t *)aom_malloc(sizeof(*tile_cost[0]) * ntiles_y);
     restore_types[r] =
         (RestorationType *)aom_malloc(sizeof(*restore_types[0]) * ntiles_y);
   }
@@ -1267,7 +1489,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
   for (int plane = AOM_PLANE_Y; plane <= AOM_PLANE_V; ++plane) {
     for (r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) {
       cost_restore[r] = DBL_MAX;
-      if (force_restore_type != 0)
+      if (force_restore_type != RESTORE_TYPES)
         if (r != RESTORE_NONE && r != force_restore_type) continue;
       cost_restore[r] =
           search_restore_fun[r](src, cpi, method == LPF_PICK_FROM_SUBIMAGE,
@@ -1283,7 +1505,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
     best_cost_restore = DBL_MAX;
     best_restore = 0;
     for (r = 0; r < RESTORE_TYPES; ++r) {
-      if (force_restore_type != 0)
+      if (force_restore_type != RESTORE_TYPES)
         if (r != RESTORE_NONE && r != force_restore_type) continue;
       if (cost_restore[r] < best_cost_restore) {
         best_restore = r;
@@ -1291,7 +1513,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
       }
     }
     cm->rst_info[plane].frame_restoration_type = best_restore;
-    if (force_restore_type != 0)
+    if (force_restore_type != RESTORE_TYPES)
       assert(best_restore == force_restore_type ||
              best_restore == RESTORE_NONE);
     if (best_restore != RESTORE_SWITCHABLE) {
diff --git a/third_party/aom/av1/encoder/random.h b/third_party/aom/av1/encoder/random.h
new file mode 100644
index 000000000..9b2dac965
--- /dev/null
+++ b/third_party/aom/av1/encoder/random.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AV1_ENCODER_RANDOM_H_
+#define AV1_ENCODER_RANDOM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Generate a random number in the range [0, 32768).
+static INLINE unsigned int lcg_rand16(unsigned int *state) {
+  *state = (unsigned int)(*state * 1103515245ULL + 12345);
+  return *state / 65536 % 32768;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AV1_ENCODER_RANDOM_H_
diff --git a/third_party/aom/av1/encoder/ransac.c b/third_party/aom/av1/encoder/ransac.c
index c6e3675be..6d2eb4183 100644
--- a/third_party/aom/av1/encoder/ransac.c
+++ b/third_party/aom/av1/encoder/ransac.c
@@ -17,6 +17,7 @@
 
 #include "av1/encoder/ransac.h"
 #include "av1/encoder/mathutils.h"
+#include "av1/encoder/random.h"
 
 #define MAX_MINPTS 4
 #define MAX_DEGENERATE_ITER 10
@@ -587,12 +588,6 @@ static int find_homography(int np, double *pts1, double *pts2, double *mat) {
   return 0;
 }
 
-// Generate a random number in the range [0, 32768).
-static unsigned int lcg_rand16(unsigned int *state) {
-  *state = (unsigned int)(*state * 1103515245ULL + 12345);
-  return *state / 65536 % 32768;
-}
-
 static int get_rand_indices(int npoints, int minpts, int *indices,
                             unsigned int *seed) {
   int i, j;
diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c
index b546fdffa..a90cb880e 100644
--- a/third_party/aom/av1/encoder/ratectrl.c
+++ b/third_party/aom/av1/encoder/ratectrl.c
@@ -29,6 +29,7 @@
 #include "av1/common/seg_common.h"
 
 #include "av1/encoder/encodemv.h"
+#include "av1/encoder/random.h"
 #include "av1/encoder/ratectrl.h"
 
 // Max rate target for 1080P and below encodes under normal circumstances
@@ -93,9 +94,11 @@ static int gf_low = 400;
 static int kf_high = 5000;
 static int kf_low = 400;
 
-double av1_resize_rate_factor(const AV1_COMP *cpi) {
-  return (double)(cpi->oxcf.width * cpi->oxcf.height) /
-         (cpi->common.width * cpi->common.height);
+// How many times fewer pixels there are to encode given the current scaling.
+// Temporary replacement for rcf_mult and rate_thresh_mult.
+static double resize_rate_factor(const AV1_COMP *cpi, int width, int height) {
+  (void)cpi;
+  return (double)(cpi->oxcf.width * cpi->oxcf.height) / (width * height);
 }
 
 // Functions to compute the active minq lookup table entries based on a
@@ -371,7 +374,8 @@ int av1_rc_drop_frame(AV1_COMP *cpi) {
   }
 }
 
-static double get_rate_correction_factor(const AV1_COMP *cpi) {
+static double get_rate_correction_factor(const AV1_COMP *cpi, int width,
+                                         int height) {
   const RATE_CONTROL *const rc = &cpi->rc;
   double rcf;
 
@@ -389,15 +393,16 @@ static double get_rate_correction_factor(const AV1_COMP *cpi) {
     else
       rcf = rc->rate_correction_factors[INTER_NORMAL];
   }
-  rcf *= av1_resize_rate_factor(cpi);
+  rcf *= resize_rate_factor(cpi, width, height);
   return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
 }
 
-static void set_rate_correction_factor(AV1_COMP *cpi, double factor) {
+static void set_rate_correction_factor(AV1_COMP *cpi, double factor, int width,
+                                       int height) {
   RATE_CONTROL *const rc = &cpi->rc;
 
   // Normalize RCF to account for the size-dependent scaling factor.
-  factor /= av1_resize_rate_factor(cpi);
+  factor /= resize_rate_factor(cpi, width, height);
 
   factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
 
@@ -417,11 +422,14 @@ static void set_rate_correction_factor(AV1_COMP *cpi, double factor) {
   }
 }
 
-void av1_rc_update_rate_correction_factors(AV1_COMP *cpi) {
+void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width,
+                                           int height) {
   const AV1_COMMON *const cm = &cpi->common;
   int correction_factor = 100;
-  double rate_correction_factor = get_rate_correction_factor(cpi);
+  double rate_correction_factor =
+      get_rate_correction_factor(cpi, width, height);
   double adjustment_limit;
+  const int MBs = av1_get_MBs(width, height);
 
   int projected_size_based_on_q = 0;
 
@@ -439,7 +447,7 @@ void av1_rc_update_rate_correction_factors(AV1_COMP *cpi) {
         av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
   } else {
     projected_size_based_on_q =
-        av1_estimate_bits_at_q(cpi->common.frame_type, cm->base_qindex, cm->MBs,
+        av1_estimate_bits_at_q(cpi->common.frame_type, cm->base_qindex, MBs,
                                rate_correction_factor, cm->bit_depth);
   }
   // Work out a size correction factor.
@@ -485,21 +493,24 @@ void av1_rc_update_rate_correction_factors(AV1_COMP *cpi) {
       rate_correction_factor = MIN_BPB_FACTOR;
   }
 
-  set_rate_correction_factor(cpi, rate_correction_factor);
+  set_rate_correction_factor(cpi, rate_correction_factor, width, height);
 }
 
 int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame,
-                      int active_best_quality, int active_worst_quality) {
+                      int active_best_quality, int active_worst_quality,
+                      int width, int height) {
   const AV1_COMMON *const cm = &cpi->common;
   int q = active_worst_quality;
   int last_error = INT_MAX;
   int i, target_bits_per_mb, bits_per_mb_at_this_q;
-  const double correction_factor = get_rate_correction_factor(cpi);
+  const int MBs = av1_get_MBs(width, height);
+  const double correction_factor =
+      get_rate_correction_factor(cpi, width, height);
 
   // Calculate required scaling factor based on target frame size and size of
   // frame produced using previous Q.
   target_bits_per_mb =
-      (int)((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / cm->MBs;
+      (int)((uint64_t)(target_bits_per_frame) << BPER_MB_NORMBITS) / MBs;
 
   i = active_best_quality;
 
@@ -579,8 +590,11 @@ static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) {
     active_worst_quality =
         curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] * 2;
   } else {
-    if (!rc->is_src_frame_alt_ref &&
-        (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame ||
+#if CONFIG_EXT_REFS
+                                      cpi->refresh_alt2_ref_frame ||
+#endif  // CONFIG_EXT_REFS
+                                      cpi->refresh_alt_ref_frame)) {
       active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4
                                              : rc->last_q[INTER_FRAME];
     } else {
@@ -647,8 +661,8 @@ static int calc_active_worst_quality_one_pass_cbr(const AV1_COMP *cpi) {
   return active_worst_quality;
 }
 
-static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi,
-                                             int *bottom_index,
+static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
+                                             int height, int *bottom_index,
                                              int *top_index) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
@@ -678,7 +692,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi,
           rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth);
 
       // Allow somewhat lower kf minq with small image formats.
-      if ((cm->width * cm->height) <= (352 * 288)) {
+      if ((width * height) <= (352 * 288)) {
         q_adj_factor -= 0.25;
       }
 
@@ -740,7 +754,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi,
     q = rc->last_boosted_qindex;
   } else {
     q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
-                          active_worst_quality);
+                          active_worst_quality, width, height);
     if (q > *top_index) {
       // Special case when we are targeting the max allowed rate
       if (rc->this_frame_target >= rc->max_frame_bandwidth)
@@ -770,8 +784,8 @@ static int get_active_cq_level(const RATE_CONTROL *rc,
   return active_cq_level;
 }
 
-static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi,
-                                             int *bottom_index,
+static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
+                                             int height, int *bottom_index,
                                              int *top_index) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
@@ -804,7 +818,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi,
           rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth);
 
       // Allow somewhat lower kf minq with small image formats.
-      if ((cm->width * cm->height) <= (352 * 288)) {
+      if ((width * height) <= (352 * 288)) {
         q_adj_factor -= 0.25;
       }
 
@@ -899,7 +913,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi,
     q = rc->last_boosted_qindex;
   } else {
     q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
-                          active_worst_quality);
+                          active_worst_quality, width, height);
     if (q > *top_index) {
       // Special case when we are targeting the max allowed rate
       if (rc->this_frame_target >= rc->max_frame_bandwidth)
@@ -945,7 +959,8 @@ int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) {
 }
 
 #define STATIC_MOTION_THRESH 95
-static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int *bottom_index,
+static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
+                                         int height, int *bottom_index,
                                          int *top_index) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
@@ -992,7 +1007,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int *bottom_index,
           get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
 
       // Allow somewhat lower kf minq with small image formats.
-      if ((cm->width * cm->height) <= (352 * 288)) {
+      if ((width * height) <= (352 * 288)) {
         q_adj_factor -= 0.25;
       }
 
@@ -1005,8 +1020,11 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int *bottom_index,
       active_best_quality +=
           av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
     }
-  } else if (!rc->is_src_frame_alt_ref &&
-             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+  } else if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame ||
+#if CONFIG_EXT_REFS
+                                           cpi->refresh_alt2_ref_frame ||
+#endif  // CONFIG_EXT_REFS
+                                           cpi->refresh_alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
@@ -1026,7 +1044,11 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int *bottom_index,
       active_best_quality = active_best_quality * 15 / 16;
 
     } else if (oxcf->rc_mode == AOM_Q) {
+#if CONFIG_EXT_REFS
+      if (!cpi->refresh_alt_ref_frame && !cpi->refresh_alt2_ref_frame) {
+#else
       if (!cpi->refresh_alt_ref_frame) {
+#endif  // CONFIG_EXT_REFS
         active_best_quality = cq_level;
       } else {
         active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
@@ -1058,8 +1080,11 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int *bottom_index,
   if ((cpi->oxcf.rc_mode != AOM_Q) &&
       (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
     if (frame_is_intra_only(cm) ||
-        (!rc->is_src_frame_alt_ref &&
-         (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
+        (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame ||
+#if CONFIG_EXT_REFS
+                                       cpi->refresh_alt2_ref_frame ||
+#endif  // CONFIG_EXT_REFS
+                                       cpi->refresh_alt_ref_frame))) {
       active_best_quality -=
           (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
       active_worst_quality += (cpi->twopass.extend_maxq / 2);
@@ -1105,7 +1130,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int *bottom_index,
     }
   } else {
     q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
-                          active_worst_quality);
+                          active_worst_quality, width, height);
     if (q > active_worst_quality) {
       // Special case when we are targeting the max allowed rate.
       if (rc->this_frame_target >= rc->max_frame_bandwidth)
@@ -1126,16 +1151,19 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int *bottom_index,
   return q;
 }
 
-int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int *bottom_index,
-                             int *top_index) {
+int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+                             int *bottom_index, int *top_index) {
   int q;
   if (cpi->oxcf.pass == 0) {
     if (cpi->oxcf.rc_mode == AOM_CBR)
-      q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index);
+      q = rc_pick_q_and_bounds_one_pass_cbr(cpi, width, height, bottom_index,
+                                            top_index);
     else
-      q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index);
+      q = rc_pick_q_and_bounds_one_pass_vbr(cpi, width, height, bottom_index,
+                                            top_index);
   } else {
-    q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index);
+    q = rc_pick_q_and_bounds_two_pass(cpi, width, height, bottom_index,
+                                      top_index);
   }
 
   return q;
@@ -1157,7 +1185,8 @@ void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target,
   }
 }
 
-void av1_rc_set_frame_target(AV1_COMP *cpi, int target) {
+static void rc_set_frame_target(AV1_COMP *cpi, int target, int width,
+                                int height) {
   const AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
 
@@ -1166,11 +1195,11 @@ void av1_rc_set_frame_target(AV1_COMP *cpi, int target) {
   // Modify frame size target when down-scaled.
   if (!av1_frame_unscaled(cm))
     rc->this_frame_target =
-        (int)(rc->this_frame_target * av1_resize_rate_factor(cpi));
+        (int)(rc->this_frame_target * resize_rate_factor(cpi, width, height));
 
   // Target rate per SB64 (including partial SB64s.
-  rc->sb64_target_rate = (int)((int64_t)rc->this_frame_target * 64 * 64) /
-                         (cm->width * cm->height);
+  rc->sb64_target_rate =
+      (int)((int64_t)rc->this_frame_target * 64 * 64) / (width * height);
 }
 
 static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
@@ -1194,7 +1223,7 @@ static void update_golden_frame_stats(AV1_COMP *cpi) {
   //                   only the virtual indices for the reference frame will be
   //                   updated and cpi->refresh_golden_frame will still be zero.
   if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) {
-#else
+#else   // !CONFIG_EXT_REFS
   // Update the Golden frame usage counts.
   if (cpi->refresh_golden_frame) {
 #endif  // CONFIG_EXT_REFS
@@ -1219,7 +1248,11 @@ static void update_golden_frame_stats(AV1_COMP *cpi) {
     // Decrement count down till next gf
     if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
 
+#if CONFIG_EXT_REFS
+  } else if (!cpi->refresh_alt_ref_frame && !cpi->refresh_alt2_ref_frame) {
+#else
   } else if (!cpi->refresh_alt_ref_frame) {
+#endif  // CONFIG_EXT_REFS
     // Decrement count down till next gf
     if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
 
@@ -1240,7 +1273,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
   rc->projected_frame_size = (int)(bytes_used << 3);
 
   // Post encode loop adjustment of Q prediction.
-  av1_rc_update_rate_correction_factors(cpi);
+  av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
 
   // Keep a record of last Q and ambient average Q.
   if (cm->frame_type == KEY_FRAME) {
@@ -1249,7 +1282,11 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
   } else {
     if (!rc->is_src_frame_alt_ref &&
-        !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+        !(cpi->refresh_golden_frame ||
+#if CONFIG_EXT_REFS
+          cpi->refresh_alt2_ref_frame ||
+#endif  // CONFIG_EXT_REFS
+          cpi->refresh_alt_ref_frame)) {
       rc->last_q[INTER_FRAME] = qindex;
       rc->avg_frame_qindex[INTER_FRAME] =
           ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
@@ -1271,6 +1308,9 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
   if ((qindex < rc->last_boosted_qindex) || (cm->frame_type == KEY_FRAME) ||
       (!rc->constrained_gf_group &&
        (cpi->refresh_alt_ref_frame ||
+#if CONFIG_EXT_REFS
+        cpi->refresh_alt2_ref_frame ||
+#endif  // CONFIG_EXT_REFS
         (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
     rc->last_boosted_qindex = qindex;
   }
@@ -1280,6 +1320,10 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
 
   // Rolling monitors of whether we are over or underspending used to help
   // regulate min and Max Q in two pass.
+  if (!av1_frame_unscaled(cm))
+    rc->this_frame_target =
+        (int)(rc->this_frame_target /
+              resize_rate_factor(cpi, cm->width, cm->height));
   if (cm->frame_type != KEY_FRAME) {
     rc->rolling_target_bits = ROUND_POWER_OF_TWO(
         rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
@@ -1294,6 +1338,8 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
   // Actual bits spent
   rc->total_actual_bits += rc->projected_frame_size;
 #if CONFIG_EXT_REFS
+  // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
+  //               differently here for rc->avg_frame_bandwidth.
   rc->total_target_bits +=
       (cm->show_frame || rc->is_bwd_ref_frame) ? rc->avg_frame_bandwidth : 0;
 #else
@@ -1313,6 +1359,8 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
   if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0;
 
 #if CONFIG_EXT_REFS
+  // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
+  //               differently here for rc->avg_frame_bandwidth.
   if (cm->show_frame || rc->is_bwd_ref_frame) {
 #else
   if (cm->show_frame) {
@@ -1320,6 +1368,12 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
     rc->frames_since_key++;
     rc->frames_to_key--;
   }
+  // if (cm->current_video_frame == 1 && cm->show_frame)
+  /*
+  rc->this_frame_target =
+      (int)(rc->this_frame_target / resize_rate_factor(cpi, cm->width,
+  cm->height));
+      */
 }
 
 void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) {
@@ -1394,7 +1448,7 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
     target = calc_iframe_target_size_one_pass_vbr(cpi);
   else
     target = calc_pframe_target_size_one_pass_vbr(cpi);
-  av1_rc_set_frame_target(cpi, target);
+  rc_set_frame_target(cpi, target, cm->width, cm->height);
 }
 
 static int calc_pframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
@@ -1496,7 +1550,7 @@ void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) {
   else
     target = calc_pframe_target_size_one_pass_cbr(cpi);
 
-  av1_rc_set_frame_target(cpi, target);
+  rc_set_frame_target(cpi, target, cm->width, cm->height);
   // TODO(afergs): Decide whether to scale up, down, or not at all
 }
 
@@ -1581,11 +1635,11 @@ void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
   }
 }
 
-void av1_rc_update_framerate(AV1_COMP *cpi) {
-  const AV1_COMMON *const cm = &cpi->common;
+void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) {
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
   int vbr_max_bits;
+  const int MBs = av1_get_MBs(width, height);
 
   rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
   rc->min_frame_bandwidth =
@@ -1605,7 +1659,7 @@ void av1_rc_update_framerate(AV1_COMP *cpi) {
       (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
             100);
   rc->max_frame_bandwidth =
-      AOMMAX(AOMMAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+      AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
 
   av1_rc_set_gf_interval_range(cpi, rc);
 }
@@ -1654,73 +1708,12 @@ static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) {
   }
 }
 
-void av1_set_target_rate(AV1_COMP *cpi) {
+void av1_set_target_rate(AV1_COMP *cpi, int width, int height) {
   RATE_CONTROL *const rc = &cpi->rc;
   int target_rate = rc->base_frame_target;
 
   // Correction to rate target based on prior over or under shoot.
   if (cpi->oxcf.rc_mode == AOM_VBR || cpi->oxcf.rc_mode == AOM_CQ)
     vbr_rate_correction(cpi, &target_rate);
-  av1_rc_set_frame_target(cpi, target_rate);
-}
-
-static unsigned int lcg_rand16(unsigned int *state) {
-  *state = (unsigned int)(*state * 1103515245ULL + 12345);
-  return *state / 65536 % 32768;
-}
-
-uint8_t av1_calculate_next_resize_scale(const AV1_COMP *cpi) {
-  static unsigned int seed = 56789;
-  const AV1EncoderConfig *oxcf = &cpi->oxcf;
-  if (oxcf->pass == 1) return SCALE_DENOMINATOR;
-  uint8_t new_num = SCALE_DENOMINATOR;
-
-  switch (oxcf->resize_mode) {
-    case RESIZE_NONE: new_num = SCALE_DENOMINATOR; break;
-    case RESIZE_FIXED:
-      if (cpi->common.frame_type == KEY_FRAME)
-        new_num = oxcf->resize_kf_scale_numerator;
-      else
-        new_num = oxcf->resize_scale_numerator;
-      break;
-    case RESIZE_DYNAMIC:
-      // RESIZE_DYNAMIC: Just random for now.
-      new_num = lcg_rand16(&seed) % 4 + 13;
-      break;
-    default: assert(0);
-  }
-  return new_num;
-}
-
-#if CONFIG_FRAME_SUPERRES
-// TODO(afergs): Rename av1_rc_update_superres_scale(...)?
-uint8_t av1_calculate_next_superres_scale(const AV1_COMP *cpi, int width,
-                                          int height) {
-  static unsigned int seed = 34567;
-  const AV1EncoderConfig *oxcf = &cpi->oxcf;
-  if (oxcf->pass == 1) return SCALE_DENOMINATOR;
-  uint8_t new_num = SCALE_DENOMINATOR;
-
-  switch (oxcf->superres_mode) {
-    case SUPERRES_NONE: new_num = SCALE_DENOMINATOR; break;
-    case SUPERRES_FIXED:
-      if (cpi->common.frame_type == KEY_FRAME)
-        new_num = oxcf->superres_kf_scale_numerator;
-      else
-        new_num = oxcf->superres_scale_numerator;
-      break;
-    case SUPERRES_DYNAMIC:
-      // SUPERRES_DYNAMIC: Just random for now.
-      new_num = lcg_rand16(&seed) % 9 + 8;
-      break;
-    default: assert(0);
-  }
-
-  // Make sure overall reduction is no more than 1/2 of the source size.
-  av1_calculate_scaled_size(&width, &height, new_num);
-  if (width * 2 < oxcf->width || height * 2 < oxcf->height)
-    new_num = SCALE_DENOMINATOR;
-
-  return new_num;
+  rc_set_frame_target(cpi, target_rate, width, height);
 }
-#endif  // CONFIG_FRAME_SUPERRES
diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h
index 4ebdfadd6..8b410e778 100644
--- a/third_party/aom/av1/encoder/ratectrl.h
+++ b/third_party/aom/av1/encoder/ratectrl.h
@@ -50,6 +50,14 @@ typedef enum {
 #endif  // CONFIG_EXT_REFS
 
 typedef struct {
+  int resize_width;
+  int resize_height;
+#if CONFIG_FRAME_SUPERRES
+  uint8_t superres_denom;
+#endif  // CONFIG_FRAME_SUPERRES
+} size_params_type;
+
+typedef struct {
   // Rate targetting variables
   int base_frame_target;  // A baseline frame target before adjustment
                           // for previous under or over shoot.
@@ -189,10 +197,6 @@ int av1_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
 void av1_rc_get_one_pass_vbr_params(struct AV1_COMP *cpi);
 void av1_rc_get_one_pass_cbr_params(struct AV1_COMP *cpi);
 
-// How many times less pixels there are to encode given the current scaling.
-// Temporary replacement for rcf_mult and rate_thresh_mult.
-double av1_resize_rate_factor(const struct AV1_COMP *cpi);
-
 // Post encode update of the rate control parameters based
 // on bytes used
 void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used);
@@ -201,7 +205,8 @@ void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi);
 
 // Updates rate correction factors
 // Changes only the rate correction factors in the rate control structure.
-void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi);
+void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi, int width,
+                                           int height);
 
 // Decide if we should drop this frame: For 1-pass CBR.
 // Changes only the decimation count in the rate control structure
@@ -214,12 +219,13 @@ void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
                                       int *frame_over_shoot_limit);
 
 // Picks q and q bounds given the target for bits
-int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, int *bottom_index,
-                             int *top_index);
+int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, int width, int height,
+                             int *bottom_index, int *top_index);
 
 // Estimates q to achieve a target bits per frame
 int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame,
-                      int active_best_quality, int active_worst_quality);
+                      int active_best_quality, int active_worst_quality,
+                      int width, int height);
 
 // Estimates bits per mb for a given qindex and correction factor.
 int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
@@ -247,20 +253,15 @@ int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
 
 int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int rf_level, int q);
 
-void av1_rc_update_framerate(struct AV1_COMP *cpi);
+void av1_rc_update_framerate(struct AV1_COMP *cpi, int width, int height);
 
 void av1_rc_set_gf_interval_range(const struct AV1_COMP *const cpi,
                                   RATE_CONTROL *const rc);
 
-void av1_set_target_rate(struct AV1_COMP *cpi);
+void av1_set_target_rate(struct AV1_COMP *cpi, int width, int height);
 
 int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
 
-uint8_t av1_calculate_next_resize_scale(const struct AV1_COMP *cpi);
-#if CONFIG_FRAME_SUPERRES
-uint8_t av1_calculate_next_superres_scale(const struct AV1_COMP *cpi, int width,
-                                          int height);
-#endif  // CONFIG_FRAME_SUPERRES
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c
index da3b6e209..5dd485334 100644
--- a/third_party/aom/av1/encoder/rd.c
+++ b/third_party/aom/av1/encoder/rd.c
@@ -36,6 +36,9 @@
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encoder.h"
+#if CONFIG_LV_MAP
+#include "av1/encoder/encodetxb.h"
+#endif
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/rd.h"
@@ -54,121 +57,301 @@ static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES_ALL] = {
 #if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
   2,  2,  2,
 #endif
-  2,  3,  3,  4, 6, 6, 8, 12, 12, 16, 24, 24, 32,
+  2,  3,  3,  4, 6,  6,  8, 12, 12, 16, 24, 24, 32,
 #if CONFIG_EXT_PARTITION
   48, 48, 64,
 #endif  // CONFIG_EXT_PARTITION
-  4,  4,  8,  8
+  4,  4,  8,  8, 16, 16,
+#if CONFIG_EXT_PARTITION
+  32, 32
+#endif  // CONFIG_EXT_PARTITION
 };
 
-static void fill_mode_costs(AV1_COMP *cpi) {
-  const FRAME_CONTEXT *const fc = cpi->common.fc;
+#if CONFIG_EXT_TX
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA][EXT_TX_SIZES] =
+    {
+#if CONFIG_CHROMA_2X2
+      { 1, 1, 1, 1, 1 },  // unused
+      { 0, 1, 1, 0, 0 },
+      { 0, 0, 0, 1, 0 },
+#if CONFIG_MRC_TX
+      { 0, 0, 0, 0, 1 },
+#endif  // CONFIG_MRC_TX
+#else   // CONFIG_CHROMA_2X2
+      { 1, 1, 1, 1 },  // unused
+      { 1, 1, 0, 0 },
+      { 0, 0, 1, 0 },
+#if CONFIG_MRC_TX
+      { 0, 0, 0, 1 },
+#endif  // CONFIG_MRC_TX
+#endif  // CONFIG_CHROMA_2X2
+    };
+
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][EXT_TX_SIZES] =
+    {
+#if CONFIG_CHROMA_2X2
+      { 1, 1, 1, 1, 1 },  // unused
+      { 0, 1, 1, 0, 0 }, { 0, 0, 0, 1, 0 }, { 0, 0, 0, 0, 1 },
+#if CONFIG_MRC_TX
+      { 0, 0, 0, 0, 1 },
+#endif  // CONFIG_MRC_TX
+#else   // CONFIG_CHROMA_2X2
+      { 1, 1, 1, 1 },  // unused
+      { 1, 1, 0, 0 }, { 0, 0, 1, 0 }, { 0, 0, 0, 1 },
+#if CONFIG_MRC_TX
+      { 0, 0, 0, 1 },
+#endif  // CONFIG_MRC_TX
+#endif  // CONFIG_CHROMA_2X2
+    };
+#endif  // CONFIG_EXT_TX
+
+void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
+                         FRAME_CONTEXT *fc) {
   int i, j;
 
+  if (cm->frame_type == KEY_FRAME) {
+    for (i = 0; i < PARTITION_CONTEXTS_PRIMARY; ++i)
+      av1_cost_tokens_from_cdf(x->partition_cost[i], fc->partition_cdf[i],
+                               NULL);
+#if CONFIG_UNPOISON_PARTITION_CTX
+    for (; i < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++i) {
+      aom_prob p = fc->partition_prob[i][PARTITION_VERT];
+      assert(p > 0);
+      x->partition_cost[i][PARTITION_NONE] = INT_MAX;
+      x->partition_cost[i][PARTITION_HORZ] = INT_MAX;
+      x->partition_cost[i][PARTITION_VERT] = av1_cost_bit(p, 0);
+      x->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1);
+    }
+    for (; i < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++i) {
+      aom_prob p = fc->partition_prob[i][PARTITION_HORZ];
+      assert(p > 0);
+      x->partition_cost[i][PARTITION_NONE] = INT_MAX;
+      x->partition_cost[i][PARTITION_HORZ] = av1_cost_bit(p, 0);
+      x->partition_cost[i][PARTITION_VERT] = INT_MAX;
+      x->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1);
+    }
+    x->partition_cost[PARTITION_CONTEXTS][PARTITION_NONE] = INT_MAX;
+    x->partition_cost[PARTITION_CONTEXTS][PARTITION_HORZ] = INT_MAX;
+    x->partition_cost[PARTITION_CONTEXTS][PARTITION_VERT] = INT_MAX;
+    x->partition_cost[PARTITION_CONTEXTS][PARTITION_SPLIT] = 0;
+#endif  // CONFIG_UNPOISON_PARTITION_CTX
+  }
+
+#if CONFIG_KF_CTX
+  for (i = 0; i < KF_MODE_CONTEXTS; ++i)
+    for (j = 0; j < KF_MODE_CONTEXTS; ++j)
+      av1_cost_tokens_from_cdf(x->y_mode_costs[i][j], fc->kf_y_cdf[i][j], NULL);
+#else
   for (i = 0; i < INTRA_MODES; ++i)
     for (j = 0; j < INTRA_MODES; ++j)
-      av1_cost_tokens_from_cdf(cpi->y_mode_costs[i][j], av1_kf_y_mode_cdf[i][j],
-                               av1_intra_mode_inv);
+      av1_cost_tokens_from_cdf(x->y_mode_costs[i][j], fc->kf_y_cdf[i][j], NULL);
+#endif
 
   for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
-    av1_cost_tokens_from_cdf(cpi->mbmode_cost[i], fc->y_mode_cdf[i],
-                             av1_intra_mode_inv);
-
+    av1_cost_tokens_from_cdf(x->mbmode_cost[i], fc->y_mode_cdf[i], NULL);
   for (i = 0; i < INTRA_MODES; ++i)
-    av1_cost_tokens_from_cdf(cpi->intra_uv_mode_cost[i], fc->uv_mode_cdf[i],
-                             av1_intra_mode_inv);
+    av1_cost_tokens_from_cdf(x->intra_uv_mode_cost[i], fc->uv_mode_cdf[i],
+                             NULL);
 
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    av1_cost_tokens(cpi->switchable_interp_costs[i],
-                    fc->switchable_interp_prob[i], av1_switchable_interp_tree);
+    av1_cost_tokens_from_cdf(x->switchable_interp_costs[i],
+                             fc->switchable_interp_cdf[i], NULL);
 
-#if CONFIG_PALETTE
   for (i = 0; i < PALETTE_BLOCK_SIZES; ++i) {
-    av1_cost_tokens_from_cdf(cpi->palette_y_size_cost[i],
+    av1_cost_tokens_from_cdf(x->palette_y_size_cost[i],
                              fc->palette_y_size_cdf[i], NULL);
-    av1_cost_tokens_from_cdf(cpi->palette_uv_size_cost[i],
+    av1_cost_tokens_from_cdf(x->palette_uv_size_cost[i],
                              fc->palette_uv_size_cdf[i], NULL);
   }
 
   for (i = 0; i < PALETTE_SIZES; ++i) {
     for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) {
-      av1_cost_tokens_from_cdf(cpi->palette_y_color_cost[i][j],
+      av1_cost_tokens_from_cdf(x->palette_y_color_cost[i][j],
                                fc->palette_y_color_index_cdf[i][j], NULL);
-      av1_cost_tokens_from_cdf(cpi->palette_uv_color_cost[i][j],
+      av1_cost_tokens_from_cdf(x->palette_uv_color_cost[i][j],
                                fc->palette_uv_color_index_cdf[i][j], NULL);
     }
   }
-#endif  // CONFIG_PALETTE
+#if CONFIG_MRC_TX
+  for (i = 0; i < PALETTE_SIZES; ++i) {
+    for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) {
+      av1_cost_tokens_from_cdf(x->mrc_mask_inter_cost[i][j],
+                               fc->mrc_mask_inter_cdf[i][j], NULL);
+      av1_cost_tokens_from_cdf(x->mrc_mask_intra_cost[i][j],
+                               fc->mrc_mask_intra_cdf[i][j], NULL);
+    }
+  }
+#endif  // CONFIG_MRC_TX
+
+#if CONFIG_CFL
+  int sign_cost[CFL_JOINT_SIGNS];
+  av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL);
+  for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
+    const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+    const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+    int *cost_u = x->cfl_cost[joint_sign][CFL_PRED_U];
+    int *cost_v = x->cfl_cost[joint_sign][CFL_PRED_V];
+    if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO)
+      memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u));
+    else
+      av1_cost_tokens_from_cdf(cost_u, cdf_u, NULL);
+    if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO)
+      memset(cost_v, 0, CFL_ALPHABET_SIZE * sizeof(*cost_v));
+    else
+      av1_cost_tokens_from_cdf(cost_v, cdf_v, NULL);
+    for (int u = 0; u < CFL_ALPHABET_SIZE; u++)
+      cost_u[u] += sign_cost[joint_sign];
+  }
+#endif  // CONFIG_CFL
 
   for (i = 0; i < MAX_TX_DEPTH; ++i)
     for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
-      av1_cost_tokens(cpi->tx_size_cost[i][j], fc->tx_size_probs[i][j],
-                      av1_tx_size_tree[i]);
+      av1_cost_tokens_from_cdf(x->tx_size_cost[i][j], fc->tx_size_cdf[i][j],
+                               NULL);
 
 #if CONFIG_EXT_TX
+#if CONFIG_LGT_FROM_PRED
+  if (LGT_FROM_PRED_INTRA) {
+    for (i = 0; i < LGT_SIZES; ++i) {
+      for (j = 0; j < INTRA_MODES; ++j) {
+        x->intra_lgt_cost[i][j][0] = av1_cost_bit(fc->intra_lgt_prob[i][j], 0);
+        x->intra_lgt_cost[i][j][1] = av1_cost_bit(fc->intra_lgt_prob[i][j], 1);
+      }
+    }
+  }
+  if (LGT_FROM_PRED_INTER) {
+    for (i = 0; i < LGT_SIZES; ++i) {
+      x->inter_lgt_cost[i][0] = av1_cost_bit(fc->inter_lgt_prob[i], 0);
+      x->inter_lgt_cost[i][1] = av1_cost_bit(fc->inter_lgt_prob[i], 1);
+    }
+  }
+#endif  // CONFIG_LGT_FROM_PRED
   for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
     int s;
     for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
       if (use_inter_ext_tx_for_txsize[s][i]) {
-        av1_cost_tokens(cpi->inter_tx_type_costs[s][i],
-                        fc->inter_ext_tx_prob[s][i], av1_ext_tx_inter_tree[s]);
+        av1_cost_tokens_from_cdf(
+            x->inter_tx_type_costs[s][i], fc->inter_ext_tx_cdf[s][i],
+            av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[1][s]]);
       }
     }
     for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
       if (use_intra_ext_tx_for_txsize[s][i]) {
-        for (j = 0; j < INTRA_MODES; ++j)
-          av1_cost_tokens(cpi->intra_tx_type_costs[s][i][j],
-                          fc->intra_ext_tx_prob[s][i][j],
-                          av1_ext_tx_intra_tree[s]);
+        for (j = 0; j < INTRA_MODES; ++j) {
+          av1_cost_tokens_from_cdf(
+              x->intra_tx_type_costs[s][i][j], fc->intra_ext_tx_cdf[s][i][j],
+              av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[0][s]]);
+        }
       }
     }
   }
 #else
   for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
     for (j = 0; j < TX_TYPES; ++j)
-      av1_cost_tokens(cpi->intra_tx_type_costs[i][j],
-                      fc->intra_ext_tx_prob[i][j], av1_ext_tx_tree);
+      av1_cost_tokens_from_cdf(x->intra_tx_type_costs[i][j],
+                               fc->intra_ext_tx_cdf[i][j], av1_ext_tx_inv);
   }
   for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
-    av1_cost_tokens(cpi->inter_tx_type_costs[i], fc->inter_ext_tx_prob[i],
-                    av1_ext_tx_tree);
+    av1_cost_tokens_from_cdf(x->inter_tx_type_costs[i], fc->inter_ext_tx_cdf[i],
+                             av1_ext_tx_inv);
   }
 #endif  // CONFIG_EXT_TX
 #if CONFIG_EXT_INTRA
 #if CONFIG_INTRA_INTERP
   for (i = 0; i < INTRA_FILTERS + 1; ++i)
-    av1_cost_tokens(cpi->intra_filter_cost[i], fc->intra_filter_probs[i],
-                    av1_intra_filter_tree);
+    av1_cost_tokens_from_cdf(x->intra_filter_cost[i], fc->intra_filter_cdf[i],
+                             NULL);
 #endif  // CONFIG_INTRA_INTERP
 #endif  // CONFIG_EXT_INTRA
 #if CONFIG_LOOP_RESTORATION
-  av1_cost_tokens(cpi->switchable_restore_cost, fc->switchable_restore_prob,
+  av1_cost_tokens(x->switchable_restore_cost, fc->switchable_restore_prob,
                   av1_switchable_restore_tree);
 #endif  // CONFIG_LOOP_RESTORATION
-#if CONFIG_GLOBAL_MOTION
-  for (i = 0; i < TRANS_TYPES; ++i)
-    cpi->gmtype_cost[i] = (1 + (i > 0 ? GLOBAL_TYPE_BITS : 0))
-                          << AV1_PROB_COST_SHIFT;
-#endif  // CONFIG_GLOBAL_MOTION
-}
+#if CONFIG_INTRABC
+  av1_cost_tokens_from_cdf(x->intrabc_cost, fc->intrabc_cdf, NULL);
+#endif  // CONFIG_INTRABC
 
-void av1_fill_token_costs(av1_coeff_cost *c,
-                          av1_coeff_probs_model (*p)[PLANE_TYPES]) {
-  int i, j, k, l;
-  TX_SIZE t;
-  for (t = 0; t < TX_SIZES; ++t)
-    for (i = 0; i < PLANE_TYPES; ++i)
-      for (j = 0; j < REF_TYPES; ++j)
-        for (k = 0; k < COEF_BANDS; ++k)
-          for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
-            aom_prob probs[ENTROPY_NODES];
-            av1_model_to_full_probs(p[t][i][j][k][l], probs);
-            av1_cost_tokens((int *)c[t][i][j][k][0][l], probs, av1_coef_tree);
-            av1_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
-                                 av1_coef_tree);
-            assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
-                   c[t][i][j][k][1][l][EOB_TOKEN]);
-          }
+  if (!frame_is_intra_only(cm)) {
+    for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
+#if CONFIG_NEW_MULTISYMBOL
+      av1_cost_tokens_from_cdf(x->newmv_mode_cost[i], fc->newmv_cdf[i], NULL);
+#else
+      x->newmv_mode_cost[i][0] = av1_cost_bit(fc->newmv_prob[i], 0);
+      x->newmv_mode_cost[i][1] = av1_cost_bit(fc->newmv_prob[i], 1);
+#endif
+    }
+
+    for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) {
+#if CONFIG_NEW_MULTISYMBOL
+      av1_cost_tokens_from_cdf(x->zeromv_mode_cost[i], fc->zeromv_cdf[i], NULL);
+#else
+      x->zeromv_mode_cost[i][0] = av1_cost_bit(fc->zeromv_prob[i], 0);
+      x->zeromv_mode_cost[i][1] = av1_cost_bit(fc->zeromv_prob[i], 1);
+#endif
+    }
+
+    for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
+#if CONFIG_NEW_MULTISYMBOL
+      av1_cost_tokens_from_cdf(x->refmv_mode_cost[i], fc->refmv_cdf[i], NULL);
+#else
+      x->refmv_mode_cost[i][0] = av1_cost_bit(fc->refmv_prob[i], 0);
+      x->refmv_mode_cost[i][1] = av1_cost_bit(fc->refmv_prob[i], 1);
+#endif
+    }
+
+    for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
+#if CONFIG_NEW_MULTISYMBOL
+      av1_cost_tokens_from_cdf(x->drl_mode_cost0[i], fc->drl_cdf[i], NULL);
+#else
+      x->drl_mode_cost0[i][0] = av1_cost_bit(fc->drl_prob[i], 0);
+      x->drl_mode_cost0[i][1] = av1_cost_bit(fc->drl_prob[i], 1);
+#endif
+    }
+    for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+      av1_cost_tokens_from_cdf(x->inter_compound_mode_cost[i],
+                               fc->inter_compound_mode_cdf[i], NULL);
+#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
+    for (i = 0; i < BLOCK_SIZES_ALL; ++i)
+      av1_cost_tokens_from_cdf(x->compound_type_cost[i],
+                               fc->compound_type_cdf[i], NULL);
+#endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
+#if CONFIG_COMPOUND_SINGLEREF
+    for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+      av1_cost_tokens_from_cdf(x->inter_singleref_comp_mode_cost[i],
+                               fc->inter_singleref_comp_mode_cdf[i], NULL);
+#endif  // CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_INTERINTRA
+    for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+      av1_cost_tokens_from_cdf(x->interintra_mode_cost[i],
+                               fc->interintra_mode_cdf[i], NULL);
+#endif  // CONFIG_INTERINTRA
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+    for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
+      av1_cost_tokens_from_cdf(x->motion_mode_cost[i], fc->motion_mode_cdf[i],
+                               NULL);
+    }
+#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+    for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+      av1_cost_tokens_from_cdf(x->motion_mode_cost2[i], fc->ncobmc_cdf[i],
+                               NULL);
+#endif
+#if CONFIG_NEW_MULTISYMBOL || CONFIG_NCOBMC_ADAPT_WEIGHT
+      av1_cost_tokens_from_cdf(x->motion_mode_cost1[i], fc->obmc_cdf[i], NULL);
+#else
+      x->motion_mode_cost1[i][0] = av1_cost_bit(fc->obmc_prob[i], 0);
+      x->motion_mode_cost1[i][1] = av1_cost_bit(fc->obmc_prob[i], 1);
+#endif
+    }
+#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+#if CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT
+    for (i = ADAPT_OVERLAP_BLOCK_8X8; i < ADAPT_OVERLAP_BLOCKS; ++i) {
+      av1_cost_tokens_from_cdf(x->ncobmc_mode_cost[i], fc->ncobmc_mode_cdf[i],
+                               NULL);
+    }
+#endif  // CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT
+#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+  }
 }
 
 // Values are now correlated to quantizer.
@@ -212,11 +395,11 @@ static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
   128, 144, 128, 128, 144,
 #if CONFIG_EXT_REFS
   // TODO(zoeliu): To adjust further following factor values.
-  128, 128, 128
+  128, 128, 128,
   // TODO(weitinglin): We should investigate if the values should be the same
   //                   as the value used by OVERLAY frame
-  ,
-  144
+  144,  // INTNL_OVERLAY_UPDATE
+  128   // INTNL_ARF_UPDATE
 #endif  // CONFIG_EXT_REFS
 };
 
@@ -341,11 +524,170 @@ void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref,
   x->nmvjointcost = x->nmv_vec_cost[nmv_ctx];
 }
 
+#if CONFIG_LV_MAP
+#if !LV_MAP_PROB
+static void get_rate_cost(aom_prob p, int cost[2]) {
+  cost[0] = av1_cost_bit(p, 0);
+  cost[1] = av1_cost_bit(p, 1);
+}
+#endif  // !LV_MAP_PROB
+
+void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc) {
+  for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
+    for (int plane = 0; plane < PLANE_TYPES; ++plane) {
+      LV_MAP_COEFF_COST *pcost = &x->coeff_costs[tx_size][plane];
+
+#if LV_MAP_PROB
+      for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx)
+        av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx],
+                                 fc->txb_skip_cdf[tx_size][ctx], NULL);
+
+      for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx)
+        av1_cost_tokens_from_cdf(pcost->nz_map_cost[ctx],
+                                 fc->nz_map_cdf[tx_size][plane][ctx], NULL);
+
+      for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx)
+        av1_cost_tokens_from_cdf(pcost->eob_cost[ctx],
+                                 fc->eob_flag_cdf[tx_size][plane][ctx], NULL);
+
+      for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
+        av1_cost_tokens_from_cdf(pcost->dc_sign_cost[ctx],
+                                 fc->dc_sign_cdf[plane][ctx], NULL);
+
+      for (int layer = 0; layer < NUM_BASE_LEVELS; ++layer)
+        for (int ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx)
+          av1_cost_tokens_from_cdf(
+              pcost->base_cost[layer][ctx],
+              fc->coeff_base_cdf[tx_size][plane][layer][ctx], NULL);
+
+#if BR_NODE
+      for (int br = 0; br < BASE_RANGE_SETS; ++br)
+        for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx)
+          av1_cost_tokens_from_cdf(pcost->br_cost[br][ctx],
+                                   fc->coeff_br_cdf[tx_size][plane][br][ctx],
+                                   NULL);
+
+      for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+        int lps_rate[2];
+        av1_cost_tokens_from_cdf(lps_rate,
+                                 fc->coeff_lps_cdf[tx_size][plane][ctx], NULL);
+
+        for (int base_range = 0; base_range < COEFF_BASE_RANGE + 1;
+             ++base_range) {
+          int br_set_idx = base_range < COEFF_BASE_RANGE
+                               ? coeff_to_br_index[base_range]
+                               : BASE_RANGE_SETS;
+
+          pcost->lps_cost[ctx][base_range] = 0;
+
+          for (int idx = 0; idx < BASE_RANGE_SETS; ++idx) {
+            if (idx == br_set_idx) {
+              pcost->lps_cost[ctx][base_range] += pcost->br_cost[idx][ctx][1];
+
+              int br_base = br_index_to_coeff[br_set_idx];
+              int br_offset = base_range - br_base;
+              int extra_bits = (1 << br_extra_bits[idx]) - 1;
+              for (int tok = 0; tok < extra_bits; ++tok) {
+                if (tok == br_offset) {
+                  pcost->lps_cost[ctx][base_range] += lps_rate[1];
+                  break;
+                } else {
+                  pcost->lps_cost[ctx][base_range] += lps_rate[0];
+                }
+              }
+              break;
+            } else {
+              pcost->lps_cost[ctx][base_range] += pcost->br_cost[idx][ctx][0];
+            }
+          }
+          // lps_cost[ctx][base_range] now holds the accumulated base range cost
+        }
+      }
+#else   // BR_NODE
+      for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx)
+        av1_cost_tokens_from_cdf(pcost->lps_cost[ctx],
+                                 fc->coeff_lps_cdf[tx_size][plane][ctx], NULL);
+#endif  // BR_NODE
+#if CONFIG_CTX1D
+      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class)
+        av1_cost_tokens_from_cdf(pcost->eob_mode_cost[tx_class],
+                                 fc->eob_mode_cdf[tx_size][plane][tx_class],
+                                 NULL);
+
+      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class)
+        for (int ctx = 0; ctx < EMPTY_LINE_CONTEXTS; ++ctx)
+          av1_cost_tokens_from_cdf(
+              pcost->empty_line_cost[tx_class][ctx],
+              fc->empty_line_cdf[tx_size][plane][tx_class][ctx], NULL);
+
+      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class)
+        for (int ctx = 0; ctx < HV_EOB_CONTEXTS; ++ctx)
+          av1_cost_tokens_from_cdf(
+              pcost->hv_eob_cost[tx_class][ctx],
+              fc->hv_eob_cdf[tx_size][plane][tx_class][ctx], NULL);
+#endif  // CONFIG_CTX1D
+#else   // LV_MAP_PROB
+      for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx)
+        get_rate_cost(fc->txb_skip[tx_size][ctx], pcost->txb_skip_cost[ctx]);
+
+      for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx)
+        get_rate_cost(fc->nz_map[tx_size][plane][ctx], pcost->nz_map_cost[ctx]);
+
+      for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx)
+        get_rate_cost(fc->eob_flag[tx_size][plane][ctx], pcost->eob_cost[ctx]);
+
+      for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
+        get_rate_cost(fc->dc_sign[plane][ctx], pcost->dc_sign_cost[ctx]);
+
+      for (int layer = 0; layer < NUM_BASE_LEVELS; ++layer)
+        for (int ctx = 0; ctx < COEFF_BASE_CONTEXTS; ++ctx)
+          get_rate_cost(fc->coeff_base[tx_size][plane][layer][ctx],
+                        pcost->base_cost[layer][ctx]);
+
+      for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx)
+        get_rate_cost(fc->coeff_lps[tx_size][plane][ctx], pcost->lps_cost[ctx]);
+
+#if CONFIG_CTX1D
+      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class)
+        get_rate_cost(fc->eob_mode[tx_size][plane][tx_class],
+                      pcost->eob_mode_cost[tx_class]);
+
+      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class)
+        for (int ctx = 0; ctx < EMPTY_LINE_CONTEXTS; ++ctx)
+          get_rate_cost(fc->empty_line[tx_size][plane][tx_class][ctx],
+                        pcost->empty_line_cost[tx_class][ctx]);
+
+      for (int tx_class = 0; tx_class < TX_CLASSES; ++tx_class)
+        for (int ctx = 0; ctx < HV_EOB_CONTEXTS; ++ctx)
+          get_rate_cost(fc->hv_eob[tx_size][plane][tx_class][ctx],
+                        pcost->hv_eob_cost[tx_class][ctx]);
+#endif  // CONFIG_CTX1D
+#endif  // LV_MAP_PROB
+    }
+  }
+}
+#endif  // CONFIG_LV_MAP
+
+void av1_fill_token_costs_from_cdf(av1_coeff_cost *cost,
+                                   coeff_cdf_model (*cdf)[PLANE_TYPES]) {
+  for (int tx = 0; tx < TX_SIZES; ++tx) {
+    for (int pt = 0; pt < PLANE_TYPES; ++pt) {
+      for (int rt = 0; rt < REF_TYPES; ++rt) {
+        for (int band = 0; band < COEF_BANDS; ++band) {
+          for (int ctx = 0; ctx < BAND_COEFF_CONTEXTS(band); ++ctx) {
+            av1_cost_tokens_from_cdf(cost[tx][pt][rt][band][ctx],
+                                     cdf[tx][pt][rt][band][ctx], NULL);
+          }
+        }
+      }
+    }
+  }
+}
+
 void av1_initialize_rd_consts(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->td.mb;
   RD_OPT *const rd = &cpi->rd;
-  int i;
   int nmv_ctx;
 
   aom_clear_system_state();
@@ -357,11 +699,25 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) {
   set_block_thresholds(cm, rd);
 
   for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) {
+#if CONFIG_AMVR
+    if (cm->cur_frame_mv_precision_level) {
+      av1_build_nmv_cost_table(x->nmv_vec_cost[nmv_ctx], x->nmvcost[nmv_ctx],
+                               &cm->fc->nmvc[nmv_ctx], MV_SUBPEL_NONE);
+    } else {
+      av1_build_nmv_cost_table(
+          x->nmv_vec_cost[nmv_ctx],
+          cm->allow_high_precision_mv ? x->nmvcost_hp[nmv_ctx]
+                                      : x->nmvcost[nmv_ctx],
+          &cm->fc->nmvc[nmv_ctx], cm->allow_high_precision_mv);
+    }
+
+#else
     av1_build_nmv_cost_table(
         x->nmv_vec_cost[nmv_ctx],
         cm->allow_high_precision_mv ? x->nmvcost_hp[nmv_ctx]
                                     : x->nmvcost[nmv_ctx],
         &cm->fc->nmvc[nmv_ctx], cm->allow_high_precision_mv);
+#endif
   }
   x->mvcost = x->mv_cost_stack[0];
   x->nmvjointcost = x->nmv_vec_cost[0];
@@ -376,106 +732,22 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) {
   }
 #endif
 
+#if CONFIG_GLOBAL_MOTION
   if (cpi->oxcf.pass != 1) {
-    av1_fill_token_costs(x->token_costs, cm->fc->coef_probs);
-
-    if (cm->frame_type == KEY_FRAME) {
-#if CONFIG_EXT_PARTITION_TYPES
-      for (i = 0; i < PARTITION_PLOFFSET; ++i)
-        av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
-                        av1_partition_tree);
-      for (; i < PARTITION_CONTEXTS_PRIMARY; ++i)
-        av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
-                        av1_ext_partition_tree);
+    for (int i = 0; i < TRANS_TYPES; ++i)
+#if GLOBAL_TRANS_TYPES > 4
+      cpi->gmtype_cost[i] = (1 + (i > 0 ? GLOBAL_TYPE_BITS : 0))
+                            << AV1_PROB_COST_SHIFT;
 #else
-      for (i = 0; i < PARTITION_CONTEXTS_PRIMARY; ++i)
-        av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
-                        av1_partition_tree);
-#endif  // CONFIG_EXT_PARTITION_TYPES
-#if CONFIG_UNPOISON_PARTITION_CTX
-      for (; i < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++i) {
-        aom_prob p = cm->fc->partition_prob[i][PARTITION_VERT];
-        assert(p > 0);
-        cpi->partition_cost[i][PARTITION_NONE] = INT_MAX;
-        cpi->partition_cost[i][PARTITION_HORZ] = INT_MAX;
-        cpi->partition_cost[i][PARTITION_VERT] = av1_cost_bit(p, 0);
-        cpi->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1);
-      }
-      for (; i < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++i) {
-        aom_prob p = cm->fc->partition_prob[i][PARTITION_HORZ];
-        assert(p > 0);
-        cpi->partition_cost[i][PARTITION_NONE] = INT_MAX;
-        cpi->partition_cost[i][PARTITION_HORZ] = av1_cost_bit(p, 0);
-        cpi->partition_cost[i][PARTITION_VERT] = INT_MAX;
-        cpi->partition_cost[i][PARTITION_SPLIT] = av1_cost_bit(p, 1);
-      }
-      cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_NONE] = INT_MAX;
-      cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_HORZ] = INT_MAX;
-      cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_VERT] = INT_MAX;
-      cpi->partition_cost[PARTITION_CONTEXTS][PARTITION_SPLIT] = 0;
-#endif  // CONFIG_UNPOISON_PARTITION_CTX
-    }
-
-    fill_mode_costs(cpi);
-
-    if (!frame_is_intra_only(cm)) {
-      for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
-        cpi->newmv_mode_cost[i][0] = av1_cost_bit(cm->fc->newmv_prob[i], 0);
-        cpi->newmv_mode_cost[i][1] = av1_cost_bit(cm->fc->newmv_prob[i], 1);
-      }
-
-      for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) {
-        cpi->zeromv_mode_cost[i][0] = av1_cost_bit(cm->fc->zeromv_prob[i], 0);
-        cpi->zeromv_mode_cost[i][1] = av1_cost_bit(cm->fc->zeromv_prob[i], 1);
-      }
-
-      for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
-        cpi->refmv_mode_cost[i][0] = av1_cost_bit(cm->fc->refmv_prob[i], 0);
-        cpi->refmv_mode_cost[i][1] = av1_cost_bit(cm->fc->refmv_prob[i], 1);
-      }
-
-      for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
-        cpi->drl_mode_cost0[i][0] = av1_cost_bit(cm->fc->drl_prob[i], 0);
-        cpi->drl_mode_cost0[i][1] = av1_cost_bit(cm->fc->drl_prob[i], 1);
-      }
-#if CONFIG_EXT_INTER
-      for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-        av1_cost_tokens((int *)cpi->inter_compound_mode_cost[i],
-                        cm->fc->inter_compound_mode_probs[i],
-                        av1_inter_compound_mode_tree);
-#if CONFIG_COMPOUND_SINGLEREF
-      for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-        av1_cost_tokens((int *)cpi->inter_singleref_comp_mode_cost[i],
-                        cm->fc->inter_singleref_comp_mode_probs[i],
-                        av1_inter_singleref_comp_mode_tree);
-#endif  // CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_INTERINTRA
-      for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
-        av1_cost_tokens((int *)cpi->interintra_mode_cost[i],
-                        cm->fc->interintra_mode_prob[i],
-                        av1_interintra_mode_tree);
-#endif  // CONFIG_INTERINTRA
-#endif  // CONFIG_EXT_INTER
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-      for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
-        av1_cost_tokens((int *)cpi->motion_mode_cost[i],
-                        cm->fc->motion_mode_prob[i], av1_motion_mode_tree);
-      }
-#if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-      for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
-        cpi->motion_mode_cost1[i][0] = av1_cost_bit(cm->fc->obmc_prob[i], 0);
-        cpi->motion_mode_cost1[i][1] = av1_cost_bit(cm->fc->obmc_prob[i], 1);
-      }
-#endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
-#if CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT
-      for (i = ADAPT_OVERLAP_BLOCK_8X8; i < ADAPT_OVERLAP_BLOCKS; ++i) {
-        av1_cost_tokens((int *)cpi->ncobmc_mode_cost[i],
-                        cm->fc->ncobmc_mode_prob[i], av1_ncobmc_mode_tree);
-      }
-#endif
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-    }
+      // IDENTITY: 1 bit
+      // TRANSLATION: 3 bits
+      // ROTZOOM: 2 bits
+      // AFFINE: 3 bits
+      cpi->gmtype_cost[i] = (1 + (i > 0 ? (i == ROTZOOM ? 1 : 2) : 0))
+                            << AV1_PROB_COST_SHIFT;
+#endif  // GLOBAL_TRANS_TYPES > 4
   }
+#endif  // CONFIG_GLOBAL_MOTION
 }
 
 static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
@@ -618,6 +890,26 @@ static void get_entropy_contexts_plane(
             !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]);
       break;
 #if CONFIG_TX64X64
+    case TX_32X64:
+      for (i = 0; i < num_4x4_w; i += 16)
+        t_above[i] =
+            !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]);
+      for (i = 0; i < num_4x4_h; i += 32)
+        t_left[i] =
+            !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8] |
+               *(const uint64_t *)&left[i + 16] |
+               *(const uint64_t *)&left[i + 24]);
+      break;
+    case TX_64X32:
+      for (i = 0; i < num_4x4_w; i += 32)
+        t_above[i] =
+            !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8] |
+               *(const uint64_t *)&above[i + 16] |
+               *(const uint64_t *)&above[i + 24]);
+      for (i = 0; i < num_4x4_h; i += 16)
+        t_left[i] =
+            !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]);
+      break;
     case TX_64X64:
       for (i = 0; i < num_4x4_w; i += 32)
         t_above[i] =
@@ -727,6 +1019,20 @@ static void get_entropy_contexts_plane(
         t_left[i] = !!*(const uint64_t *)&left[i];
       break;
 #if CONFIG_TX64X64
+    case TX_32X64:
+      for (i = 0; i < num_4x4_w; i += 8)
+        t_above[i] = !!*(const uint64_t *)&above[i];
+      for (i = 0; i < num_4x4_h; i += 16)
+        t_left[i] =
+            !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]);
+      break;
+    case TX_64X32:
+      for (i = 0; i < num_4x4_w; i += 16)
+        t_above[i] =
+            !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]);
+      for (i = 0; i < num_4x4_h; i += 8)
+        t_left[i] = !!*(const uint64_t *)&left[i];
+      break;
     case TX_64X64:
       for (i = 0; i < num_4x4_w; i += 16)
         t_above[i] =
@@ -909,8 +1215,8 @@ YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi,
 }
 
 #if CONFIG_DUAL_FILTER
-int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
-  const AV1_COMMON *const cm = &cpi->common;
+int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
+                            const MACROBLOCKD *xd) {
   if (cm->interp_filter == SWITCHABLE) {
     const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
     int inter_filter_cost = 0;
@@ -921,8 +1227,9 @@ int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
           (mbmi->ref_frame[1] > INTRA_FRAME &&
            has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
         const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
-        inter_filter_cost +=
-            cpi->switchable_interp_costs[ctx][mbmi->interp_filter[dir]];
+        const InterpFilter filter =
+            av1_extract_interp_filter(mbmi->interp_filters, dir);
+        inter_filter_cost += x->switchable_interp_costs[ctx][filter];
       }
     }
     return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
@@ -931,13 +1238,15 @@ int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
   }
 }
 #else
-int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
-  const AV1_COMMON *const cm = &cpi->common;
+int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
+                            const MACROBLOCKD *xd) {
   if (cm->interp_filter == SWITCHABLE) {
     const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
     const int ctx = av1_get_pred_context_switchable_interp(xd);
+    const InterpFilter filter =
+        av1_extract_interp_filter(mbmi->interp_filters, 0);
     return SWITCHABLE_INTERP_RATE_FACTOR *
-           cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
+           x->switchable_interp_costs[ctx][filter];
   }
   return 0;
 }
@@ -957,6 +1266,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
     rd->thresh_mult[THR_NEARESTL2] = 300;
     rd->thresh_mult[THR_NEARESTL3] = 300;
     rd->thresh_mult[THR_NEARESTB] = 300;
+    rd->thresh_mult[THR_NEARESTA2] = 300;
 #endif  // CONFIG_EXT_REFS
     rd->thresh_mult[THR_NEARESTA] = 300;
     rd->thresh_mult[THR_NEARESTG] = 300;
@@ -966,6 +1276,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
     rd->thresh_mult[THR_NEARESTL2] = 0;
     rd->thresh_mult[THR_NEARESTL3] = 0;
     rd->thresh_mult[THR_NEARESTB] = 0;
+    rd->thresh_mult[THR_NEARESTA2] = 0;
 #endif  // CONFIG_EXT_REFS
     rd->thresh_mult[THR_NEARESTA] = 0;
     rd->thresh_mult[THR_NEARESTG] = 0;
@@ -978,6 +1289,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_NEWL2] += 1000;
   rd->thresh_mult[THR_NEWL3] += 1000;
   rd->thresh_mult[THR_NEWB] += 1000;
+  rd->thresh_mult[THR_NEWA2] += 1000;
 #endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_NEWA] += 1000;
   rd->thresh_mult[THR_NEWG] += 1000;
@@ -987,6 +1299,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_NEARL2] += 1000;
   rd->thresh_mult[THR_NEARL3] += 1000;
   rd->thresh_mult[THR_NEARB] += 1000;
+  rd->thresh_mult[THR_NEARA2] += 1000;
 #endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_NEARA] += 1000;
   rd->thresh_mult[THR_NEARG] += 1000;
@@ -996,14 +1309,13 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_ZEROL2] += 2000;
   rd->thresh_mult[THR_ZEROL3] += 2000;
   rd->thresh_mult[THR_ZEROB] += 2000;
+  rd->thresh_mult[THR_ZEROA2] += 2000;
 #endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_ZEROG] += 2000;
   rd->thresh_mult[THR_ZEROA] += 2000;
 
   rd->thresh_mult[THR_TM] += 1000;
 
-#if CONFIG_EXT_INTER
-
 #if CONFIG_COMPOUND_SINGLEREF
   rd->thresh_mult[THR_SR_NEAREST_NEARMV] += 1200;
 #if CONFIG_EXT_REFS
@@ -1063,6 +1375,10 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000;
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000;
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA2] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A2] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] += 1000;
 
 #if CONFIG_EXT_COMP_REFS
   rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] += 1000;
@@ -1072,31 +1388,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
 #endif  // CONFIG_EXT_COMP_REFS
 #endif  // CONFIG_EXT_REFS
 
-#else  // CONFIG_EXT_INTER
-
-  rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
-#if CONFIG_EXT_REFS
-  rd->thresh_mult[THR_COMP_NEARESTL2A] += 1000;
-  rd->thresh_mult[THR_COMP_NEARESTL3A] += 1000;
-#endif  // CONFIG_EXT_REFS
-  rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
-#if CONFIG_EXT_REFS
-  rd->thresh_mult[THR_COMP_NEARESTLB] += 1000;
-  rd->thresh_mult[THR_COMP_NEARESTL2B] += 1000;
-  rd->thresh_mult[THR_COMP_NEARESTL3B] += 1000;
-  rd->thresh_mult[THR_COMP_NEARESTGB] += 1000;
-#if CONFIG_EXT_COMP_REFS
-  rd->thresh_mult[THR_COMP_NEARESTLL2] += 1000;
-  rd->thresh_mult[THR_COMP_NEARESTLL3] += 1000;
-  rd->thresh_mult[THR_COMP_NEARESTLG] += 1000;
-  rd->thresh_mult[THR_COMP_NEARESTBA] += 1000;
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
-
-#endif  // CONFIG_EXT_INTER
-
-#if CONFIG_EXT_INTER
-
   rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500;
   rd->thresh_mult[THR_COMP_NEW_NEARESTLA] += 1500;
@@ -1164,6 +1455,38 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_NEW_NEWGB] += 2000;
   rd->thresh_mult[THR_COMP_ZERO_ZEROGB] += 2500;
 
+  rd->thresh_mult[THR_COMP_NEAR_NEARLA2] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLA2] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARLA2] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWLA2] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROLA2] += 2500;
+
+  rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL2A2] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARL2A2] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL2A2] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROL2A2] += 2500;
+
+  rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARL3A2] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL3A2] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROL3A2] += 2500;
+
+  rd->thresh_mult[THR_COMP_NEAR_NEARGA2] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTGA2] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWGA2] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARGA2] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROGA2] += 2500;
+
 #if CONFIG_EXT_COMP_REFS
   rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1200;
   rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 1500;
@@ -1199,64 +1522,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
 #endif  // CONFIG_EXT_COMP_REFS
 #endif  // CONFIG_EXT_REFS
 
-#else  // CONFIG_EXT_INTER
-
-  rd->thresh_mult[THR_COMP_NEARLA] += 1500;
-  rd->thresh_mult[THR_COMP_NEWLA] += 2000;
-#if CONFIG_EXT_REFS
-  rd->thresh_mult[THR_COMP_NEARL2A] += 1500;
-  rd->thresh_mult[THR_COMP_NEWL2A] += 2000;
-  rd->thresh_mult[THR_COMP_NEARL3A] += 1500;
-  rd->thresh_mult[THR_COMP_NEWL3A] += 2000;
-#endif  // CONFIG_EXT_REFS
-  rd->thresh_mult[THR_COMP_NEARGA] += 1500;
-  rd->thresh_mult[THR_COMP_NEWGA] += 2000;
-
-#if CONFIG_EXT_REFS
-  rd->thresh_mult[THR_COMP_NEARLB] += 1500;
-  rd->thresh_mult[THR_COMP_NEWLB] += 2000;
-  rd->thresh_mult[THR_COMP_NEARL2B] += 1500;
-  rd->thresh_mult[THR_COMP_NEWL2B] += 2000;
-  rd->thresh_mult[THR_COMP_NEARL3B] += 1500;
-  rd->thresh_mult[THR_COMP_NEWL3B] += 2000;
-  rd->thresh_mult[THR_COMP_NEARGB] += 1500;
-  rd->thresh_mult[THR_COMP_NEWGB] += 2000;
-
-#if CONFIG_EXT_COMP_REFS
-  rd->thresh_mult[THR_COMP_NEARLL2] += 1500;
-  rd->thresh_mult[THR_COMP_NEWLL2] += 2000;
-  rd->thresh_mult[THR_COMP_NEARLL3] += 1500;
-  rd->thresh_mult[THR_COMP_NEWLL3] += 2000;
-  rd->thresh_mult[THR_COMP_NEARLG] += 1500;
-  rd->thresh_mult[THR_COMP_NEWLG] += 2000;
-  rd->thresh_mult[THR_COMP_NEARBA] += 1500;
-  rd->thresh_mult[THR_COMP_NEWBA] += 2000;
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
-
-  rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
-#if CONFIG_EXT_REFS
-  rd->thresh_mult[THR_COMP_ZEROL2A] += 2500;
-  rd->thresh_mult[THR_COMP_ZEROL3A] += 2500;
-#endif  // CONFIG_EXT_REFS
-  rd->thresh_mult[THR_COMP_ZEROGA] += 2500;
-
-#if CONFIG_EXT_REFS
-  rd->thresh_mult[THR_COMP_ZEROLB] += 2500;
-  rd->thresh_mult[THR_COMP_ZEROL2B] += 2500;
-  rd->thresh_mult[THR_COMP_ZEROL3B] += 2500;
-  rd->thresh_mult[THR_COMP_ZEROGB] += 2500;
-
-#if CONFIG_EXT_COMP_REFS
-  rd->thresh_mult[THR_COMP_ZEROLL2] += 2500;
-  rd->thresh_mult[THR_COMP_ZEROLL3] += 2500;
-  rd->thresh_mult[THR_COMP_ZEROLG] += 2500;
-  rd->thresh_mult[THR_COMP_ZEROBA] += 2500;
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
-
-#endif  // CONFIG_EXT_INTER
-
   rd->thresh_mult[THR_H_PRED] += 2000;
   rd->thresh_mult[THR_V_PRED] += 2000;
   rd->thresh_mult[THR_D135_PRED] += 2500;
@@ -1266,7 +1531,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_D117_PRED] += 2500;
   rd->thresh_mult[THR_D45_PRED] += 2500;
 
-#if CONFIG_EXT_INTER
   rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL] += 1500;
   rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += 1500;
   rd->thresh_mult[THR_COMP_INTERINTRA_NEARL] += 1500;
@@ -1294,13 +1558,17 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
   rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTB] += 1500;
   rd->thresh_mult[THR_COMP_INTERINTRA_NEARB] += 1500;
   rd->thresh_mult[THR_COMP_INTERINTRA_NEWB] += 2000;
+
+  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROA2] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTA2] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARA2] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEWA2] += 2000;
 #endif  // CONFIG_EXT_REFS
 
   rd->thresh_mult[THR_COMP_INTERINTRA_ZEROA] += 1500;
   rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += 1500;
   rd->thresh_mult[THR_COMP_INTERINTRA_NEARA] += 1500;
   rd->thresh_mult[THR_COMP_INTERINTRA_NEWA] += 2000;
-#endif  // CONFIG_EXT_INTER
 }
 
 void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) {
@@ -1312,6 +1580,11 @@ void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) {
     2500,
     2500,
     2500,
+    2500,
+    4500,
+    4500,
+    4500,
+    4500,
     4500,
     4500,
     4500,
@@ -1321,7 +1594,7 @@ void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) {
     4500,
     4500,
     2500
-#else
+#else  // !CONFIG_EXT_REFS
     2500,
     2500,
     2500,
diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h
index ea5115b41..35ada8e6c 100644
--- a/third_party/aom/av1/encoder/rd.h
+++ b/third_party/aom/av1/encoder/rd.h
@@ -43,14 +43,6 @@ extern "C" {
 #define MV_COST_WEIGHT 108
 #define MV_COST_WEIGHT_SUB 120
 
-#define INVALID_MV 0x80008000
-
-#if CONFIG_EXT_REFS
-#define MAX_REFS 15
-#else
-#define MAX_REFS 6
-#endif  // CONFIG_EXT_REFS
-
 #define RD_THRESH_MAX_FACT 64
 #define RD_THRESH_INC 1
 
@@ -62,6 +54,7 @@ typedef enum {
   THR_NEARESTL2,
   THR_NEARESTL3,
   THR_NEARESTB,
+  THR_NEARESTA2,
 #endif  // CONFIG_EXT_REFS
   THR_NEARESTA,
   THR_NEARESTG,
@@ -73,6 +66,7 @@ typedef enum {
   THR_NEWL2,
   THR_NEWL3,
   THR_NEWB,
+  THR_NEWA2,
 #endif  // CONFIG_EXT_REFS
   THR_NEWA,
   THR_NEWG,
@@ -82,6 +76,7 @@ typedef enum {
   THR_NEARL2,
   THR_NEARL3,
   THR_NEARB,
+  THR_NEARA2,
 #endif  // CONFIG_EXT_REFS
   THR_NEARA,
   THR_NEARG,
@@ -91,11 +86,10 @@ typedef enum {
   THR_ZEROL2,
   THR_ZEROL3,
   THR_ZEROB,
+  THR_ZEROA2,
 #endif  // CONFIG_EXT_REFS
-  THR_ZEROG,
   THR_ZEROA,
-
-#if CONFIG_EXT_INTER
+  THR_ZEROG,
 
 #if CONFIG_COMPOUND_SINGLEREF
   THR_SR_NEAREST_NEARMV,
@@ -156,6 +150,10 @@ typedef enum {
   THR_COMP_NEAREST_NEARESTL2B,
   THR_COMP_NEAREST_NEARESTL3B,
   THR_COMP_NEAREST_NEARESTGB,
+  THR_COMP_NEAREST_NEARESTLA2,
+  THR_COMP_NEAREST_NEARESTL2A2,
+  THR_COMP_NEAREST_NEARESTL3A2,
+  THR_COMP_NEAREST_NEARESTGA2,
 #if CONFIG_EXT_COMP_REFS
   THR_COMP_NEAREST_NEARESTLL2,
   THR_COMP_NEAREST_NEARESTLL3,
@@ -164,40 +162,13 @@ typedef enum {
 #endif  // CONFIG_EXT_COMP_REFS
 #endif  // CONFIG_EXT_REFS
 
-#else  // CONFIG_EXT_INTER
-
-  THR_COMP_NEARESTLA,
-#if CONFIG_EXT_REFS
-  THR_COMP_NEARESTL2A,
-  THR_COMP_NEARESTL3A,
-#endif  // CONFIG_EXT_REFS
-  THR_COMP_NEARESTGA,
-#if CONFIG_EXT_REFS
-  THR_COMP_NEARESTLB,
-  THR_COMP_NEARESTL2B,
-  THR_COMP_NEARESTL3B,
-  THR_COMP_NEARESTGB,
-#if CONFIG_EXT_COMP_REFS
-  THR_COMP_NEARESTLL2,
-  THR_COMP_NEARESTLL3,
-  THR_COMP_NEARESTLG,
-  THR_COMP_NEARESTBA,
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
-
-#endif  // CONFIG_EXT_INTER
-
   THR_TM,
 
-#if CONFIG_ALT_INTRA
   THR_SMOOTH,
 #if CONFIG_SMOOTH_HV
   THR_SMOOTH_V,
   THR_SMOOTH_H,
 #endif  // CONFIG_SMOOTH_HV
-#endif  // CONFIG_ALT_INTRA
-
-#if CONFIG_EXT_INTER
 
   THR_COMP_NEAR_NEARLA,
   THR_COMP_NEW_NEARESTLA,
@@ -266,6 +237,38 @@ typedef enum {
   THR_COMP_NEW_NEWGB,
   THR_COMP_ZERO_ZEROGB,
 
+  THR_COMP_NEAR_NEARLA2,
+  THR_COMP_NEW_NEARESTLA2,
+  THR_COMP_NEAREST_NEWLA2,
+  THR_COMP_NEW_NEARLA2,
+  THR_COMP_NEAR_NEWLA2,
+  THR_COMP_NEW_NEWLA2,
+  THR_COMP_ZERO_ZEROLA2,
+
+  THR_COMP_NEAR_NEARL2A2,
+  THR_COMP_NEW_NEARESTL2A2,
+  THR_COMP_NEAREST_NEWL2A2,
+  THR_COMP_NEW_NEARL2A2,
+  THR_COMP_NEAR_NEWL2A2,
+  THR_COMP_NEW_NEWL2A2,
+  THR_COMP_ZERO_ZEROL2A2,
+
+  THR_COMP_NEAR_NEARL3A2,
+  THR_COMP_NEW_NEARESTL3A2,
+  THR_COMP_NEAREST_NEWL3A2,
+  THR_COMP_NEW_NEARL3A2,
+  THR_COMP_NEAR_NEWL3A2,
+  THR_COMP_NEW_NEWL3A2,
+  THR_COMP_ZERO_ZEROL3A2,
+
+  THR_COMP_NEAR_NEARGA2,
+  THR_COMP_NEW_NEARESTGA2,
+  THR_COMP_NEAREST_NEWGA2,
+  THR_COMP_NEW_NEARGA2,
+  THR_COMP_NEAR_NEWGA2,
+  THR_COMP_NEW_NEWGA2,
+  THR_COMP_ZERO_ZEROGA2,
+
 #if CONFIG_EXT_COMP_REFS
   THR_COMP_NEAR_NEARLL2,
   THR_COMP_NEW_NEARESTLL2,
@@ -301,64 +304,6 @@ typedef enum {
 #endif  // CONFIG_EXT_COMP_REFS
 #endif  // CONFIG_EXT_REFS
 
-#else  // CONFIG_EXT_INTER
-
-  THR_COMP_NEARLA,
-  THR_COMP_NEWLA,
-#if CONFIG_EXT_REFS
-  THR_COMP_NEARL2A,
-  THR_COMP_NEWL2A,
-  THR_COMP_NEARL3A,
-  THR_COMP_NEWL3A,
-#endif  // CONFIG_EXT_REFS
-  THR_COMP_NEARGA,
-  THR_COMP_NEWGA,
-
-#if CONFIG_EXT_REFS
-  THR_COMP_NEARLB,
-  THR_COMP_NEWLB,
-  THR_COMP_NEARL2B,
-  THR_COMP_NEWL2B,
-  THR_COMP_NEARL3B,
-  THR_COMP_NEWL3B,
-  THR_COMP_NEARGB,
-  THR_COMP_NEWGB,
-
-#if CONFIG_EXT_COMP_REFS
-  THR_COMP_NEARLL2,
-  THR_COMP_NEWLL2,
-  THR_COMP_NEARLL3,
-  THR_COMP_NEWLL3,
-  THR_COMP_NEARLG,
-  THR_COMP_NEWLG,
-  THR_COMP_NEARBA,
-  THR_COMP_NEWBA,
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
-
-  THR_COMP_ZEROLA,
-#if CONFIG_EXT_REFS
-  THR_COMP_ZEROL2A,
-  THR_COMP_ZEROL3A,
-#endif  // CONFIG_EXT_REFS
-  THR_COMP_ZEROGA,
-
-#if CONFIG_EXT_REFS
-  THR_COMP_ZEROLB,
-  THR_COMP_ZEROL2B,
-  THR_COMP_ZEROL3B,
-  THR_COMP_ZEROGB,
-
-#if CONFIG_EXT_COMP_REFS
-  THR_COMP_ZEROLL2,
-  THR_COMP_ZEROLL3,
-  THR_COMP_ZEROLG,
-  THR_COMP_ZEROBA,
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
-
-#endif  // CONFIG_EXT_INTER
-
   THR_H_PRED,
   THR_V_PRED,
   THR_D135_PRED,
@@ -368,7 +313,6 @@ typedef enum {
   THR_D117_PRED,
   THR_D45_PRED,
 
-#if CONFIG_EXT_INTER
   THR_COMP_INTERINTRA_ZEROL,
   THR_COMP_INTERINTRA_NEARESTL,
   THR_COMP_INTERINTRA_NEARL,
@@ -396,13 +340,17 @@ typedef enum {
   THR_COMP_INTERINTRA_NEARESTB,
   THR_COMP_INTERINTRA_NEARB,
   THR_COMP_INTERINTRA_NEWB,
+
+  THR_COMP_INTERINTRA_ZEROA2,
+  THR_COMP_INTERINTRA_NEARESTA2,
+  THR_COMP_INTERINTRA_NEARA2,
+  THR_COMP_INTERINTRA_NEWA2,
 #endif  // CONFIG_EXT_REFS
 
   THR_COMP_INTERINTRA_ZEROA,
   THR_COMP_INTERINTRA_NEARESTA,
   THR_COMP_INTERINTRA_NEARA,
   THR_COMP_INTERINTRA_NEWA,
-#endif  // CONFIG_EXT_INTER
   MAX_MODES
 } THR_MODES;
 
@@ -412,6 +360,7 @@ typedef enum {
   THR_LAST2,
   THR_LAST3,
   THR_BWDR,
+  THR_ALTR2,
 #endif  // CONFIG_EXT_REFS
   THR_GOLD,
   THR_ALTR,
@@ -428,9 +377,16 @@ typedef enum {
   THR_COMP_L2B,
   THR_COMP_L3B,
   THR_COMP_GB,
+
+  THR_COMP_LA2,
+  THR_COMP_L2A2,
+  THR_COMP_L3A2,
+  THR_COMP_GA2,
 #endif  // CONFIG_EXT_REFS
 
   THR_INTRA,
+
+  MAX_REFS
 } THR_MODES_SUB8X8;
 
 typedef struct RD_OPT {
@@ -458,10 +414,8 @@ static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {
   rd_stats->sse = 0;
   rd_stats->skip = 1;
   rd_stats->zero_rate = 0;
+  rd_stats->invalid_rate = 0;
   rd_stats->ref_rdcost = INT64_MAX;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  rd_stats->dist_y = 0;
-#endif
 #if CONFIG_RD_DEBUG
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats->txb_coeff_cost[plane] = 0;
@@ -487,10 +441,8 @@ static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {
   rd_stats->sse = INT64_MAX;
   rd_stats->skip = 0;
   rd_stats->zero_rate = 0;
+  rd_stats->invalid_rate = 1;
   rd_stats->ref_rdcost = INT64_MAX;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  rd_stats->dist_y = INT64_MAX;
-#endif
 #if CONFIG_RD_DEBUG
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats->txb_coeff_cost[plane] = INT_MAX;
@@ -515,9 +467,7 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
   rd_stats_dst->dist += rd_stats_src->dist;
   rd_stats_dst->sse += rd_stats_src->sse;
   rd_stats_dst->skip &= rd_stats_src->skip;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  rd_stats_dst->dist_y += rd_stats_src->dist_y;
-#endif
+  rd_stats_dst->invalid_rate &= rd_stats_src->invalid_rate;
 #if CONFIG_RD_DEBUG
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
     rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane];
@@ -539,6 +489,16 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
 #endif
 }
 
+static INLINE int av1_get_coeff_token_cost(int token, int eob_val, int is_first,
+                                           const int *head_cost_table,
+                                           const int *tail_cost_table) {
+  if (eob_val == LAST_EOB) return av1_cost_zero(128);
+  // Head index: token capped at TWO_TOKEN, adjusted by eob_val and is_first.
+  const int head_idx = 2 * AOMMIN(token, TWO_TOKEN) - eob_val + is_first;
+  const int tail = token > ONE_TOKEN ? tail_cost_table[token - TWO_TOKEN] : 0;
+  return head_cost_table[head_idx] + tail;
+}
+
 struct TileInfo;
 struct TileDataEnc;
 struct AV1_COMP;
@@ -554,7 +514,8 @@ void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
 void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
                                   unsigned int qstep, int *rate, int64_t *dist);
 
-int av1_get_switchable_rate(const struct AV1_COMP *cpi, const MACROBLOCKD *xd);
+int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
+                            const MACROBLOCKD *xd);
 
 int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
                             int stride);
@@ -583,9 +544,6 @@ void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
                                int (*fact)[MAX_MODES], int rd_thresh, int bsize,
                                int best_mode_index);
 
-void av1_fill_token_costs(av1_coeff_cost *c,
-                          av1_coeff_probs_model (*p)[PLANE_TYPES]);
-
 static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
                                       int thresh_fact) {
   return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
@@ -609,6 +567,16 @@ void av1_setup_pred_block(const MACROBLOCKD *xd,
 int av1_get_intra_cost_penalty(int qindex, int qdelta,
                                aom_bit_depth_t bit_depth);
 
+void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
+                         FRAME_CONTEXT *fc);
+
+#if CONFIG_LV_MAP
+void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc);
+#endif
+
+void av1_fill_token_costs_from_cdf(av1_coeff_cost *cost,
+                                   coeff_cdf_model (*cdf)[PLANE_TYPES]);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c
index 43b00b83b..607db9b86 100644
--- a/third_party/aom/av1/encoder/rdopt.c
+++ b/third_party/aom/av1/encoder/rdopt.c
@@ -21,12 +21,16 @@
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
 
+#if CONFIG_CFL
+#include "av1/common/cfl.h"
+#endif
 #include "av1/common/common.h"
 #include "av1/common/common_data.h"
 #include "av1/common/entropy.h"
 #include "av1/common/entropymode.h"
 #include "av1/common/idct.h"
 #include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
 #include "av1/common/pred_common.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/reconinter.h"
@@ -51,19 +55,15 @@
 #endif
 #include "av1/encoder/hybrid_fwd_txfm.h"
 #include "av1/encoder/mcomp.h"
-#if CONFIG_PALETTE
 #include "av1/encoder/palette.h"
-#endif  // CONFIG_PALETTE
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/tokenize.h"
 #if CONFIG_PVQ
 #include "av1/encoder/pvq_encoder.h"
-#endif  // CONFIG_PVQ
-#if CONFIG_PVQ || CONFIG_DAALA_DIST
 #include "av1/common/pvq.h"
-#endif  // CONFIG_PVQ || CONFIG_DIST_8X8
+#endif  // CONFIG_PVQ
 #if CONFIG_DUAL_FILTER
 #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
 #if USE_EXTRA_FILTER
@@ -82,26 +82,36 @@ static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = {
 
 #if CONFIG_EXT_REFS
 
-#define LAST_FRAME_MODE_MASK                                      \
-  ((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | \
-   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
-#define LAST2_FRAME_MODE_MASK                                    \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST3_FRAME) | \
-   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
-#define LAST3_FRAME_MODE_MASK                                    \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
-   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
-#define GOLDEN_FRAME_MODE_MASK                                   \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
-   (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
-#define BWDREF_FRAME_MODE_MASK                                   \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
-   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME))
-#define ALTREF_FRAME_MODE_MASK                                   \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
-   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME))
-
-#else
+#define LAST_FRAME_MODE_MASK                                          \
+  ((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |     \
+   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \
+   (1 << ALTREF_FRAME))
+#define LAST2_FRAME_MODE_MASK                                         \
+  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST3_FRAME) |      \
+   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \
+   (1 << ALTREF_FRAME))
+#define LAST3_FRAME_MODE_MASK                                         \
+  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |      \
+   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \
+   (1 << ALTREF_FRAME))
+#define GOLDEN_FRAME_MODE_MASK                                       \
+  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |     \
+   (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \
+   (1 << ALTREF_FRAME))
+#define BWDREF_FRAME_MODE_MASK                                       \
+  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |     \
+   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << ALTREF2_FRAME) | \
+   (1 << ALTREF_FRAME))
+#define ALTREF2_FRAME_MODE_MASK                                     \
+  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |    \
+   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | \
+   (1 << ALTREF_FRAME))
+#define ALTREF_FRAME_MODE_MASK                                      \
+  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |    \
+   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | \
+   (1 << ALTREF2_FRAME))
+
+#else  // !CONFIG_EXT_REFS
 
 #define LAST_FRAME_MODE_MASK \
   ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
@@ -114,11 +124,12 @@ static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = {
 
 #if CONFIG_EXT_REFS
 #if CONFIG_EXT_COMP_REFS
-#define SECOND_REF_FRAME_MASK                                        \
-  ((1 << ALTREF_FRAME) | (1 << BWDREF_FRAME) | (1 << GOLDEN_FRAME) | \
-   (1 << LAST2_FRAME) | 0x01)  // NOLINT
-#else                          // !CONFIG_EXT_COMP_REFS
-#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | (1 << BWDREF_FRAME) | 0x01)
+#define SECOND_REF_FRAME_MASK                                         \
+  ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | \
+   (1 << GOLDEN_FRAME) | (1 << LAST2_FRAME) | 0x01)
+#else  // !CONFIG_EXT_COMP_REFS
+#define SECOND_REF_FRAME_MASK \
+  ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | 0x01)
 #endif  // CONFIG_EXT_COMP_REFS
 #else   // !CONFIG_EXT_REFS
 #define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01)
@@ -135,10 +146,16 @@ static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = {
 // Setting this to 1 will disable trellis optimization within the
 // transform search. Trellis optimization will still be applied
 // in the final encode.
+#ifndef DISABLE_TRELLISQ_SEARCH
 #define DISABLE_TRELLISQ_SEARCH 0
+#endif
 
-const double ADST_FLIP_SVM[8] = { -6.6623, -2.8062, -3.2531, 3.1671,    // vert
-                                  -7.7051, -3.2234, -3.6193, 3.4533 };  // horz
+static const double ADST_FLIP_SVM[8] = {
+  /* vertical */
+  -6.6623, -2.8062, -3.2531, 3.1671,
+  /* horizontal */
+  -7.7051, -3.2234, -3.6193, 3.4533
+};
 
 typedef struct {
   PREDICTION_MODE mode;
@@ -166,6 +183,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
   { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
   { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } },
 #endif  // CONFIG_EXT_REFS
   { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
   { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
@@ -177,6 +195,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEWMV, { LAST2_FRAME, NONE_FRAME } },
   { NEWMV, { LAST3_FRAME, NONE_FRAME } },
   { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEWMV, { ALTREF2_FRAME, NONE_FRAME } },
 #endif  // CONFIG_EXT_REFS
   { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
   { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
@@ -186,6 +205,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEARMV, { LAST2_FRAME, NONE_FRAME } },
   { NEARMV, { LAST3_FRAME, NONE_FRAME } },
   { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEARMV, { ALTREF2_FRAME, NONE_FRAME } },
 #endif  // CONFIG_EXT_REFS
   { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
   { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
@@ -195,14 +215,13 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { ZEROMV, { LAST2_FRAME, NONE_FRAME } },
   { ZEROMV, { LAST3_FRAME, NONE_FRAME } },
   { ZEROMV, { BWDREF_FRAME, NONE_FRAME } },
+  { ZEROMV, { ALTREF2_FRAME, NONE_FRAME } },
 #endif  // CONFIG_EXT_REFS
   { ZEROMV, { GOLDEN_FRAME, NONE_FRAME } },
   { ZEROMV, { ALTREF_FRAME, NONE_FRAME } },
 
 // TODO(zoeliu): May need to reconsider the order on the modes to check
 
-#if CONFIG_EXT_INTER
-
 #if CONFIG_COMPOUND_SINGLEREF
   // Single ref comp mode
   { SR_NEAREST_NEARMV, { LAST_FRAME, NONE_FRAME } },
@@ -263,6 +282,10 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
   { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
   { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
 
 #if CONFIG_EXT_COMP_REFS
   { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
@@ -272,40 +295,14 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
 #endif  // CONFIG_EXT_COMP_REFS
 #endif  // CONFIG_EXT_REFS
 
-#else  // CONFIG_EXT_INTER
-
-  { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
-#if CONFIG_EXT_REFS
-  { NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_REFS
-  { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-#if CONFIG_EXT_REFS
-  { NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-
-#if CONFIG_EXT_COMP_REFS
-  { NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
-  { NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
-  { NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
-#endif  // CONFIG_EXT_INTER
-
   { TM_PRED, { INTRA_FRAME, NONE_FRAME } },
 
-#if CONFIG_ALT_INTRA
   { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
 #if CONFIG_SMOOTH_HV
   { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
   { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
 #endif  // CONFIG_SMOOTH_HV
-#endif  // CONFIG_ALT_INTRA
 
-#if CONFIG_EXT_INTER
   { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
@@ -373,6 +370,38 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
   { ZERO_ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } },
 
+  { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { ZERO_ZEROMV, { LAST_FRAME, ALTREF2_FRAME } },
+
+  { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { ZERO_ZEROMV, { LAST2_FRAME, ALTREF2_FRAME } },
+
+  { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { ZERO_ZEROMV, { LAST3_FRAME, ALTREF2_FRAME } },
+
+  { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
 #if CONFIG_EXT_COMP_REFS
   { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
@@ -408,64 +437,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
 #endif  // CONFIG_EXT_COMP_REFS
 #endif  // CONFIG_EXT_REFS
 
-#else  // !CONFIG_EXT_INTER
-
-  { NEARMV, { LAST_FRAME, ALTREF_FRAME } },
-  { NEWMV, { LAST_FRAME, ALTREF_FRAME } },
-#if CONFIG_EXT_REFS
-  { NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_REFS
-  { NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-
-#if CONFIG_EXT_REFS
-  { NEARMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEWMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-  { NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-
-#if CONFIG_EXT_COMP_REFS
-  { NEARMV, { LAST_FRAME, LAST2_FRAME } },
-  { NEWMV, { LAST_FRAME, LAST2_FRAME } },
-  { NEARMV, { LAST_FRAME, LAST3_FRAME } },
-  { NEWMV, { LAST_FRAME, LAST3_FRAME } },
-  { NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
-  { NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
-
-  { ZEROMV, { LAST_FRAME, ALTREF_FRAME } },
-#if CONFIG_EXT_REFS
-  { ZEROMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { ZEROMV, { LAST3_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_REFS
-  { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-
-#if CONFIG_EXT_REFS
-  { ZEROMV, { LAST_FRAME, BWDREF_FRAME } },
-  { ZEROMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { ZEROMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-
-#if CONFIG_EXT_COMP_REFS
-  { ZEROMV, { LAST_FRAME, LAST2_FRAME } },
-  { ZEROMV, { LAST_FRAME, LAST3_FRAME } },
-  { ZEROMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { ZEROMV, { BWDREF_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
-
-#endif  // CONFIG_EXT_INTER
-
   { H_PRED, { INTRA_FRAME, NONE_FRAME } },
   { V_PRED, { INTRA_FRAME, NONE_FRAME } },
   { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
@@ -475,7 +446,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { D117_PRED, { INTRA_FRAME, NONE_FRAME } },
   { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
 
-#if CONFIG_EXT_INTER
   { ZEROMV, { LAST_FRAME, INTRA_FRAME } },
   { NEARESTMV, { LAST_FRAME, INTRA_FRAME } },
   { NEARMV, { LAST_FRAME, INTRA_FRAME } },
@@ -503,37 +473,34 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEARESTMV, { BWDREF_FRAME, INTRA_FRAME } },
   { NEARMV, { BWDREF_FRAME, INTRA_FRAME } },
   { NEWMV, { BWDREF_FRAME, INTRA_FRAME } },
+
+  { ZEROMV, { ALTREF2_FRAME, INTRA_FRAME } },
+  { NEARESTMV, { ALTREF2_FRAME, INTRA_FRAME } },
+  { NEARMV, { ALTREF2_FRAME, INTRA_FRAME } },
+  { NEWMV, { ALTREF2_FRAME, INTRA_FRAME } },
 #endif  // CONFIG_EXT_REFS
 
   { ZEROMV, { ALTREF_FRAME, INTRA_FRAME } },
   { NEARESTMV, { ALTREF_FRAME, INTRA_FRAME } },
   { NEARMV, { ALTREF_FRAME, INTRA_FRAME } },
   { NEWMV, { ALTREF_FRAME, INTRA_FRAME } },
-#endif  // CONFIG_EXT_INTER
 };
 
 static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
-  DC_PRED,       H_PRED,        V_PRED,
-#if CONFIG_ALT_INTRA
-  SMOOTH_PRED,
-#endif  // CONFIG_ALT_INTRA
-  TM_PRED,
-#if CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV
+  DC_PRED,       H_PRED,        V_PRED,    SMOOTH_PRED, TM_PRED,
+#if CONFIG_SMOOTH_HV
   SMOOTH_V_PRED, SMOOTH_H_PRED,
-#endif  // CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV
-  D135_PRED,     D207_PRED,     D153_PRED, D63_PRED, D117_PRED, D45_PRED,
+#endif  // CONFIG_SMOOTH_HV
+  D135_PRED,     D207_PRED,     D153_PRED, D63_PRED,    D117_PRED, D45_PRED,
 };
 
 #if CONFIG_CFL
 static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
-  UV_DC_PRED,       UV_H_PRED,        UV_V_PRED,
-#if CONFIG_ALT_INTRA
-  UV_SMOOTH_PRED,
-#endif  // CONFIG_ALT_INTRA
-  UV_TM_PRED,
-#if CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV
+  UV_DC_PRED,       UV_CFL_PRED,      UV_H_PRED,
+  UV_V_PRED,        UV_SMOOTH_PRED,   UV_TM_PRED,
+#if CONFIG_SMOOTH_HV
   UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED,
-#endif  // CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV
+#endif  // CONFIG_SMOOTH_HV
   UV_D135_PRED,     UV_D207_PRED,     UV_D153_PRED,
   UV_D63_PRED,      UV_D117_PRED,     UV_D45_PRED,
 };
@@ -541,7 +508,6 @@ static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
 #define uv_rd_search_mode_order intra_rd_search_mode_order
 #endif  // CONFIG_CFL
 
-#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
 static INLINE int write_uniform_cost(int n, int v) {
   const int l = get_unsigned_bits(n);
   const int m = (1 << l) - n;
@@ -551,7 +517,6 @@ static INLINE int write_uniform_cost(int n, int v) {
   else
     return l * av1_cost_bit(128, 0);
 }
-#endif  // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
 
 // constants for prune 1 and prune 2 decision boundaries
 #define FAST_EXT_TX_CORR_MID 0.0
@@ -559,7 +524,82 @@ static INLINE int write_uniform_cost(int n, int v) {
 #define FAST_EXT_TX_CORR_MARGIN 0.5
 #define FAST_EXT_TX_EDST_MARGIN 0.3
 
-#if CONFIG_DAALA_DIST
+static unsigned pixel_dist_visible_only(
+    const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src,
+    const int src_stride, const uint8_t *dst, const int dst_stride,
+    const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows,
+    int visible_cols) {
+  unsigned sse;
+
+  if (txb_rows == visible_rows && txb_cols == visible_cols
+#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+      && tx_bsize < BLOCK_SIZES
+#endif
+      ) {
+    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+    return sse;
+  }
+#if CONFIG_HIGHBITDEPTH
+  const MACROBLOCKD *xd = &x->e_mbd;
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
+                                             visible_cols, visible_rows);
+    return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2);
+  }
+#else
+  (void)x;
+#endif  // CONFIG_HIGHBITDEPTH
+  sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols,
+                         visible_rows);
+  return sse;
+}
+
+#if CONFIG_DIST_8X8
+static uint64_t cdef_dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
+                                    int sstride, int coeff_shift) {
+  uint64_t svar = 0;
+  uint64_t dvar = 0;
+  uint64_t sum_s = 0;
+  uint64_t sum_d = 0;
+  uint64_t sum_s2 = 0;
+  uint64_t sum_d2 = 0;
+  uint64_t sum_sd = 0;
+  uint64_t dist = 0;
+
+  int i, j;
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 8; j++) {
+      sum_s += src[i * sstride + j];
+      sum_d += dst[i * dstride + j];
+      sum_s2 += src[i * sstride + j] * src[i * sstride + j];
+      sum_d2 += dst[i * dstride + j] * dst[i * dstride + j];
+      sum_sd += src[i * sstride + j] * dst[i * dstride + j];
+    }
+  }
+  /* Compute the variance -- the calculation cannot go negative. */
+  svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
+  dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
+
+  // Tuning of jm's original dering distortion metric used in CDEF tool,
+  // suggested by jm
+  const uint64_t a = 4;
+  const uint64_t b = 2;
+  const uint64_t c1 = (400 * a << 2 * coeff_shift);
+  const uint64_t c2 = (b * 20000 * a * a << 4 * coeff_shift);
+
+  dist =
+      (uint64_t)floor(.5 +
+                      (sum_d2 + sum_s2 - 2 * sum_sd) * .5 * (svar + dvar + c1) /
+                          (sqrt(svar * (double)dvar + c2)));
+
+  // Calibrate dist so that, at the same QP, the rate is similar to that of
+  // MSE-only distortion (as in the master branch)
+  dist = (uint64_t)((float)dist * 0.75);
+
+  return dist;
+}
+
 static int od_compute_var_4x4(uint16_t *x, int stride) {
   int sum;
   int s2;
@@ -617,7 +657,7 @@ static double od_compute_dist_8x8(int use_activity_masking, uint16_t *x,
     }
   }
   /* We use a different variance statistic depending on whether activity
-     masking is used, since the harmonic mean appeared slghtly worse with
+     masking is used, since the harmonic mean appeared slightly worse with
      masking off. The calibration constant just ensures that we preserve the
      rate compared to activity=1. */
   if (use_activity_masking) {
@@ -688,268 +728,241 @@ static double od_compute_dist_common(int activity_masking, uint16_t *x,
 
 static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w,
                               int bsize_h, int qindex) {
-  int i;
-  double sum;
-  sum = 0;
-
   assert(bsize_w >= 8 && bsize_h >= 8);
-
 #if CONFIG_PVQ
   int activity_masking = 1;
 #else
   int activity_masking = 0;
 #endif
-  {
-    int j;
-    DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]);
-    DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
-    DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
-    int mid = OD_DIST_LP_MID;
-    for (i = 0; i < bsize_h; i++) {
-      for (j = 0; j < bsize_w; j++) {
-        e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j];
-      }
+  int i, j;
+  DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
+  for (i = 0; i < bsize_h; i++) {
+    for (j = 0; j < bsize_w; j++) {
+      e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j];
     }
-    for (i = 0; i < bsize_h; i++) {
-      tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
-      tmp[i * bsize_w + bsize_w - 1] =
-          mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
-      for (j = 1; j < bsize_w - 1; j++) {
-        tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] +
-                               e[i * bsize_w + j - 1] + e[i * bsize_w + j + 1];
-      }
+  }
+  int mid = OD_DIST_LP_MID;
+  for (i = 0; i < bsize_h; i++) {
+    tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
+    tmp[i * bsize_w + bsize_w - 1] =
+        mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
+    for (j = 1; j < bsize_w - 1; j++) {
+      tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + e[i * bsize_w + j - 1] +
+                             e[i * bsize_w + j + 1];
     }
-    sum = od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
-                                 qindex, tmp, e_lp);
   }
-  return sum;
+  return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
+                                qindex, tmp, e_lp);
 }
 
 static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w,
                                    int bsize_h, int qindex) {
-  int i;
-  double sum;
-  sum = 0;
-
   assert(bsize_w >= 8 && bsize_h >= 8);
-
 #if CONFIG_PVQ
   int activity_masking = 1;
 #else
   int activity_masking = 0;
 #endif
-  {
-    int j;
-    DECLARE_ALIGNED(16, uint16_t, y[MAX_TX_SQUARE]);
-    DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
-    DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
-    int mid = OD_DIST_LP_MID;
-    for (i = 0; i < bsize_h; i++) {
-      for (j = 0; j < bsize_w; j++) {
-        y[i * bsize_w + j] = x[i * bsize_w + j] - e[i * bsize_w + j];
-      }
+  DECLARE_ALIGNED(16, uint16_t, y[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
+  int i, j;
+  for (i = 0; i < bsize_h; i++) {
+    for (j = 0; j < bsize_w; j++) {
+      y[i * bsize_w + j] = x[i * bsize_w + j] - e[i * bsize_w + j];
     }
-    for (i = 0; i < bsize_h; i++) {
-      tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
-      tmp[i * bsize_w + bsize_w - 1] =
-          mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
-      for (j = 1; j < bsize_w - 1; j++) {
-        tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] +
-                               e[i * bsize_w + j - 1] + e[i * bsize_w + j + 1];
-      }
+  }
+  int mid = OD_DIST_LP_MID;
+  for (i = 0; i < bsize_h; i++) {
+    tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
+    tmp[i * bsize_w + bsize_w - 1] =
+        mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
+    for (j = 1; j < bsize_w - 1; j++) {
+      tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + e[i * bsize_w + j - 1] +
+                             e[i * bsize_w + j + 1];
     }
-    sum = od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
-                                 qindex, tmp, e_lp);
   }
-  return sum;
+  return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
+                                qindex, tmp, e_lp);
 }
-#endif  // CONFIG_DAALA_DIST
 
-#if CONFIG_DIST_8X8
-#define NEW_FUTURE_DIST 0
-int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCKD *xd,
+int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
                      const uint8_t *src, int src_stride, const uint8_t *dst,
                      int dst_stride, const BLOCK_SIZE tx_bsize, int bsw,
                      int bsh, int visible_w, int visible_h, int qindex) {
   int64_t d = 0;
-
-#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST
   int i, j;
+  const MACROBLOCKD *xd = &x->e_mbd;
 
   DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]);
   DECLARE_ALIGNED(16, uint16_t, rec[MAX_TX_SQUARE]);
-  (void)cpi;
-  (void)tx_bsize;
-#endif  // CONFIG_DAALA_DIST || NEW_FUTURE_DIST
 
-#if !CONFIG_HIGHBITDEPTH
-  (void)xd;
-#endif
+  assert(bsw >= 8);
+  assert(bsh >= 8);
+  assert((bsw & 0x07) == 0);
+  assert((bsh & 0x07) == 0);
 
-#if !CONFIG_DAALA_DIST
-  (void)qindex;
-#endif
-
-#if !CONFIG_DAALA_DIST || !NEW_FUTURE_DIST
-  (void)xd;
-  (void)bsw, (void)bsh;
-  (void)visible_w, (void)visible_h;
-#endif
-
-#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST
+  if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
+      x->tune_metric == AOM_TUNE_DAALA_DIST) {
 #if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    for (j = 0; j < bsh; j++)
-      for (i = 0; i < bsw; i++)
-        orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
-
-    if ((bsw == visible_w) && (bsh == visible_h)) {
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       for (j = 0; j < bsh; j++)
         for (i = 0; i < bsw; i++)
-          rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
-    } else {
-      for (j = 0; j < visible_h; j++)
-        for (i = 0; i < visible_w; i++)
-          rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
+          orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
 
-      if (visible_w < bsw) {
+      if ((bsw == visible_w) && (bsh == visible_h)) {
         for (j = 0; j < bsh; j++)
-          for (i = visible_w; i < bsw; i++)
-            rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
-      }
-
-      if (visible_h < bsh) {
-        for (j = visible_h; j < bsh; j++)
           for (i = 0; i < bsw; i++)
-            rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
+            rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
+      } else {
+        for (j = 0; j < visible_h; j++)
+          for (i = 0; i < visible_w; i++)
+            rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
+
+        if (visible_w < bsw) {
+          for (j = 0; j < bsh; j++)
+            for (i = visible_w; i < bsw; i++)
+              rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
+        }
+
+        if (visible_h < bsh) {
+          for (j = visible_h; j < bsh; j++)
+            for (i = 0; i < bsw; i++)
+              rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
+        }
       }
-    }
-  } else {
+    } else {
 #endif
-    for (j = 0; j < bsh; j++)
-      for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
-
-    if ((bsw == visible_w) && (bsh == visible_h)) {
       for (j = 0; j < bsh; j++)
-        for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i];
-    } else {
-      for (j = 0; j < visible_h; j++)
-        for (i = 0; i < visible_w; i++)
-          rec[j * bsw + i] = dst[j * dst_stride + i];
+        for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
 
-      if (visible_w < bsw) {
+      if ((bsw == visible_w) && (bsh == visible_h)) {
         for (j = 0; j < bsh; j++)
-          for (i = visible_w; i < bsw; i++)
-            rec[j * bsw + i] = src[j * src_stride + i];
-      }
+          for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i];
+      } else {
+        for (j = 0; j < visible_h; j++)
+          for (i = 0; i < visible_w; i++)
+            rec[j * bsw + i] = dst[j * dst_stride + i];
+
+        if (visible_w < bsw) {
+          for (j = 0; j < bsh; j++)
+            for (i = visible_w; i < bsw; i++)
+              rec[j * bsw + i] = src[j * src_stride + i];
+        }
 
-      if (visible_h < bsh) {
-        for (j = visible_h; j < bsh; j++)
-          for (i = 0; i < bsw; i++) rec[j * bsw + i] = src[j * src_stride + i];
+        if (visible_h < bsh) {
+          for (j = visible_h; j < bsh; j++)
+            for (i = 0; i < bsw; i++)
+              rec[j * bsw + i] = src[j * src_stride + i];
+        }
       }
-    }
 #if CONFIG_HIGHBITDEPTH
-  }
+    }
 #endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_DAALA_DIST || NEW_FUTURE_DIST
+  }
 
-#if CONFIG_DAALA_DIST
-  d = (int64_t)od_compute_dist(orig, rec, bsw, bsh, qindex);
-#elif NEW_FUTURE_DIST
-  // Call new 8x8-wise distortion function here, for example
-  for (i = 0; i < bsh; i += 8) {
-    for (j = 0; j < bsw; j += 8) {
-      d +=
-          av1_compute_dist_8x8(&orig[i * bsw + j], &rec[i * bsw + j], bsw, bsh);
+  if (x->tune_metric == AOM_TUNE_DAALA_DIST) {
+    d = (int64_t)od_compute_dist(orig, rec, bsw, bsh, qindex);
+  } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) {
+    int coeff_shift = AOMMAX(xd->bd - 8, 0);
+
+    for (i = 0; i < bsh; i += 8) {
+      for (j = 0; j < bsw; j += 8) {
+        d += cdef_dist_8x8_16bit(&rec[i * bsw + j], bsw, &orig[i * bsw + j],
+                                 bsw, coeff_shift);
+      }
     }
+#if CONFIG_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      d = ((uint64_t)d) >> 2 * coeff_shift;
+#endif
+  } else {
+    // Otherwise, MSE by default
+    d = pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride,
+                                tx_bsize, bsh, bsw, visible_h, visible_w);
   }
-#else
-  // Otherwise, MSE by default
-  unsigned sse;
-  // TODO(Any): Use even faster function which does not calculate variance
-  cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
-  d = sse;
-#endif  // CONFIG_DAALA_DIST
 
   return d;
 }
 
-static int64_t av1_dist_8x8_diff(const MACROBLOCKD *xd, const uint8_t *src,
+static int64_t av1_dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
                                  int src_stride, const int16_t *diff,
                                  int diff_stride, int bsw, int bsh,
                                  int visible_w, int visible_h, int qindex) {
   int64_t d = 0;
-
-#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST
   int i, j;
+  const MACROBLOCKD *xd = &x->e_mbd;
 
   DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]);
   DECLARE_ALIGNED(16, int16_t, diff16[MAX_TX_SQUARE]);
-#endif  // CONFIG_DAALA_DIST || NEW_FUTURE_DIST
 
-#if !CONFIG_HIGHBITDEPTH
-  (void)xd;
-#endif
+  assert(bsw >= 8);
+  assert(bsh >= 8);
+  assert((bsw & 0x07) == 0);
+  assert((bsh & 0x07) == 0);
 
-#if !CONFIG_DAALA_DIST
-  (void)qindex;
-#endif
-
-#if !CONFIG_DAALA_DIST || !NEW_FUTURE_DIST
-  (void)xd;
-  (void)src, (void)src_stride;
-  (void)bsw, (void)bsh;
-  (void)visible_w, (void)visible_h;
-#endif
-
-#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST
+  if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
+      x->tune_metric == AOM_TUNE_DAALA_DIST) {
 #if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    for (j = 0; j < bsh; j++)
-      for (i = 0; i < bsw; i++)
-        orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
-  } else {
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      for (j = 0; j < bsh; j++)
+        for (i = 0; i < bsw; i++)
+          orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
+    } else {
 #endif
-    for (j = 0; j < bsh; j++)
-      for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
+      for (j = 0; j < bsh; j++)
+        for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
 #if CONFIG_HIGHBITDEPTH
-  }
+    }
 #endif  // CONFIG_HIGHBITDEPTH
 
-  if ((bsw == visible_w) && (bsh == visible_h)) {
-    for (j = 0; j < bsh; j++)
-      for (i = 0; i < bsw; i++) diff16[j * bsw + i] = diff[j * diff_stride + i];
-  } else {
-    for (j = 0; j < visible_h; j++)
-      for (i = 0; i < visible_w; i++)
-        diff16[j * bsw + i] = diff[j * diff_stride + i];
-
-    if (visible_w < bsw) {
+    if ((bsw == visible_w) && (bsh == visible_h)) {
       for (j = 0; j < bsh; j++)
-        for (i = visible_w; i < bsw; i++) diff16[j * bsw + i] = 0;
-    }
+        for (i = 0; i < bsw; i++)
+          diff16[j * bsw + i] = diff[j * diff_stride + i];
+    } else {
+      for (j = 0; j < visible_h; j++)
+        for (i = 0; i < visible_w; i++)
+          diff16[j * bsw + i] = diff[j * diff_stride + i];
 
-    if (visible_h < bsh) {
-      for (j = visible_h; j < bsh; j++)
-        for (i = 0; i < bsw; i++) diff16[j * bsw + i] = 0;
+      if (visible_w < bsw) {
+        for (j = 0; j < bsh; j++)
+          for (i = visible_w; i < bsw; i++) diff16[j * bsw + i] = 0;
+      }
+
+      if (visible_h < bsh) {
+        for (j = visible_h; j < bsh; j++)
+          for (i = 0; i < bsw; i++) diff16[j * bsw + i] = 0;
+      }
     }
   }
-#endif  // CONFIG_DAALA_DIST || NEW_FUTURE_DIST
 
-#if CONFIG_DAALA_DIST
-  d = (int64_t)od_compute_dist_diff(orig, diff16, bsw, bsh, qindex);
-#elif NEW_FUTURE_DIST
-  // Call new 8x8-wise distortion function (with diff inpu) here, for example
-  for (i = 0; i < bsh; i += 8) {
-    for (j = 0; j < bsw; j += 8) {
-      d += av1_compute_dist_8x8_diff(&orig[i * bsw + j], &diff16[i * bsw + j],
-                                     bsw, bsh);
+  if (x->tune_metric == AOM_TUNE_DAALA_DIST) {
+    d = (int64_t)od_compute_dist_diff(orig, diff16, bsw, bsh, qindex);
+  } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) {
+    int coeff_shift = AOMMAX(xd->bd - 8, 0);
+    DECLARE_ALIGNED(16, uint16_t, dst16[MAX_TX_SQUARE]);
+
+    for (i = 0; i < bsh; i++) {
+      for (j = 0; j < bsw; j++) {
+        dst16[i * bsw + j] = orig[i * bsw + j] - diff16[i * bsw + j];
+      }
     }
+
+    for (i = 0; i < bsh; i += 8) {
+      for (j = 0; j < bsw; j += 8) {
+        d += cdef_dist_8x8_16bit(&dst16[i * bsw + j], bsw, &orig[i * bsw + j],
+                                 bsw, coeff_shift);
+      }
+    }
+    // Don't scale 'd' for HBD here: for diff input, the caller does the
+    // scaling.
+  } else {
+    // Otherwise, MSE by default
+    d = aom_sum_squares_2d_i16(diff, diff_stride, visible_w, visible_h);
   }
-#else
-  // Otherwise, MSE by default
-  d = aom_sum_squares_2d_i16(diff, diff_stride, bsw, bsh);
-#endif  // CONFIG_DAALA_DIST
 
   return d;
 }
@@ -1169,6 +1182,17 @@ static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
                           pd->dst.stride);
 }
 
+#if CONFIG_EXT_TX
+// 1D transforms used in each inter set; this table must be updated whenever
+// ext_tx_used_inter is changed.
+static const int ext_tx_used_inter_1D[EXT_TX_SETS_INTER][TX_TYPES_1D] = {
+  { 1, 0, 0, 0 }, { 1, 1, 1, 1 }, { 1, 1, 1, 1 }, { 1, 0, 0, 1 },
+#if CONFIG_MRC_TX
+  { 1, 0, 0, 1 },
+#endif  // CONFIG_MRC_TX
+};
+#endif  // CONFIG_EXT_TX
+
 static int prune_tx_types(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
                           const MACROBLOCKD *const xd, int tx_set) {
 #if CONFIG_EXT_TX
@@ -1392,22 +1416,18 @@ static int64_t av1_block_error2_c(const tran_low_t *coeff,
                                   const tran_low_t *ref, intptr_t block_size,
                                   int64_t *ssz) {
   int64_t error;
+  int64_t ssz_trash;
   // Use the existing sse codes for calculating distortion of decoded signal:
   // i.e. (orig - decoded)^2
-  error = av1_block_error_fp(coeff, dqcoeff, block_size);
+  error = av1_block_error(coeff, dqcoeff, block_size, &ssz_trash);
   // prediction residue^2 = (orig - ref)^2
-  *ssz = av1_block_error_fp(coeff, ref, block_size);
+  *ssz = av1_block_error(coeff, ref, block_size, &ssz_trash);
   return error;
 }
 #endif  // CONFIG_HIGHBITDEPTH
 #endif  // CONFIG_PVQ
 
 #if !CONFIG_PVQ || CONFIG_VAR_TX
-/* The trailing '0' is a terminator which is used inside av1_cost_coeffs() to
- * decide whether to include cost of a trailing EOB node or not (i.e. we
- * can skip this if the last coefficient in this transform block, e.g. the
- * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
- * were non-zero). */
 #if !CONFIG_LV_MAP
 static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
                        int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order,
@@ -1421,17 +1441,19 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
   const uint16_t *band_count = &band_count_table[tx_size][1];
   const int eob = p->eobs[block];
   const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  const int tx_size_ctx = txsize_sqr_map[tx_size];
-  unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
-      x->token_costs[tx_size_ctx][type][is_inter_block(mbmi)];
+  const TX_SIZE tx_size_ctx = txsize_sqr_map[tx_size];
   uint8_t token_cache[MAX_TX_SQUARE];
   int pt = combine_entropy_contexts(*a, *l);
   int c, cost;
   const int16_t *scan = scan_order->scan;
   const int16_t *nb = scan_order->neighbors;
   const int ref = is_inter_block(mbmi);
-  aom_prob *blockz_probs =
-      cm->fc->blockzero_probs[txsize_sqr_map[tx_size]][type][ref];
+  int(*head_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] =
+      x->token_head_costs[tx_size_ctx][type][ref];
+  int(*tail_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] =
+      x->token_tail_costs[tx_size_ctx][type][ref];
+  const int seg_eob = av1_get_tx_eob(&cm->seg, mbmi->segment_id, tx_size);
+  int eob_val;
 
 #if CONFIG_HIGHBITDEPTH
   const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
@@ -1446,8 +1468,8 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
   (void)cm;
 
   if (eob == 0) {
-    // single eob token
-    cost = av1_cost_bit(blockz_probs[pt], 0);
+    // block zero
+    cost = (*head_token_costs)[pt][0];
   } else {
     if (use_fast_coef_costing) {
       int band_left = *band_count++;
@@ -1456,10 +1478,13 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
       int v = qcoeff[0];
       int16_t prev_t;
       cost = av1_get_token_cost(v, &prev_t, cat6_bits);
-      cost += (*token_costs)[!prev_t][pt][prev_t];
+      eob_val = (eob == 1) ? EARLY_EOB : NO_EOB;
+      cost += av1_get_coeff_token_cost(
+          prev_t, eob_val, 1, (*head_token_costs)[pt], (*tail_token_costs)[pt]);
 
       token_cache[0] = av1_pt_energy_class[prev_t];
-      ++token_costs;
+      ++head_token_costs;
+      ++tail_token_costs;
 
       // ac tokens
       for (c = 1; c < eob; c++) {
@@ -1468,17 +1493,18 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
 
         v = qcoeff[rc];
         cost += av1_get_token_cost(v, &t, cat6_bits);
-        cost += (*token_costs)[!t][!prev_t][t];
+        eob_val =
+            (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
+        cost += av1_get_coeff_token_cost(t, eob_val, 0,
+                                         (*head_token_costs)[!prev_t],
+                                         (*tail_token_costs)[!prev_t]);
         prev_t = t;
         if (!--band_left) {
           band_left = *band_count++;
-          ++token_costs;
+          ++head_token_costs;
+          ++tail_token_costs;
         }
       }
-
-      // eob token
-      cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
-
     } else {  // !use_fast_coef_costing
       int band_left = *band_count++;
 
@@ -1486,10 +1512,13 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
       int v = qcoeff[0];
       int16_t tok;
       cost = av1_get_token_cost(v, &tok, cat6_bits);
-      cost += (*token_costs)[!tok][pt][tok];
+      eob_val = (eob == 1) ? EARLY_EOB : NO_EOB;
+      cost += av1_get_coeff_token_cost(tok, eob_val, 1, (*head_token_costs)[pt],
+                                       (*tail_token_costs)[pt]);
 
       token_cache[0] = av1_pt_energy_class[tok];
-      ++token_costs;
+      ++head_token_costs;
+      ++tail_token_costs;
 
       // ac tokens
       for (c = 1; c < eob; c++) {
@@ -1498,17 +1527,17 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
         v = qcoeff[rc];
         cost += av1_get_token_cost(v, &tok, cat6_bits);
         pt = get_coef_context(nb, token_cache, c);
-        cost += (*token_costs)[!tok][pt][tok];
+        eob_val =
+            (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
+        cost += av1_get_coeff_token_cost(
+            tok, eob_val, 0, (*head_token_costs)[pt], (*tail_token_costs)[pt]);
         token_cache[rc] = av1_pt_energy_class[tok];
         if (!--band_left) {
           band_left = *band_count++;
-          ++token_costs;
+          ++head_token_costs;
+          ++tail_token_costs;
         }
       }
-
-      // eob token
-      pt = get_coef_context(nb, token_cache, c);
-      cost += (*token_costs)[0][pt][EOB_TOKEN];
     }
   }
 
@@ -1520,10 +1549,25 @@ int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
                     int blk_row, int blk_col, int block, TX_SIZE tx_size,
                     const SCAN_ORDER *scan_order, const ENTROPY_CONTEXT *a,
                     const ENTROPY_CONTEXT *l, int use_fast_coef_costing) {
+  const AV1_COMMON *const cm = &cpi->common;
 #if !CONFIG_LV_MAP
   (void)blk_row;
   (void)blk_col;
-  const AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_MRC_TX
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const TX_TYPE tx_type = av1_get_tx_type(xd->plane[plane].plane_type, xd,
+                                          blk_row, blk_col, block, tx_size);
+  const int is_inter = is_inter_block(mbmi);
+  if (tx_type == MRC_DCT && ((is_inter && SIGNAL_MRC_MASK_INTER) ||
+                             (!is_inter && SIGNAL_MRC_MASK_INTRA))) {
+    const int mrc_mask_cost =
+        av1_cost_color_map(x, plane, block, mbmi->sb_type, tx_size, MRC_MAP);
+    return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l,
+                       use_fast_coef_costing) +
+           mrc_mask_cost;
+  }
+#endif  // CONFIG_MRC_TX
   return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l,
                      use_fast_coef_costing);
 #else  // !CONFIG_LV_MAP
@@ -1545,7 +1589,7 @@ int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
 
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
-  return av1_cost_coeffs_txb(cpi, x, plane, blk_row, blk_col, block, tx_size,
+  return av1_cost_coeffs_txb(cm, x, plane, blk_row, blk_col, block, tx_size,
                              &txb_ctx);
 #endif  // !CONFIG_LV_MAP
 }
@@ -1600,31 +1644,16 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
   assert(visible_cols > 0);
 
 #if CONFIG_DIST_8X8
-  if (plane == 0 && txb_cols >= 8 && txb_rows >= 8)
-    return av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride, tx_bsize,
-                        txb_cols, txb_rows, visible_cols, visible_rows,
-                        x->qindex);
+  if (x->using_dist_8x8 && plane == 0 && txb_cols >= 8 && txb_rows >= 8)
+    return (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
+                                  tx_bsize, txb_cols, txb_rows, visible_cols,
+                                  visible_rows, x->qindex);
 #endif  // CONFIG_DIST_8X8
 
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-  if ((txb_rows == visible_rows && txb_cols == visible_cols) &&
-      tx_bsize < BLOCK_SIZES) {
-#else
-  if (txb_rows == visible_rows && txb_cols == visible_cols) {
-#endif
-    unsigned sse;
-    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
-    return sse;
-  }
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    uint64_t sse = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
-                                           visible_cols, visible_rows);
-    return (unsigned int)ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-  unsigned sse = aom_sse_odd_size(src, src_stride, dst, dst_stride,
-                                  visible_cols, visible_rows);
+  unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst,
+                                         dst_stride, tx_bsize, txb_rows,
+                                         txb_cols, visible_rows, visible_cols);
+
   return sse;
 }
 
@@ -1649,8 +1678,8 @@ static int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
                      NULL, &visible_cols, &visible_rows);
 
 #if CONFIG_DIST_8X8
-  if (plane == 0 && txb_width >= 8 && txb_height >= 8)
-    return av1_dist_8x8_diff(xd, src, src_stride, diff, diff_stride, txb_width,
+  if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8)
+    return av1_dist_8x8_diff(x, src, src_stride, diff, diff_stride, txb_width,
                              txb_height, visible_cols, visible_rows, x->qindex);
   else
 #endif
@@ -1658,7 +1687,6 @@ static int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
                                   visible_rows);
 }
 
-#if CONFIG_PALETTE || CONFIG_INTRABC
 int av1_count_colors(const uint8_t *src, int stride, int rows, int cols) {
   int val_count[256];
   memset(val_count, 0, sizeof(val_count));
@@ -1693,7 +1721,6 @@ int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
   return n;
 }
 #endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_PALETTE || CONFIG_INTRABC
 
 void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
                     BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col,
@@ -1707,7 +1734,11 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   const struct macroblockd_plane *const pd = &xd->plane[plane];
 #endif  // CONFIG_DIST_8X8
 
-  if (cpi->sf.use_transform_domain_distortion && !CONFIG_DIST_8X8) {
+  if (cpi->sf.use_transform_domain_distortion
+#if CONFIG_DIST_8X8
+      && !x->using_dist_8x8
+#endif
+      ) {
     // Transform domain distortion computation is more efficient as it does
     // not involve an inverse transform, but it is less accurate.
     const int buffer_length = tx_size_2d[tx_size];
@@ -1721,25 +1752,22 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
 #if CONFIG_HIGHBITDEPTH
     const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
     *out_dist = av1_highbd_block_error2_c(coeff, dqcoeff, ref_coeff,
-                                          buffer_length, &this_sse, bd) >>
-                shift;
+                                          buffer_length, &this_sse, bd);
 #else
-    *out_dist = av1_block_error2_c(coeff, dqcoeff, ref_coeff, buffer_length,
-                                   &this_sse) >>
-                shift;
+    *out_dist =
+        av1_block_error2_c(coeff, dqcoeff, ref_coeff, buffer_length, &this_sse);
 #endif  // CONFIG_HIGHBITDEPTH
 #else   // !CONFIG_PVQ
 #if CONFIG_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
       *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length,
-                                         &this_sse, xd->bd) >>
-                  shift;
+                                         &this_sse, xd->bd);
     else
 #endif
-      *out_dist =
-          av1_block_error(coeff, dqcoeff, buffer_length, &this_sse) >> shift;
+      *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
 #endif  // CONFIG_PVQ
-    *out_sse = this_sse >> shift;
+    *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
+    *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
   } else {
     const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
 #if !CONFIG_PVQ || CONFIG_DIST_8X8
@@ -1808,17 +1836,23 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
         (void)dst;
 #endif  // !CONFIG_PVQ
 
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+        uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block);
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
         const PLANE_TYPE plane_type = get_plane_type(plane);
         TX_TYPE tx_type =
             av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
         av1_inverse_transform_block(xd, dqcoeff,
-#if CONFIG_LGT
+#if CONFIG_LGT_FROM_PRED
                                     xd->mi[0]->mbmi.mode,
 #endif
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                                    mrc_mask,
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
                                     tx_type, tx_size, recon, MAX_TX_SIZE, eob);
 
 #if CONFIG_DIST_8X8
-        if (plane == 0 && (bsw < 8 || bsh < 8)) {
+        if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) {
           // Save decoded pixels for inter block in pd->pred to avoid
           // block_8x8_rd_txfm_daala_dist() need to produce them
           // by calling av1_inverse_transform_block() again.
@@ -1864,12 +1898,23 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   const AV1_COMP *cpi = args->cpi;
   ENTROPY_CONTEXT *a = args->t_above + blk_col;
   ENTROPY_CONTEXT *l = args->t_left + blk_row;
-#if !CONFIG_TXK_SEL
   const AV1_COMMON *cm = &cpi->common;
-#endif
   int64_t rd1, rd2, rd;
   RD_STATS this_rd_stats;
 
+#if CONFIG_DIST_8X8
+  // For a sub8x8 tx in an 8x8 or larger partition of the luma channel,
+  // dist-8x8 disables early skip, because the distortion metric used for the
+  // sub8x8 tx (MSE) differs from the reference distortion of the 8x8 or
+  // larger partition (the new distortion metric).
+  // The exception is when dist-8x8 is enabled but MSE is still used,
+  // i.e. the "--tune=" encoder option is not used.
+  int disable_early_skip =
+      x->using_dist_8x8 && plane == 0 && plane_bsize >= BLOCK_8X8 &&
+      (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) &&
+      x->tune_metric != AOM_TUNE_PSNR;
+#endif  // CONFIG_DIST_8X8
+
 #if !CONFIG_SUPERTX && !CONFIG_VAR_TX
   assert(tx_size == av1_get_tx_size(plane, xd));
 #endif  // !CONFIG_SUPERTX
@@ -1879,26 +1924,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   if (args->exit_early) return;
 
   if (!is_inter_block(mbmi)) {
-    av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size);
-#if CONFIG_DPCM_INTRA
-    const int block_raster_idx =
-        av1_block_index_to_raster_order(tx_size, block);
-    const PREDICTION_MODE mode = (plane == AOM_PLANE_Y)
-                                     ? get_y_mode(xd->mi[0], block_raster_idx)
-                                     : get_uv_mode(mbmi->uv_mode);
-    TX_TYPE tx_type =
-        av1_get_tx_type((plane == AOM_PLANE_Y) ? PLANE_TYPE_Y : PLANE_TYPE_UV,
-                        xd, blk_row, blk_col, block, tx_size);
-    if (av1_use_dpcm_intra(plane, mode, tx_type, mbmi)) {
-      int8_t skip;
-      av1_encode_block_intra_dpcm(cm, x, mode, plane, block, blk_row, blk_col,
-                                  plane_bsize, tx_size, tx_type, a, l, &skip);
-      av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
-                     tx_size, &this_rd_stats.dist, &this_rd_stats.sse,
-                     OUTPUT_HAS_DECODED_PIXELS);
-      goto CALCULATE_RD;
-    }
-#endif  // CONFIG_DPCM_INTRA
+    av1_predict_intra_block_facade(cm, xd, plane, block, blk_col, blk_row,
+                                   tx_size);
     av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
   }
 
@@ -1921,21 +1948,32 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
 #if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
     tmp_dist =
-        av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd) >>
-        shift;
+        av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd);
   else
 #endif
-    tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp) >> shift;
+    tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp);
+  tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift);
 
-  if (RDCOST(x->rdmult, 0, tmp_dist) + args->this_rd < args->best_rd) {
+  if (
+#if CONFIG_DIST_8X8
+      disable_early_skip ||
+#endif
+      RDCOST(x->rdmult, 0, tmp_dist) + args->this_rd < args->best_rd) {
     av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
-                   a, l);
+                   a, l, 1);
   } else {
     args->exit_early = 1;
     return;
   }
 #endif  // DISABLE_TRELLISQ_SEARCH
 
+#if CONFIG_MRC_TX
+  if (mbmi->tx_type == MRC_DCT && !mbmi->valid_mrc_mask) {
+    args->exit_early = 1;
+    return;
+  }
+#endif  // CONFIG_MRC_TX
+
   if (!is_inter_block(mbmi)) {
     struct macroblock_plane *const p = &x->plane[plane];
     av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
@@ -1949,19 +1987,15 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
                    OUTPUT_HAS_PREDICTED_PIXELS);
   }
 #if CONFIG_CFL
-  if (plane == AOM_PLANE_Y && x->cfl_store_y) {
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    const int dst_stride = pd->dst.stride;
-    uint8_t *dst =
-        &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-    // TODO (ltrudeau) Store sub-8x8 inter blocks when bottom right block is
-    // intra predicted.
-    cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size, plane_bsize);
+  if (plane == AOM_PLANE_Y && xd->cfl->store_y) {
+#if CONFIG_CHROMA_SUB8X8
+    assert(!is_inter_block(mbmi) || plane_bsize < BLOCK_8X8);
+#else
+    assert(!is_inter_block(mbmi));
+#endif  // CONFIG_CHROMA_SUB8X8
+    cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
   }
-#endif
-#if CONFIG_DPCM_INTRA
-CALCULATE_RD : {}
-#endif  // CONFIG_DPCM_INTRA
+#endif  // CONFIG_CFL
   rd = RDCOST(x->rdmult, 0, this_rd_stats.dist);
   if (args->this_rd + rd > args->best_rd) {
     args->exit_early = 1;
@@ -2008,16 +2042,12 @@ CALCULATE_RD : {}
   args->this_rd += rd;
 
 #if CONFIG_DIST_8X8
-  if (!(plane == 0 && plane_bsize >= BLOCK_8X8 &&
-        (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))) {
+  if (!disable_early_skip)
 #endif
     if (args->this_rd > args->best_rd) {
       args->exit_early = 1;
       return;
     }
-#if CONFIG_DIST_8X8
-  }
-#endif
 }
 
 #if CONFIG_DIST_8X8
@@ -2033,8 +2063,10 @@ static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
   const uint8_t *src = &p->src.buf[0];
   const uint8_t *dst = &pd->dst.buf[0];
   const int16_t *pred = &pd->pred[0];
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
+  int bw = block_size_wide[bsize];
+  int bh = block_size_high[bsize];
+  int visible_w = bw;
+  int visible_h = bh;
 
   int i, j;
   int64_t rd, rd1, rd2;
@@ -2044,6 +2076,9 @@ static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
   assert((bw & 0x07) == 0);
   assert((bh & 0x07) == 0);
 
+  get_txb_dimensions(xd, 0, bsize, 0, 0, bsize, &bw, &bh, &visible_w,
+                     &visible_h);
+
 #if CONFIG_HIGHBITDEPTH
   uint8_t *pred8;
   DECLARE_ALIGNED(16, uint16_t, pred16[MAX_TX_SQUARE]);
@@ -2064,22 +2099,30 @@ static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
   } else {
 #endif
     for (j = 0; j < bh; j++)
-      for (i = 0; i < bw; i++) pred8[j * bw + i] = pred[j * bw + i];
+      for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i];
 #if CONFIG_HIGHBITDEPTH
   }
 #endif  // CONFIG_HIGHBITDEPTH
 
-  tmp1 = av1_dist_8x8(cpi, xd, src, src_stride, pred8, bw, bsize, bw, bh, bw,
-                      bh, qindex);
-  tmp2 = av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride, bsize, bw, bh,
-                      bw, bh, qindex);
+  tmp1 = (unsigned)av1_dist_8x8(cpi, x, src, src_stride, pred8, bw, bsize, bw,
+                                bh, visible_w, visible_h, qindex);
+  tmp2 = (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize,
+                                bw, bh, visible_w, visible_h, qindex);
 
   if (!is_inter_block(mbmi)) {
+    if (x->tune_metric == AOM_TUNE_PSNR) {
+      assert(args->rd_stats.sse == (int64_t)tmp1 * 16);
+      assert(args->rd_stats.dist == (int64_t)tmp2 * 16);
+    }
     args->rd_stats.sse = (int64_t)tmp1 * 16;
     args->rd_stats.dist = (int64_t)tmp2 * 16;
   } else {
     // For inter mode, the decoded pixels are provided in pd->pred,
     // while the predicted pixels are in dst.
+    if (x->tune_metric == AOM_TUNE_PSNR) {
+      assert(args->rd_stats.sse == (int64_t)tmp2 * 16);
+      assert(args->rd_stats.dist == (int64_t)tmp1 * 16);
+    }
     args->rd_stats.sse = (int64_t)tmp2 * 16;
     args->rd_stats.dist = (int64_t)tmp1 * 16;
   }
@@ -2116,7 +2159,8 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
   av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
                                          &args);
 #if CONFIG_DIST_8X8
-  if (!args.exit_early && plane == 0 && bsize >= BLOCK_8X8 &&
+  if (x->using_dist_8x8 && !args.exit_early && plane == 0 &&
+      bsize >= BLOCK_8X8 &&
       (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))
     dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args);
 #endif
@@ -2174,23 +2218,14 @@ static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x,
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 
-  const int tx_select = cm->tx_mode == TX_MODE_SELECT &&
-#if CONFIG_EXT_PARTITION_TYPES
-                        // Currently these block shapes can only use 4x4
-                        // transforms
-                        mbmi->sb_type != BLOCK_4X16 &&
-                        mbmi->sb_type != BLOCK_16X4 &&
-#endif
-                        mbmi->sb_type >= BLOCK_8X8;
-
-  if (tx_select) {
+  if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type)) {
     const int is_inter = is_inter_block(mbmi);
-    const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
-                                     : intra_tx_size_cat_lookup[bsize];
+    const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
+                                         : intra_tx_size_cat_lookup[bsize];
     const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
     const int depth = tx_size_to_depth(coded_tx_size);
     const int tx_size_ctx = get_tx_size_context(xd);
-    int r_tx_size = cpi->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
+    int r_tx_size = x->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
 #if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
     if (is_quarter_tx_allowed(xd, mbmi, is_inter) && tx_size != coded_tx_size)
       r_tx_size += av1_cost_bit(cm->fc->quarter_tx_size_prob,
@@ -2202,12 +2237,38 @@ static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x,
   }
 }
 
-// #TODO(angiebird): use this function whenever it's possible
-int av1_tx_type_cost(const AV1_COMP *cpi, const MACROBLOCKD *xd,
-                     BLOCK_SIZE bsize, int plane, TX_SIZE tx_size,
-                     TX_TYPE tx_type) {
+#if CONFIG_LGT_FROM_PRED
+// Rate cost of coding use_lgt for a luma block, or 0 when no cost applies.
+int av1_lgt_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
+                 const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
+                 TX_SIZE tx_size, int use_lgt) {
+  if (plane > 0) return 0;
+  const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const int is_inter = is_inter_block(mbmi);
+  assert(is_lgt_allowed(mbmi->mode, tx_size));
+  if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 &&
+      !xd->lossless[mbmi->segment_id]) {
+    const int ext_tx_set =
+        get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
+    if (LGT_FROM_PRED_INTRA && !is_inter && ext_tx_set > 0 &&
+        ALLOW_INTRA_EXT_TX)
+      return x->intra_lgt_cost[txsize_sqr_map[tx_size]][mbmi->mode][use_lgt];
+    if (LGT_FROM_PRED_INTER && is_inter && ext_tx_set > 0)
+      return x->inter_lgt_cost[txsize_sqr_map[tx_size]][use_lgt];
+  }
+  return 0;
+}
+#endif  // CONFIG_LGT_FROM_PRED
+
+// TODO(angiebird): use this function whenever it's possible
+int av1_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
+                     const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
+                     TX_SIZE tx_size, TX_TYPE tx_type) {
   if (plane > 0) return 0;
 
+#if CONFIG_LGT_FROM_PRED
+  assert(!xd->mi[0]->mbmi.use_lgt);
+#endif
 #if CONFIG_VAR_TX
   tx_size = get_min_tx_size(tx_size);
 #endif
@@ -2215,31 +2276,31 @@ int av1_tx_type_cost(const AV1_COMP *cpi, const MACROBLOCKD *xd,
   const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   const int is_inter = is_inter_block(mbmi);
 #if CONFIG_EXT_TX
-  const AV1_COMMON *cm = &cpi->common;
   if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 &&
       !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
     const int ext_tx_set =
         get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
     if (is_inter) {
       if (ext_tx_set > 0)
-        return cpi
+        return x
             ->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]][tx_type];
     } else {
       if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
-        return cpi->intra_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]]
-                                       [mbmi->mode][tx_type];
+        return x->intra_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]]
+                                     [mbmi->mode][tx_type];
     }
   }
 #else
   (void)bsize;
+  (void)cm;
   if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
       !FIXED_TX_TYPE) {
     if (is_inter) {
-      return cpi->inter_tx_type_costs[tx_size][tx_type];
+      return x->inter_tx_type_costs[tx_size][tx_type];
     } else {
-      return cpi->intra_tx_type_costs[tx_size]
-                                     [intra_mode_to_tx_type_context[mbmi->mode]]
-                                     [tx_type];
+      return x->intra_tx_type_costs[tx_size]
+                                   [intra_mode_to_tx_type_context[mbmi->mode]]
+                                   [tx_type];
     }
   }
 #endif  // CONFIG_EXT_TX
@@ -2247,7 +2308,7 @@ int av1_tx_type_cost(const AV1_COMP *cpi, const MACROBLOCKD *xd,
 }
 static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                         RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs,
-                        TX_TYPE tx_type, int tx_size) {
+                        TX_TYPE tx_type, TX_SIZE tx_size) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
@@ -2278,7 +2339,15 @@ static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
   if (rd_stats->rate == INT_MAX) return INT64_MAX;
 #if !CONFIG_TXK_SEL
   int plane = 0;
-  rd_stats->rate += av1_tx_type_cost(cpi, xd, bs, plane, tx_size, tx_type);
+#if CONFIG_LGT_FROM_PRED
+  if (is_lgt_allowed(mbmi->mode, tx_size))
+    rd_stats->rate +=
+        av1_lgt_cost(cm, x, xd, bs, plane, tx_size, mbmi->use_lgt);
+  if (!mbmi->use_lgt)
+    rd_stats->rate += av1_tx_type_cost(cm, x, xd, bs, plane, tx_size, tx_type);
+#else
+  rd_stats->rate += av1_tx_type_cost(cm, x, xd, bs, plane, tx_size, tx_type);
+#endif  // CONFIG_LGT_FROM_PRED
 #endif
 
   if (rd_stats->skip) {
@@ -2316,8 +2385,14 @@ static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
 #if CONFIG_MRC_TX
   // MRC_DCT only implemented for TX_32X32 so only include this tx in
   // the search for TX_32X32
-  if (tx_type == MRC_DCT && tx_size != TX_32X32) return 1;
+  if (tx_type == MRC_DCT &&
+      ((is_inter && !USE_MRC_INTER) || (!is_inter && !USE_MRC_INTRA) ||
+       tx_size != TX_32X32))
+    return 1;
 #endif  // CONFIG_MRC_TX
+#if CONFIG_LGT_FROM_PRED
+  if (mbmi->use_lgt && mbmi->ref_mv_idx > 0) return 1;
+#endif  // CONFIG_LGT_FROM_PRED
   if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) return 1;
   if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, tx_size))
     return 1;
@@ -2330,10 +2405,10 @@ static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
   if (max_tx_size >= TX_32X32 && tx_size == TX_4X4) return 1;
 #if CONFIG_EXT_TX
   const AV1_COMMON *const cm = &cpi->common;
-  int ext_tx_set =
-      get_ext_tx_set(tx_size, bs, is_inter, cm->reduced_tx_set_used);
+  const TxSetType tx_set_type =
+      get_ext_tx_set_type(tx_size, bs, is_inter, cm->reduced_tx_set_used);
+  if (!av1_ext_tx_used[tx_set_type][tx_type]) return 1;
   if (is_inter) {
-    if (!ext_tx_used_inter[ext_tx_set][tx_type]) return 1;
     if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
       if (!do_tx_type_search(tx_type, prune)) return 1;
     }
@@ -2341,7 +2416,6 @@ static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
     if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
       if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) return 1;
     }
-    if (!ext_tx_used_intra[ext_tx_set][tx_type]) return 1;
   }
 #else   // CONFIG_EXT_TX
   if (tx_size >= TX_32X32 && tx_type != DCT_DCT) return 1;
@@ -2352,8 +2426,7 @@ static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
   return 0;
 }
 
-#if CONFIG_EXT_INTER && \
-    (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA)
+#if (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA)
 static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
                                    MACROBLOCK *x, int *r, int64_t *d, int *s,
                                    int64_t *sse, int64_t ref_best_rd) {
@@ -2366,7 +2439,7 @@ static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
   *sse = rd_stats.sse;
   return rd;
 }
-#endif  // CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
+#endif  // (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA)
 
 static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
                                    RD_STATS *rd_stats, int64_t ref_best_rd,
@@ -2382,9 +2455,14 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
   const int is_inter = is_inter_block(mbmi);
   int prune = 0;
   const int plane = 0;
-#if CONFIG_EXT_TX
-  int ext_tx_set;
-#endif  // CONFIG_EXT_TX
+#if CONFIG_LGT_FROM_PRED
+  int is_lgt_best = 0;  // set when the LGT candidate wins the RD search
+  int search_lgt = is_inter
+                       ? LGT_FROM_PRED_INTER && !x->use_default_inter_tx_type &&
+                             !(cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
+                       : LGT_FROM_PRED_INTRA && !x->use_default_intra_tx_type &&
+                             ALLOW_INTRA_EXT_TX;
+#endif  // CONFIG_LGT_FROM_PRED
   av1_invalid_rd_stats(rd_stats);
 
   mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
@@ -2392,8 +2470,10 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
   mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
 #endif  // CONFIG_VAR_TX
 #if CONFIG_EXT_TX
-  ext_tx_set =
+  int ext_tx_set =
       get_ext_tx_set(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used);
+  const TxSetType tx_set_type =
+      get_ext_tx_set_type(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used);
 #endif  // CONFIG_EXT_TX
 
   if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
@@ -2414,12 +2494,12 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
 #endif  // CONFIG_PVQ
 
     for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+      if (!av1_ext_tx_used[tx_set_type][tx_type]) continue;
       RD_STATS this_rd_stats;
       if (is_inter) {
         if (x->use_default_inter_tx_type &&
             tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
           continue;
-        if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
         if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
           if (!do_tx_type_search(tx_type, prune)) continue;
         }
@@ -2430,7 +2510,6 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
         if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
           if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
         }
-        if (!ext_tx_used_intra[ext_tx_set][tx_type]) continue;
       }
 
       mbmi->tx_type = tx_type;
@@ -2441,7 +2520,7 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
       od_encode_rollback(&x->daala_enc, &pre_buf);
 #endif  // CONFIG_PVQ
       if (this_rd_stats.rate == INT_MAX) continue;
-      av1_tx_type_cost(cpi, xd, bs, plane, mbmi->tx_size, tx_type);
+      av1_tx_type_cost(cm, x, xd, bs, plane, mbmi->tx_size, tx_type);
 
       if (this_rd_stats.skip)
         this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse);
@@ -2464,6 +2543,33 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
 #if CONFIG_PVQ
     od_encode_rollback(&x->daala_enc, &post_buf);
 #endif  // CONFIG_PVQ
+#if CONFIG_LGT_FROM_PRED
+    // search LGT
+    if (search_lgt && is_lgt_allowed(mbmi->mode, mbmi->tx_size) &&
+        !cm->reduced_tx_set_used) {
+      RD_STATS this_rd_stats;
+      mbmi->use_lgt = 1;
+      txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs,
+                       mbmi->tx_size, cpi->sf.use_fast_coef_costing);
+      if (this_rd_stats.rate != INT_MAX) {
+        av1_lgt_cost(cm, x, xd, bs, plane, mbmi->tx_size, 1);
+        if (this_rd_stats.skip)
+          this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse);
+        else
+          this_rd =
+              RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist);
+        if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] &&
+            !this_rd_stats.skip)
+          this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse));
+        if (this_rd < best_rd) {
+          best_rd = this_rd;
+          is_lgt_best = 1;
+          *rd_stats = this_rd_stats;
+        }
+      }
+      mbmi->use_lgt = 0;
+    }
+#endif  // CONFIG_LGT_FROM_PRED
   } else {
     mbmi->tx_type = DCT_DCT;
     txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
@@ -2484,7 +2590,7 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
                        mbmi->tx_size, cpi->sf.use_fast_coef_costing);
       if (this_rd_stats.rate == INT_MAX) continue;
 
-      av1_tx_type_cost(cpi, xd, bs, plane, mbmi->tx_size, tx_type);
+      av1_tx_type_cost(cm, x, xd, bs, plane, mbmi->tx_size, tx_type);
       if (is_inter) {
         if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
             !do_tx_type_search(tx_type, prune))
@@ -2511,6 +2617,9 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
   }
 #endif  // CONFIG_EXT_TX
   mbmi->tx_type = best_tx_type;
+#if CONFIG_LGT_FROM_PRED
+  mbmi->use_lgt = is_lgt_best;
+#endif  // CONFIG_LGT_FROM_PRED
 }
 
 static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -2549,6 +2658,11 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   TX_SIZE best_tx_size = max_tx_size;
   TX_TYPE best_tx_type = DCT_DCT;
+#if CONFIG_LGT_FROM_PRED
+  int breakout = 0;
+  int is_lgt_best = 0;
+  mbmi->use_lgt = 0;
+#endif  // CONFIG_LGT_FROM_PRED
 #if CONFIG_TXK_SEL
   TX_TYPE best_txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
 #endif  // CONFIG_TXK_SEL
@@ -2584,12 +2698,12 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
       if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue;
       const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs];
       RD_STATS this_rd_stats;
-      int ext_tx_set =
-          get_ext_tx_set(rect_tx_size, bs, is_inter, cm->reduced_tx_set_used);
-      if ((is_inter && ext_tx_used_inter[ext_tx_set][tx_type]) ||
-          (!is_inter && ext_tx_used_intra[ext_tx_set][tx_type])) {
+      const TxSetType tx_set_type = get_ext_tx_set_type(
+          rect_tx_size, bs, is_inter, cm->reduced_tx_set_used);
+      if (av1_ext_tx_used[tx_set_type][tx_type]) {
         rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type,
                       rect_tx_size);
+        ref_best_rd = AOMMIN(rd, ref_best_rd);
         if (rd < best_rd) {
 #if CONFIG_TXK_SEL
           memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type[0]) * 256);
@@ -2605,6 +2719,21 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
       if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
 #endif  // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
     }
+#if CONFIG_LGT_FROM_PRED
+    const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs];
+    if (is_lgt_allowed(mbmi->mode, rect_tx_size) && !cm->reduced_tx_set_used) {
+      RD_STATS this_rd_stats;
+      mbmi->use_lgt = 1;
+      rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, rect_tx_size);
+      if (rd < best_rd) {
+        is_lgt_best = 1;
+        best_tx_size = rect_tx_size;
+        best_rd = rd;
+        *rd_stats = this_rd_stats;
+      }
+      mbmi->use_lgt = 0;
+    }
+#endif  // CONFIG_LGT_FROM_PRED
   }
 
 #if CONFIG_RECT_TX_EXT
@@ -2632,10 +2761,9 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
       if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue;
       const TX_SIZE tx_size = quarter_txsize_lookup[bs];
       RD_STATS this_rd_stats;
-      int ext_tx_set =
-          get_ext_tx_set(tx_size, bs, is_inter, cm->reduced_tx_set_used);
-      if ((is_inter && ext_tx_used_inter[ext_tx_set][tx_type]) ||
-          (!is_inter && ext_tx_used_intra[ext_tx_set][tx_type])) {
+      const TxSetType tx_set_type =
+          get_ext_tx_set_type(tx_size, bs, is_inter, cm->reduced_tx_set_used);
+      if (av1_ext_tx_used[tx_set_type][tx_type]) {
         rd =
             txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, tx_size);
         if (rd < best_rd) {
@@ -2644,6 +2772,9 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
                  sizeof(best_txk_type[0]) * num_blk);
 #endif
           best_tx_type = tx_type;
+#if CONFIG_LGT_FROM_PRED
+          is_lgt_best = 0;
+#endif
           best_tx_size = tx_size;
           best_rd = rd;
           *rd_stats = this_rd_stats;
@@ -2654,6 +2785,21 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
       if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
 #endif  // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
     }
+#if CONFIG_LGT_FROM_PRED
+    const TX_SIZE tx_size = quarter_txsize_lookup[bs];
+    if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) {
+      RD_STATS this_rd_stats;
+      mbmi->use_lgt = 1;  // Trial only; cleared again after the comparison.
+      rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, tx_size);
+      if (rd < best_rd) {
+        is_lgt_best = 1;
+        best_tx_size = tx_size;
+        best_rd = rd;
+        *rd_stats = this_rd_stats;
+      }
+      mbmi->use_lgt = 0;
+    }
+#endif  // CONFIG_LGT_FROM_PRED
   }
 #endif  // CONFIG_RECT_TX_EXT
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
@@ -2692,15 +2838,23 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
       if (cpi->sf.tx_size_search_breakout &&
           (rd == INT64_MAX ||
            (this_rd_stats.skip == 1 && tx_type != DCT_DCT && n < start_tx) ||
-           (n < (int)max_tx_size && rd > last_rd)))
+           (n < (int)max_tx_size && rd > last_rd))) {
+#if CONFIG_LGT_FROM_PRED
+        breakout = 1;
+#endif
         break;
+      }
 
       last_rd = rd;
+      ref_best_rd = AOMMIN(rd, ref_best_rd);
       if (rd < best_rd) {
 #if CONFIG_TXK_SEL
         memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type[0]) * 256);
 #endif
         best_tx_type = tx_type;
+#if CONFIG_LGT_FROM_PRED
+        is_lgt_best = 0;
+#endif
         best_tx_size = n;
         best_rd = rd;
         *rd_stats = this_rd_stats;
@@ -2710,9 +2864,28 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
       if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
 #endif  // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
     }
+#if CONFIG_LGT_FROM_PRED
+    mbmi->use_lgt = 1;
+    if (is_lgt_allowed(mbmi->mode, n) && !skip_txfm_search(cpi, x, bs, 0, n) &&
+        !breakout) {
+      RD_STATS this_rd_stats;
+      rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, n);
+      if (rd < best_rd) {
+        is_lgt_best = 1;
+        best_tx_size = n;
+        best_rd = rd;
+        *rd_stats = this_rd_stats;
+      }
+    }
+    mbmi->use_lgt = 0;
+#endif  // CONFIG_LGT_FROM_PRED
   }
   mbmi->tx_size = best_tx_size;
   mbmi->tx_type = best_tx_type;
+#if CONFIG_LGT_FROM_PRED
+  mbmi->use_lgt = is_lgt_best;
+  assert(!is_lgt_best || is_lgt_allowed(mbmi->mode, mbmi->tx_size));
+#endif  // CONFIG_LGT_FROM_PRED
 #if CONFIG_TXK_SEL
   memcpy(mbmi->txk_type, best_txk_type, sizeof(best_txk_type[0]) * 256);
 #endif
@@ -2768,6 +2941,7 @@ static int conditional_skipintra(PREDICTION_MODE mode,
 // Model based RD estimation for luma intra blocks.
 static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
                                BLOCK_SIZE bsize, int mode_cost) {
+  const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   assert(!is_inter_block(mbmi));
@@ -2785,7 +2959,7 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
   int block = 0;
   for (row = 0; row < max_blocks_high; row += stepr) {
     for (col = 0; col < max_blocks_wide; col += stepc) {
-      av1_predict_intra_block_facade(xd, 0, block, col, row, tx_size);
+      av1_predict_intra_block_facade(cm, xd, 0, block, col, row, tx_size);
       block += step;
     }
   }
@@ -2816,7 +2990,6 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
   return this_rd;
 }
 
-#if CONFIG_PALETTE
 // Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x
 // new_height'. Extra rows and columns are filled in by copying last valid
 // row/column.
@@ -2875,6 +3048,7 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
   MODE_INFO *const mic = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mic->mbmi;
   assert(!is_inter_block(mbmi));
+  assert(bsize >= BLOCK_8X8);
   int this_rate, colors, n;
   const int src_stride = x->plane[0].src.stride;
   const uint8_t *const src = x->plane[0].src.buf;
@@ -2897,9 +3071,8 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
 #endif  // CONFIG_FILTER_INTRA
 
   if (colors > 1 && colors <= 64) {
-    int r, c, i, j, k, palette_mode_cost;
+    int r, c, i, k, palette_mode_cost;
     const int max_itr = 50;
-    uint8_t color_order[PALETTE_MAX_SIZE];
     float *const data = x->palette_buffer->kmeans_data_buf;
     float centroids[PALETTE_MAX_SIZE];
     float lb, ub, val;
@@ -2950,11 +3123,8 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
     if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return 0;
 
 #if CONFIG_PALETTE_DELTA_ENCODING
-    const MODE_INFO *above_mi = xd->above_mi;
-    const MODE_INFO *left_mi = xd->left_mi;
     uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-    const int n_cache =
-        av1_get_palette_cache(above_mi, left_mi, 0, color_cache);
+    const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
 #endif  // CONFIG_PALETTE_DELTA_ENCODING
 
     for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
@@ -2998,7 +3168,7 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
                                block_height);
       palette_mode_cost =
           dc_mode_cost +
-          cpi->palette_y_size_cost[bsize - BLOCK_8X8][k - PALETTE_MIN_SIZE] +
+          x->palette_y_size_cost[bsize - BLOCK_8X8][k - PALETTE_MIN_SIZE] +
           write_uniform_cost(k, color_map[0]) +
           av1_cost_bit(
               av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx],
@@ -3008,16 +3178,8 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
                                                     color_cache, n_cache,
 #endif  // CONFIG_PALETTE_DELTA_ENCODING
                                                     cpi->common.bit_depth);
-      for (i = 0; i < rows; ++i) {
-        for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
-          int color_idx;
-          const int color_ctx = av1_get_palette_color_index_context(
-              color_map, block_width, i, j, k, color_order, &color_idx);
-          assert(color_idx >= 0 && color_idx < k);
-          palette_mode_cost += cpi->palette_y_color_cost[k - PALETTE_MIN_SIZE]
-                                                        [color_ctx][color_idx];
-        }
-      }
+      palette_mode_cost +=
+          av1_cost_color_map(x, 0, 0, bsize, mbmi->tx_size, PALETTE_MAP);
       this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost);
       if (*best_model_rd != INT64_MAX &&
           this_model_rd > *best_model_rd + (*best_model_rd >> 1))
@@ -3027,7 +3189,8 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
       if (tokenonly_rd_stats.rate == INT_MAX) continue;
       this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
       this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-      if (!xd->lossless[mbmi->segment_id] && mbmi->sb_type >= BLOCK_8X8) {
+      if (!xd->lossless[mbmi->segment_id] &&
+          block_signals_txsize(mbmi->sb_type)) {
         tokenonly_rd_stats.rate -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
       }
       if (this_rd < *best_rd) {
@@ -3046,12 +3209,11 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
 
   if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
     memcpy(color_map, best_palette_color_map,
-           rows * cols * sizeof(best_palette_color_map[0]));
+           block_width * block_height * sizeof(best_palette_color_map[0]));
   }
   *mbmi = *best_mbmi;
   return rate_overhead;
 }
-#endif  // CONFIG_PALETTE
 
 static int64_t rd_pick_intra_sub_8x8_y_subblock_mode(
     const AV1_COMP *const cpi, MACROBLOCK *x, int row, int col,
@@ -3124,9 +3286,7 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode(
 
   xd->mi[0]->mbmi.tx_size = tx_size;
 
-#if CONFIG_PALETTE
   xd->mi[0]->mbmi.palette_mode_info.palette_size[0] = 0;
-#endif  // CONFIG_PALETTE
 
 #if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -3172,8 +3332,8 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode(
                          block == 0 || block == 2));
           xd->mi[0]->bmi[block_raster_idx].as_mode = mode;
           av1_predict_intra_block(
-              xd, pd->width, pd->height, txsize_to_bsize[tx_size], mode, dst,
-              dst_stride, dst, dst_stride, col + idx, row + idy, 0);
+              cm, xd, pd->width, pd->height, txsize_to_bsize[tx_size], mode,
+              dst, dst_stride, dst, dst_stride, col + idx, row + idy, 0);
 #if !CONFIG_PVQ
           aom_highbd_subtract_block(tx_height, tx_width, src_diff, 8, src,
                                     src_stride, dst, dst_stride, xd->bd);
@@ -3220,9 +3380,12 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode(
             if (!skip)
 #endif
               av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
-#if CONFIG_LGT
+#if CONFIG_LGT_FROM_PRED
                                           mode,
 #endif
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                                          BLOCK_OFFSET(xd->mrc_mask, block),
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
                                           DCT_DCT, tx_size, dst, dst_stride,
                                           p->eobs[block]);
           } else {
@@ -3242,7 +3405,7 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode(
             av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
                             tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
             av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size,
-                           tempa + idx, templ + idy);
+                           tempa + idx, templ + idy, 1);
 #endif  // DISABLE_TRELLISQ_SEARCH
             ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size,
                                      scan_order, tempa + idx, templ + idy,
@@ -3273,9 +3436,12 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode(
             if (!skip)
 #endif
               av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
-#if CONFIG_LGT
+#if CONFIG_LGT_FROM_PRED
                                           mode,
 #endif
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                                          BLOCK_OFFSET(xd->mrc_mask, block),
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
                                           tx_type, tx_size, dst, dst_stride,
                                           p->eobs[block]);
             cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
@@ -3374,7 +3540,7 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode(
         assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
                        block == 0 || block == 2));
         xd->mi[0]->bmi[block_raster_idx].as_mode = mode;
-        av1_predict_intra_block(xd, pd->width, pd->height,
+        av1_predict_intra_block(cm, xd, pd->width, pd->height,
                                 txsize_to_bsize[tx_size], mode, dst, dst_stride,
                                 dst, dst_stride,
 #if CONFIG_CB4X4
@@ -3416,7 +3582,7 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode(
                         BLOCK_8X8, tx_size, coeff_ctx, xform_quant);
 
         av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size, tempa + idx,
-                       templ + idy);
+                       templ + idy, 1);
 #endif  // DISABLE_TRELLISQ_SEARCH
         ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, scan_order,
                                  tempa + idx, templ + idy,
@@ -3459,9 +3625,12 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode(
           if (!skip)
 #endif  // CONFIG_PVQ
             av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
-#if CONFIG_LGT
+#if CONFIG_LGT_FROM_PRED
                                         mode,
 #endif
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                                        BLOCK_OFFSET(xd->mrc_mask, block),
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
                                         tx_type, tx_size, dst, dst_stride,
                                         p->eobs[block]);
           unsigned int tmp;
@@ -3477,9 +3646,12 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode(
           if (!skip)
 #endif  // CONFIG_PVQ
             av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
-#if CONFIG_LGT
+#if CONFIG_LGT_FROM_PRED
                                         mode,
 #endif
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                                        BLOCK_OFFSET(xd->mrc_mask, block),
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
                                         DCT_DCT, tx_size, dst, dst_stride,
                                         p->eobs[block]);
         }
@@ -3544,7 +3716,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi,
   int64_t total_distortion = 0;
   int tot_rate_y = 0;
   int64_t total_rd = 0;
-  const int *bmode_costs = cpi->mbmode_cost[0];
+  const int *bmode_costs = mb->mbmode_cost[0];
   const int is_lossless = xd->lossless[mbmi->segment_id];
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
   const TX_SIZE tx_size = is_lossless ? TX_4X4 : max_txsize_rect_lookup[bsize];
@@ -3565,6 +3737,9 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi,
   // expense of speed.
   mbmi->tx_type = DCT_DCT;
   mbmi->tx_size = tx_size;
+#if CONFIG_LGT_FROM_PRED
+  mbmi->use_lgt = 0;
+#endif
 
   if (y_skip) *y_skip = 1;
 
@@ -3583,15 +3758,23 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi,
         const PREDICTION_MODE L =
             av1_left_block_mode(mic, left_mi, pred_block_idx);
 
-        bmode_costs = cpi->y_mode_costs[A][L];
+#if CONFIG_KF_CTX
+        const int above_ctx = intra_mode_context[A];
+        const int left_ctx = intra_mode_context[L];
+        bmode_costs = mb->y_mode_costs[above_ctx][left_ctx];
+#else
+        bmode_costs = mb->y_mode_costs[A][L];
+#endif
       }
       this_rd = rd_pick_intra_sub_8x8_y_subblock_mode(
           cpi, mb, idy, idx, &best_mode, bmode_costs,
           xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r,
           &ry, &d, bsize, tx_size, y_skip, best_rd - total_rd);
-#if !CONFIG_DIST_8X8
-      if (this_rd >= best_rd - total_rd) return INT64_MAX;
-#endif  // !CONFIG_DIST_8X8
+#if CONFIG_DIST_8X8
+      if (!cpi->oxcf.using_dist_8x8)
+#endif
+        if (this_rd >= best_rd - total_rd) return INT64_MAX;
+
       total_rd += this_rd;
       cost += r;
       total_distortion += d;
@@ -3609,7 +3792,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi,
   mbmi->mode = mic->bmi[3].as_mode;
 
 #if CONFIG_DIST_8X8
-  {
+  if (cpi->oxcf.using_dist_8x8) {
     const struct macroblock_plane *p = &mb->plane[0];
     const struct macroblockd_plane *pd = &xd->plane[0];
     const int src_stride = p->src.stride;
@@ -3617,11 +3800,8 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi,
     uint8_t *src = p->src.buf;
     uint8_t *dst = pd->dst.buf;
 
-#if CONFIG_PVQ
-    use_activity_masking = mb->daala_enc.use_activity_masking;
-#endif  // CONFIG_PVQ
     // Daala-defined distortion computed for the block of 8x8 pixels
-    total_distortion = av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride,
+    total_distortion = av1_dist_8x8(cpi, mb, src, src_stride, dst, dst_stride,
                                     BLOCK_8X8, 8, 8, 8, 8, mb->qindex)
                        << 4;
   }
@@ -3634,14 +3814,20 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi,
         1) {
       const int eset =
           get_ext_tx_set(tx_size, bsize, 0, cpi->common.reduced_tx_set_used);
-      rate_tx_type = cpi->intra_tx_type_costs[eset][txsize_sqr_map[tx_size]]
-                                             [mbmi->mode][mbmi->tx_type];
+#if CONFIG_LGT_FROM_PRED
+      if (LGT_FROM_PRED_INTRA && is_lgt_allowed(mbmi->mode, tx_size))
+        rate_tx_type += mb->intra_lgt_cost[txsize_sqr_map[tx_size]][mbmi->mode]
+                                          [mbmi->use_lgt];
+      if (!LGT_FROM_PRED_INTRA || !mbmi->use_lgt)
+#endif  // CONFIG_LGT_FROM_PRED
+        rate_tx_type += mb->intra_tx_type_costs[eset][txsize_sqr_map[tx_size]]
+                                               [mbmi->mode][mbmi->tx_type];
     }
 #else
     rate_tx_type =
-        cpi->intra_tx_type_costs[txsize_sqr_map[tx_size]]
-                                [intra_mode_to_tx_type_context[mbmi->mode]]
-                                [mbmi->tx_type];
+        mb->intra_tx_type_costs[txsize_sqr_map[tx_size]]
+                               [intra_mode_to_tx_type_context[mbmi->mode]]
+                               [mbmi->tx_type];
 #endif  // CONFIG_EXT_TX
     assert(mbmi->tx_size == tx_size);
     cost += rate_tx_type;
@@ -3671,13 +3857,14 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
   TX_SIZE best_tx_size = TX_4X4;
   FILTER_INTRA_MODE_INFO filter_intra_mode_info;
   TX_TYPE best_tx_type;
+#if CONFIG_LGT_FROM_PRED
+  int use_lgt_when_selected;
+#endif
 
   av1_zero(filter_intra_mode_info);
   mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 1;
   mbmi->mode = DC_PRED;
-#if CONFIG_PALETTE
   mbmi->palette_mode_info.palette_size[0] = 0;
-#endif  // CONFIG_PALETTE
 
   for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
     int this_rate;
@@ -3702,6 +3889,9 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
       best_tx_size = mic->mbmi.tx_size;
       filter_intra_mode_info = mbmi->filter_intra_mode_info;
       best_tx_type = mic->mbmi.tx_type;
+#if CONFIG_LGT_FROM_PRED
+      use_lgt_when_selected = mic->mbmi.use_lgt;
+#endif
       *rate = this_rate;
       *rate_tokenonly = tokenonly_rd_stats.rate;
       *distortion = tokenonly_rd_stats.dist;
@@ -3713,6 +3903,9 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
   if (filter_intra_selected_flag) {
     mbmi->mode = DC_PRED;
     mbmi->tx_size = best_tx_size;
+#if CONFIG_LGT_FROM_PRED
+    mbmi->use_lgt = use_lgt_when_selected;
+#endif
     mbmi->filter_intra_mode_info.use_filter_intra_mode[0] =
         filter_intra_mode_info.use_filter_intra_mode[0];
     mbmi->filter_intra_mode_info.filter_intra_mode[0] =
@@ -3733,6 +3926,9 @@ static int64_t calc_rd_given_intra_angle(
     int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
     RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size,
     TX_TYPE *best_tx_type,
+#if CONFIG_LGT_FROM_PRED
+    int *use_lgt_when_selected,
+#endif
 #if CONFIG_INTRA_INTERP
     INTRA_FILTER *best_filter,
 #endif  // CONFIG_INTRA_INTERP
@@ -3765,6 +3961,9 @@ static int64_t calc_rd_given_intra_angle(
     *best_filter = mbmi->intra_filter;
 #endif  // CONFIG_INTRA_INTERP
     *best_tx_type = mbmi->tx_type;
+#if CONFIG_LGT_FROM_PRED
+    *use_lgt_when_selected = mbmi->use_lgt;
+#endif
     *rate = this_rate;
     rd_stats->rate = tokenonly_rd_stats.rate;
     rd_stats->dist = tokenonly_rd_stats.dist;
@@ -3794,6 +3993,9 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
   int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
   TX_SIZE best_tx_size = mic->mbmi.tx_size;
   TX_TYPE best_tx_type = mbmi->tx_type;
+#if CONFIG_LGT_FROM_PRED
+  int use_lgt_when_selected = mbmi->use_lgt;
+#endif
 
   for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
 
@@ -3810,12 +4012,15 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
         this_rd = calc_rd_given_intra_angle(
             cpi, x, bsize,
 #if CONFIG_INTRA_INTERP
-            mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter],
+            mode_cost + x->intra_filter_cost[intra_filter_ctx][filter],
 #else
           mode_cost,
 #endif  // CONFIG_INTRA_INTERP
             best_rd_in, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate,
             rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type,
+#if CONFIG_LGT_FROM_PRED
+            &use_lgt_when_selected,
+#endif
 #if CONFIG_INTRA_INTERP
             &best_filter,
 #endif  // CONFIG_INTRA_INTERP
@@ -3851,12 +4056,15 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
           calc_rd_given_intra_angle(
               cpi, x, bsize,
 #if CONFIG_INTRA_INTERP
-              mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter],
+              mode_cost + x->intra_filter_cost[intra_filter_ctx][filter],
 #else
             mode_cost,
 #endif  // CONFIG_INTRA_INTERP
               best_rd, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate,
               rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type,
+#if CONFIG_LGT_FROM_PRED
+              &use_lgt_when_selected,
+#endif
 #if CONFIG_INTRA_INTERP
               &best_filter,
 #endif  // CONFIG_INTRA_INTERP
@@ -3876,10 +4084,13 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
         mic->mbmi.intra_filter = filter;
         this_rd = calc_rd_given_intra_angle(
             cpi, x, bsize,
-            mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter],
-            best_rd, best_angle_delta, MAX_ANGLE_DELTA, rate, rd_stats,
-            &best_angle_delta, &best_tx_size, &best_tx_type, &best_filter,
-            &best_rd, best_model_rd);
+            mode_cost + x->intra_filter_cost[intra_filter_ctx][filter], best_rd,
+            best_angle_delta, MAX_ANGLE_DELTA, rate, rd_stats,
+            &best_angle_delta, &best_tx_size, &best_tx_type,
+#if CONFIG_LGT_FROM_PRED
+            &use_lgt_when_selected,
+#endif
+            &best_filter, &best_rd, best_model_rd);
       }
     }
   }
@@ -3891,6 +4102,9 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
   mic->mbmi.intra_filter = best_filter;
 #endif  // CONFIG_INTRA_INTERP
   mbmi->tx_type = best_tx_type;
+#if CONFIG_LGT_FROM_PRED
+  mbmi->use_lgt = use_lgt_when_selected;
+#endif
   return best_rd;
 }
 
@@ -3919,9 +4133,7 @@ static const uint8_t gradient_to_angle_bin[2][7][16] = {
 /* clang-format off */
 static const uint8_t mode_to_angle_bin[INTRA_MODES] = {
   0, 2, 6, 0, 4, 3, 5, 7, 1, 0,
-#if CONFIG_ALT_INTRA
   0,
-#endif  // CONFIG_ALT_INTRA
 };
 /* clang-format on */
 
@@ -4064,16 +4276,12 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   uint16_t filter_intra_mode_skip_mask = (1 << FILTER_INTRA_MODES) - 1;
 #endif  // CONFIG_FILTER_INTRA
   const int *bmode_costs;
-#if CONFIG_PALETTE
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  uint8_t *best_palette_color_map =
-      cpi->common.allow_screen_content_tools
-          ? x->palette_buffer->best_palette_color_map
-          : NULL;
   int palette_y_mode_ctx = 0;
   const int try_palette =
-      cpi->common.allow_screen_content_tools && bsize >= BLOCK_8X8;
-#endif  // CONFIG_PALETTE
+      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
+  uint8_t *best_palette_color_map =
+      try_palette ? x->palette_buffer->best_palette_color_map : NULL;
   const MODE_INFO *above_mi = xd->above_mi;
   const MODE_INFO *left_mi = xd->left_mi;
   const PREDICTION_MODE A = av1_above_block_mode(mic, above_mi, 0);
@@ -4085,7 +4293,14 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   od_encode_checkpoint(&x->daala_enc, &pre_buf);
   od_encode_checkpoint(&x->daala_enc, &post_buf);
 #endif  // CONFIG_PVQ
-  bmode_costs = cpi->y_mode_costs[A][L];
+
+#if CONFIG_KF_CTX
+  const int above_ctx = intra_mode_context[A];
+  const int left_ctx = intra_mode_context[L];
+  bmode_costs = x->y_mode_costs[above_ctx][left_ctx];
+#else
+  bmode_costs = x->y_mode_costs[A][L];
+#endif
 
 #if CONFIG_EXT_INTRA
   mbmi->angle_delta[0] = 0;
@@ -4101,14 +4316,17 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
 #if CONFIG_FILTER_INTRA
   mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
 #endif  // CONFIG_FILTER_INTRA
-#if CONFIG_PALETTE
   pmi->palette_size[0] = 0;
-  if (above_mi)
-    palette_y_mode_ctx +=
-        (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-  if (left_mi)
-    palette_y_mode_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-#endif  // CONFIG_PALETTE
+  if (try_palette) {
+    if (above_mi) {
+      palette_y_mode_ctx +=
+          (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+    }
+    if (left_mi) {
+      palette_y_mode_ctx +=
+          (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+    }
+  }
 
   if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
     x->use_default_intra_tx_type = 1;
@@ -4160,21 +4378,20 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
 
     this_rate = this_rate_tokenonly + bmode_costs[mbmi->mode];
 
-    if (!xd->lossless[mbmi->segment_id] && mbmi->sb_type >= BLOCK_8X8) {
+    if (!xd->lossless[mbmi->segment_id] &&
+        block_signals_txsize(mbmi->sb_type)) {
       // super_block_yrd above includes the cost of the tx_size in the
       // tokenonly rate, but for intra blocks, tx_size is always coded
       // (prediction granularity), so we account for it in the full rate,
       // not the tokenonly rate.
       this_rate_tokenonly -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
     }
-#if CONFIG_PALETTE
     if (try_palette && mbmi->mode == DC_PRED) {
       this_rate +=
           av1_cost_bit(av1_default_palette_y_mode_prob[bsize - BLOCK_8X8]
                                                       [palette_y_mode_ctx],
                        0);
     }
-#endif  // CONFIG_PALETTE
 #if CONFIG_FILTER_INTRA
     if (mbmi->mode == DC_PRED)
       this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 0);
@@ -4185,8 +4402,7 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
       const int p_angle =
           mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
       if (av1_is_intra_filter_switchable(p_angle))
-        this_rate +=
-            cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
+        this_rate += x->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
 #endif  // CONFIG_INTRA_INTERP
       if (av1_use_angle_delta(bsize)) {
         this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
@@ -4194,6 +4410,10 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
       }
     }
 #endif  // CONFIG_EXT_INTRA
+#if CONFIG_INTRABC
+    if (bsize >= BLOCK_8X8 && cpi->common.allow_screen_content_tools)
+      this_rate += x->intrabc_cost[0];
+#endif  // CONFIG_INTRABC
     this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
 #if CONFIG_FILTER_INTRA
     if (best_rd == INT64_MAX || this_rd - best_rd < (best_rd >> 4)) {
@@ -4221,14 +4441,12 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   od_encode_rollback(&x->daala_enc, &post_buf);
 #endif  // CONFIG_PVQ
 
-#if CONFIG_PALETTE
   if (try_palette) {
     rd_pick_palette_intra_sby(cpi, x, bsize, palette_y_mode_ctx,
                               bmode_costs[DC_PRED], &best_mbmi,
                               best_palette_color_map, &best_rd, &best_model_rd,
                               rate, rate_tokenonly, distortion, skippable);
   }
-#endif  // CONFIG_PALETTE
 
 #if CONFIG_FILTER_INTRA
   if (beat_best_rd) {
@@ -4317,6 +4535,9 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
 
   int64_t tmp;
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+  uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block);
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
   PLANE_TYPE plane_type = get_plane_type(plane);
   TX_TYPE tx_type =
       av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
@@ -4346,6 +4567,22 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
 
   int coeff_ctx = get_entropy_context(tx_size, a, l);
 
+  tmp = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col,
+                        plane_bsize, txm_bsize);
+
+#if CONFIG_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
+#endif  // CONFIG_HIGHBITDEPTH
+  rd_stats->sse += tmp << 4;
+
+  if (rd_stats->invalid_rate) {
+    rd_stats->dist += tmp << 4;
+    rd_stats->rate += rd_stats->zero_rate;
+    rd_stats->skip = 1;
+    return;
+  }
+
 // TODO(any): Use av1_dist_block to compute distortion
 #if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -4373,43 +4610,59 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
   const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   const int buffer_length = tx_size_2d[tx_size];
-  int64_t tmp_dist;
+  int64_t tmp_dist, tmp_sse;
+#if CONFIG_DIST_8X8
+  int disable_early_skip =
+      x->using_dist_8x8 && plane == 0 && plane_bsize >= BLOCK_8X8 &&
+      (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) &&
+      x->tune_metric != AOM_TUNE_PSNR;
+#endif  // CONFIG_DIST_8X8
+
 #if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
     tmp_dist =
-        av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd) >>
-        shift;
+        av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp_sse, xd->bd);
   else
 #endif
-    tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp) >> shift;
+    tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp_sse);
+
+  tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift);
 
-  if (RDCOST(x->rdmult, 0, tmp_dist) < rd_stats->ref_rdcost) {
+#if CONFIG_MRC_TX
+  if (tx_type == MRC_DCT && !xd->mi[0]->mbmi.valid_mrc_mask) {
+    av1_invalid_rd_stats(rd_stats);
+    return;
+  }
+#endif  // CONFIG_MRC_TX
+  if (
+#if CONFIG_DIST_8X8
+      disable_early_skip ||
+#endif
+      RDCOST(x->rdmult, 0, tmp_dist) < rd_stats->ref_rdcost) {
     av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
-                   a, l);
+                   a, l, 1);
+  } else {
+    rd_stats->rate += rd_stats->zero_rate;
+    rd_stats->dist += tmp << 4;
+    rd_stats->skip = 1;
+    rd_stats->invalid_rate = 1;
+    return;
   }
 #endif  // DISABLE_TRELLISQ_SEARCH
 
-  tmp = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col,
-                        plane_bsize, txm_bsize);
-
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
-#endif  // CONFIG_HIGHBITDEPTH
-  rd_stats->sse += tmp * 16;
   const int eob = p->eobs[block];
 
-#if CONFIG_LGT
-  PREDICTION_MODE mode = get_prediction_mode(xd->mi[0], plane, tx_size, block);
-  av1_inverse_transform_block(xd, dqcoeff, mode, tx_type, tx_size, rec_buffer,
-                              MAX_TX_SIZE, eob);
-#else
-  av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, rec_buffer,
-                              MAX_TX_SIZE, eob);
+  av1_inverse_transform_block(xd, dqcoeff,
+#if CONFIG_LGT_FROM_PRED
+                              xd->mi[0]->mbmi.mode,
 #endif
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                              mrc_mask,
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+                              tx_type, tx_size, rec_buffer, MAX_TX_SIZE, eob);
   if (eob > 0) {
 #if CONFIG_DIST_8X8
-    if (plane == 0 && (bw < 8 && bh < 8)) {
+    if (x->using_dist_8x8 && plane == 0 && (bw < 8 && bh < 8)) {
       // Save sub8x8 luma decoded pixels
       // since 8x8 luma decoded pixels are not available for daala-dist
       // after recursive split of BLOCK_8x8 is done.
@@ -4451,12 +4704,12 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
 }
 
 static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
-                            int blk_col, int plane, int block, int block32,
-                            TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
+                            int blk_col, int plane, int block, TX_SIZE tx_size,
+                            int depth, BLOCK_SIZE plane_bsize,
                             ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
                             TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
                             RD_STATS *rd_stats, int64_t ref_best_rd,
-                            int *is_cost_valid, RD_STATS *rd_stats_stack) {
+                            int *is_cost_valid) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   struct macroblock_plane *const p = &x->plane[plane];
@@ -4519,32 +4772,28 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
   TX_SIZE txs_ctx = get_txsize_context(tx_size);
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, plane, pta, ptl, &txb_ctx);
+
+#if LV_MAP_PROB
+  zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(plane)]
+                      .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+#else
   zero_blk_rate =
       av1_cost_bit(xd->fc->txb_skip[txs_ctx][txb_ctx.txb_skip_ctx], 1);
+#endif  // LV_MAP_PROB
 #else
-  int tx_size_ctx = txsize_sqr_map[tx_size];
+  TX_SIZE tx_size_ctx = txsize_sqr_map[tx_size];
   int coeff_ctx = get_entropy_context(tx_size, pta, ptl);
-  zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0]
-                                [coeff_ctx][EOB_TOKEN];
+  zero_blk_rate =
+      x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0];
 #endif
 
   rd_stats->ref_rdcost = ref_best_rd;
   rd_stats->zero_rate = zero_blk_rate;
   if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
     inter_tx_size[0][0] = tx_size;
-
-    if (tx_size == TX_32X32 && mbmi->tx_type != DCT_DCT &&
-        rd_stats_stack[block32].rate != INT_MAX) {
-      *rd_stats = rd_stats_stack[block32];
-      p->eobs[block] = !rd_stats->skip;
-      x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip;
-    } else {
-      av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
-                        plane_bsize, pta, ptl, rd_stats);
-      if (tx_size == TX_32X32) {
-        rd_stats_stack[block32] = *rd_stats;
-      }
-    }
+    av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
+                      plane_bsize, pta, ptl, rd_stats);
+    if (rd_stats->rate == INT_MAX) return;
 
     if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
              RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
@@ -4599,11 +4848,12 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
 
       av1_tx_block_rd_b(cpi, x, quarter_txsize, 0, 0, plane, 0, plane_bsize,
                         pta, ptl, &rd_stats_qttx);
+      if (rd_stats->rate == INT_MAX) return;
 
       tx_size_ctx = txsize_sqr_map[quarter_txsize];
       coeff_ctx = get_entropy_context(quarter_txsize, pta, ptl);
-      zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0]
-                                    [coeff_ctx][EOB_TOKEN];
+      zero_blk_rate =
+          x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0];
       if ((RDCOST(x->rdmult, rd_stats_qttx.rate, rd_stats_qttx.dist) >=
                RDCOST(x->rdmult, zero_blk_rate, rd_stats_qttx.sse) ||
            rd_stats_qttx.skip == 1) &&
@@ -4629,11 +4879,15 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
                         plane, block_offset_qttx, plane_bsize, pta, ptl,
                         &rd_stats_tmp);
 
+      if (rd_stats->rate == INT_MAX) return;
+
+#if !CONFIG_PVQ
       av1_set_txb_context(x, plane, 0, quarter_txsize, pta, ptl);
+#endif  // !CONFIG_PVQ
       coeff_ctx = get_entropy_context(quarter_txsize, pta + blk_col_offset,
                                       ptl + blk_row_offset);
-      zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0]
-                                    [coeff_ctx][EOB_TOKEN];
+      zero_blk_rate =
+          x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0];
       if ((RDCOST(x->rdmult, rd_stats_tmp.rate, rd_stats_tmp.dist) >=
                RDCOST(x->rdmult, zero_blk_rate, rd_stats_tmp.sse) ||
            rd_stats_tmp.skip == 1) &&
@@ -4684,13 +4938,13 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
 #endif
   }
 
+  if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH
 #if CONFIG_MRC_TX
-  // If the tx type we are trying is MRC_DCT, we cannot partition the transform
-  // into anything smaller than TX_32X32
-  if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH && mbmi->tx_type != MRC_DCT) {
-#else
-  if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) {
-#endif
+      // If the tx type we are trying is MRC_DCT, we cannot partition the
+      // transform into anything smaller than TX_32X32
+      && mbmi->tx_type != MRC_DCT
+#endif  // CONFIG_MRC_TX
+      ) {
     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
     const int bsl = tx_size_wide_unit[sub_txs];
     int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
@@ -4713,25 +4967,26 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
 
       if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
 
-      select_tx_block(cpi, x, offsetr, offsetc, plane, block, block32, sub_txs,
+      select_tx_block(cpi, x, offsetr, offsetc, plane, block, sub_txs,
                       depth + 1, plane_bsize, ta, tl, tx_above, tx_left,
-                      &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid,
-                      rd_stats_stack);
+                      &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid);
 #if CONFIG_DIST_8X8
-      if (plane == 0 && tx_size == TX_8X8) {
+      if (x->using_dist_8x8 && plane == 0 && tx_size == TX_8X8) {
         sub8x8_eob[i] = p->eobs[block];
       }
 #endif  // CONFIG_DIST_8X8
       av1_merge_rd_stats(&sum_rd_stats, &this_rd_stats);
 
       tmp_rd = RDCOST(x->rdmult, sum_rd_stats.rate, sum_rd_stats.dist);
-#if !CONFIG_DIST_8X8
-      if (this_rd < tmp_rd) break;
+#if CONFIG_DIST_8X8
+      if (!x->using_dist_8x8)
 #endif
+        if (this_rd < tmp_rd) break;
       block += sub_step;
     }
 #if CONFIG_DIST_8X8
-    if (this_cost_valid && plane == 0 && tx_size == TX_8X8) {
+    if (x->using_dist_8x8 && this_cost_valid && plane == 0 &&
+        tx_size == TX_8X8) {
       const int src_stride = p->src.stride;
       const int dst_stride = pd->dst.stride;
 
@@ -4757,7 +5012,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
       DECLARE_ALIGNED(16, uint8_t, pred8[8 * 8]);
 #endif  // CONFIG_HIGHBITDEPTH
 
-      dist_8x8 = av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride,
+      dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
                               BLOCK_8X8, 8, 8, 8, 8, qindex) *
                  16;
       sum_rd_stats.sse = dist_8x8;
@@ -4802,7 +5057,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
               for (j = 0; j < 4; j++)
                 for (i = 0; i < 4; i++)
                   pred8[(row * 4 + j) * 8 + 4 * col + i] =
-                      pred[(row * 4 + j) * pred_stride + 4 * col + i];
+                      (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i];
             } else {
               for (j = 0; j < 4; j++)
                 for (i = 0; i < 4; i++)
@@ -4814,7 +5069,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
 #if CONFIG_HIGHBITDEPTH
       }
 #endif  // CONFIG_HIGHBITDEPTH
-      dist_8x8 = av1_dist_8x8(cpi, xd, src, src_stride, pred8, 8, BLOCK_8X8, 8,
+      dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8,
                               8, 8, 8, qindex) *
                  16;
       sum_rd_stats.dist = dist_8x8;
@@ -4853,12 +5108,14 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
     }
 #endif
 
+#if !CONFIG_PVQ
     av1_set_txb_context(x, plane, block, tx_size_selected, pta, ptl);
 #if CONFIG_RECT_TX_EXT
     if (is_qttx_picked)
       av1_set_txb_context(x, plane, block_offset_qttx, tx_size_selected,
                           pta + blk_col_offset, ptl + blk_row_offset);
-#endif
+#endif  // CONFIG_RECT_TX_EXT
+#endif  // !CONFIG_PVQ
 
     txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
                           tx_size);
@@ -4889,7 +5146,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
 
 static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
                             RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                            int64_t ref_best_rd, RD_STATS *rd_stats_stack) {
+                            int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
   int is_cost_valid = 1;
   int64_t this_rd = 0;
@@ -4908,7 +5165,8 @@ static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
     const int bw = tx_size_wide_unit[max_tx_size];
     int idx, idy;
     int block = 0;
-    int block32 = 0;
+    int init_depth =
+        (mi_height != mi_width) ? RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT;
     int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
     ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
     ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
@@ -4924,15 +5182,17 @@ static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
 
     for (idy = 0; idy < mi_height; idy += bh) {
       for (idx = 0; idx < mi_width; idx += bw) {
-        select_tx_block(cpi, x, idy, idx, 0, block, block32, max_tx_size,
-                        mi_height != mi_width, plane_bsize, ctxa, ctxl,
-                        tx_above, tx_left, &pn_rd_stats, ref_best_rd - this_rd,
-                        &is_cost_valid, rd_stats_stack);
+        select_tx_block(cpi, x, idy, idx, 0, block, max_tx_size, init_depth,
+                        plane_bsize, ctxa, ctxl, tx_above, tx_left,
+                        &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid);
+        if (pn_rd_stats.rate == INT_MAX) {
+          av1_invalid_rd_stats(rd_stats);
+          return;
+        }
         av1_merge_rd_stats(rd_stats, &pn_rd_stats);
         this_rd += AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
                           RDCOST(x->rdmult, 0, pn_rd_stats.sse));
         block += step;
-        ++block32;
       }
     }
   }
@@ -4949,8 +5209,7 @@ static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
 
 static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
                                        RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                                       int64_t ref_best_rd, TX_TYPE tx_type,
-                                       RD_STATS *rd_stats_stack) {
+                                       int64_t ref_best_rd, TX_TYPE tx_type) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
@@ -4964,7 +5223,7 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
   const int max_blocks_wide = max_block_wide(xd, bsize, 0);
 
   mbmi->tx_type = tx_type;
-  inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, rd_stats_stack);
+  inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd);
   mbmi->min_tx_size = get_min_tx_size(mbmi->inter_tx_size[0][0]);
 
   if (rd_stats->rate == INT_MAX) return INT64_MAX;
@@ -4981,23 +5240,37 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
       !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
     const int ext_tx_set = get_ext_tx_set(mbmi->min_tx_size, bsize, is_inter,
                                           cm->reduced_tx_set_used);
-    if (is_inter) {
-      if (ext_tx_set > 0)
+#if CONFIG_LGT_FROM_PRED
+    if (is_lgt_allowed(mbmi->mode, mbmi->min_tx_size)) {
+      if (LGT_FROM_PRED_INTRA && !is_inter && ext_tx_set > 0 &&
+          ALLOW_INTRA_EXT_TX)
+        rd_stats->rate += x->intra_lgt_cost[txsize_sqr_map[mbmi->min_tx_size]]
+                                           [mbmi->mode][mbmi->use_lgt];
+      if (LGT_FROM_PRED_INTER && is_inter && ext_tx_set > 0)
         rd_stats->rate +=
-            cpi->inter_tx_type_costs[ext_tx_set]
+            x->inter_lgt_cost[txsize_sqr_map[mbmi->min_tx_size]][mbmi->use_lgt];
+    }
+    if (!mbmi->use_lgt) {
+#endif  // CONFIG_LGT_FROM_PRED
+      if (is_inter) {
+        if (ext_tx_set > 0)
+          rd_stats->rate +=
+              x->inter_tx_type_costs[ext_tx_set]
                                     [txsize_sqr_map[mbmi->min_tx_size]]
                                     [mbmi->tx_type];
-    } else {
-      if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
-        rd_stats->rate +=
-            cpi->intra_tx_type_costs[ext_tx_set][mbmi->min_tx_size][mbmi->mode]
+      } else {
+        if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+          rd_stats->rate +=
+              x->intra_tx_type_costs[ext_tx_set][mbmi->min_tx_size][mbmi->mode]
                                     [mbmi->tx_type];
+      }
     }
+#if CONFIG_LGT_FROM_PRED
   }
+#endif
 #else
   if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id])
-    rd_stats->rate +=
-        cpi->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type];
+    rd_stats->rate += x->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type];
 #endif  // CONFIG_EXT_TX
 #endif  // CONFIG_TXK_SEL
 
@@ -5013,6 +5286,162 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
   return rd;
 }
 
+// Hashes the plane-0 residual of the current block, tagged with its size.
+static uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  const int height = block_size_high[bsize];
+  const int width = block_size_wide[bsize];
+  const int16_t *src_row = x->plane[0].src_diff;
+  uint8_t hash_data[MAX_SB_SQUARE];
+  uint8_t *out = hash_data;
+  // Each residual is biased by 128 and passed through clip_pixel() so it can
+  // be stored as one byte; source rows are |width| samples apart.
+  for (int r = 0; r < height; ++r, src_row += width) {
+    for (int c = 0; c < width; ++c) *out++ = clip_pixel(src_row[c] + 128);
+  }
+  // The CRC is shifted up by 7 bits and bsize is added into the freed bits.
+  return (av1_get_crc_value(&x->tx_rd_record.crc_calculator, hash_data,
+                            height * width)
+          << 7) +
+         bsize;
+}
+
+// Stores mbmi's transform choices, blk_skip and |rd_stats| in |tx_rd_info|.
+static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x,
+                            const RD_STATS *const rd_stats,
+                            TX_RD_INFO *const tx_rd_info) {
+  const MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
+  tx_rd_info->hash_value = hash;
+  tx_rd_info->tx_size = mbmi->tx_size;
+  tx_rd_info->tx_type = mbmi->tx_type;
+#if CONFIG_TXK_SEL
+  av1_copy(tx_rd_info->txk_type, mbmi->txk_type);
+#endif  // CONFIG_TXK_SEL
+#if CONFIG_VAR_TX
+  tx_rd_info->min_tx_size = mbmi->min_tx_size;
+  for (int row = 0; row < x->e_mbd.n8_h; ++row)
+    for (int col = 0; col < x->e_mbd.n8_w; ++col)
+      tx_rd_info->inter_tx_size[row][col] = mbmi->inter_tx_size[row][col];
+  memcpy(tx_rd_info->blk_skip, x->blk_skip[0],
+         sizeof(tx_rd_info->blk_skip[0]) * n4);
+#endif  // CONFIG_VAR_TX
+  tx_rd_info->rd_stats = *rd_stats;
+}
+
+// Restores mbmi's transform choices, blk_skip and |rd_stats| from |tx_rd_info|.
+static void fetch_tx_rd_info(int n4, const TX_RD_INFO *const tx_rd_info,
+                             RD_STATS *const rd_stats, MACROBLOCK *const x) {
+  MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
+  mbmi->tx_size = tx_rd_info->tx_size;
+  mbmi->tx_type = tx_rd_info->tx_type;
+#if CONFIG_TXK_SEL
+  av1_copy(mbmi->txk_type, tx_rd_info->txk_type);
+#endif  // CONFIG_TXK_SEL
+#if CONFIG_VAR_TX
+  mbmi->min_tx_size = tx_rd_info->min_tx_size;
+  for (int row = 0; row < x->e_mbd.n8_h; ++row)
+    for (int col = 0; col < x->e_mbd.n8_w; ++col)
+      mbmi->inter_tx_size[row][col] = tx_rd_info->inter_tx_size[row][col];
+  memcpy(x->blk_skip[0], tx_rd_info->blk_skip,
+         sizeof(tx_rd_info->blk_skip[0]) * n4);
+#endif  // CONFIG_VAR_TX
+  *rd_stats = tx_rd_info->rd_stats;
+}
+
+// Returns 1 if skipping the residual is predicted to be the best RD decision:
+// one forward DCT of plane 0, largest coef / quantizer compared to a threshold.
+static int predict_skip_flag_8bit(const MACROBLOCK *x, BLOCK_SIZE bsize) {
+  if (bsize > BLOCK_16X16) return 0;
+  // Tuned for target false-positive rate of 5% for all block sizes:
+  const uint32_t threshold_table[] = { 50, 50, 50, 55, 47, 47, 53, 22, 22, 37 };
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int num_coefs = bw * bh;
+  tran_low_t DCT_coefs[32 * 32];
+  TxfmParam param;
+  param.tx_type = DCT_DCT;
+#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+  param.tx_size = max_txsize_rect_lookup[bsize];
+#else
+  param.tx_size = max_txsize_lookup[bsize];
+#endif
+  param.bd = 8;
+  param.lossless = 0;
+  av1_fwd_txfm(x->plane[0].src_diff, DCT_coefs, bw, &param);
+
+  // Scale |coef| by 100 / quantizer: DC quantizer for index 0, AC for the rest.
+  const uint32_t dc_q = (uint32_t)av1_dc_quant(x->qindex, 0, AOM_BITS_8);
+  const uint32_t ac_q = (uint32_t)av1_ac_quant(x->qindex, 0, AOM_BITS_8);
+  uint32_t peak = (100 * (uint32_t)abs(DCT_coefs[0])) / dc_q;
+  for (int i = 1; i < num_coefs; i++) {
+    const uint32_t scaled = (100 * (uint32_t)abs(DCT_coefs[i])) / ac_q;
+    if (scaled > peak) peak = scaled;
+  }
+
+  return peak < threshold_table[AOMMAX(bsize - BLOCK_4X4, 0)];
+}
+
+// Sets mbmi, blk_skip and rd_stats for early termination with skip = 1.
+static void set_skip_flag(const AV1_COMP *cpi, MACROBLOCK *x,
+                          RD_STATS *rd_stats, int bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const int n4 = bsize_to_num_blk(bsize);
+#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+  const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+#else
+  const TX_SIZE tx_size = max_txsize_lookup[bsize];
+#endif
+  mbmi->tx_type = DCT_DCT;
+  for (int idy = 0; idy < xd->n8_h; ++idy)
+    for (int idx = 0; idx < xd->n8_w; ++idx)
+      mbmi->inter_tx_size[idy][idx] = tx_size;
+  mbmi->tx_size = tx_size;
+  mbmi->min_tx_size = get_min_tx_size(tx_size);
+  memset(x->blk_skip[0], 1, sizeof(uint8_t) * n4);
+  rd_stats->skip = 1;
+
+  // Rate: built up in the local |rate| and stored into rd_stats->rate below.
+  const int tx_size_ctx = txsize_sqr_map[tx_size];
+  ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+  ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
+  av1_get_entropy_contexts(bsize, 0, &xd->plane[0], ctxa, ctxl);
+  int coeff_ctx = get_entropy_context(tx_size, ctxa, ctxl);
+  int rate = x->token_head_costs[tx_size_ctx][PLANE_TYPE_Y][1][0][coeff_ctx][0];
+  if (tx_size > TX_4X4) {
+    int ctx = txfm_partition_context(
+        xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size);
+    rate += av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
+  }
+#if !CONFIG_TXK_SEL
+#if CONFIG_EXT_TX
+  const AV1_COMMON *cm = &cpi->common;
+  const int ext_tx_set = get_ext_tx_set(max_txsize_lookup[bsize], bsize, 1,
+                                        cm->reduced_tx_set_used);
+  if (get_ext_tx_types(mbmi->min_tx_size, bsize, 1, cm->reduced_tx_set_used) >
+          1 &&
+      !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+    if (ext_tx_set > 0)
+      rate +=
+          x->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[mbmi->min_tx_size]]
+                                [mbmi->tx_type];
+  }
+#else
+  if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id])
+    rate += x->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type];
+#endif  // CONFIG_EXT_TX
+#endif  // CONFIG_TXK_SEL
+  rd_stats->rate = rate;
+
+  // Distortion: with nothing coded, dist equals sse (pixel diff, scaled by 16).
+  int64_t tmp = pixel_diff_dist(x, 0, x->plane[0].src_diff,
+                                block_size_wide[bsize], 0, 0, bsize, bsize);
+#if CONFIG_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
+#endif  // CONFIG_HIGHBITDEPTH
+  rd_stats->dist = rd_stats->sse = (tmp << 4);
+}
+
 static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
                                RD_STATS *rd_stats, BLOCK_SIZE bsize,
                                int64_t ref_best_rd) {
@@ -5037,18 +5466,52 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
   const int n4 = bsize_to_num_blk(bsize);
   int idx, idy;
   int prune = 0;
-  const int count32 =
-      1 << (2 * (cm->mib_size_log2 - mi_width_log2_lookup[BLOCK_32X32]));
-#if CONFIG_EXT_PARTITION
-  RD_STATS rd_stats_stack[16];
-#else
-  RD_STATS rd_stats_stack[4];
-#endif  // CONFIG_EXT_PARTITION
 #if CONFIG_EXT_TX
+  const TxSetType tx_set_type = get_ext_tx_set_type(
+      max_tx_size, bsize, is_inter, cm->reduced_tx_set_used);
   const int ext_tx_set =
       get_ext_tx_set(max_tx_size, bsize, is_inter, cm->reduced_tx_set_used);
 #endif  // CONFIG_EXT_TX
 
+  av1_invalid_rd_stats(rd_stats);
+
+#if CONFIG_LGT_FROM_PRED
+  mbmi->use_lgt = 0;
+  int search_lgt = is_inter
+                       ? LGT_FROM_PRED_INTER &&
+                             (!cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
+                       : LGT_FROM_PRED_INTRA && ALLOW_INTRA_EXT_TX;
+#endif  // CONFIG_LGT_FROM_PRED
+
+  const uint32_t hash = get_block_residue_hash(x, bsize);
+  TX_RD_RECORD *tx_rd_record = &x->tx_rd_record;
+
+  if (ref_best_rd != INT64_MAX) {
+    for (int i = 0; i < tx_rd_record->num; ++i) {
+      const int index = (tx_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
+      // If there is a match in the tx_rd_record, fetch the RD decision and
+      // terminate early.
+      if (tx_rd_record->tx_rd_info[index].hash_value == hash) {
+        TX_RD_INFO *tx_rd_info = &tx_rd_record->tx_rd_info[index];
+        fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
+        return;
+      }
+    }
+  }
+
+// If we predict that skip is the optimal RD decision - set the respective
+// context and terminate early.
+#if CONFIG_HIGHBITDEPTH
+  if (!(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH))
+#endif  // CONFIG_HIGHBITDEPTH
+  {
+    if (is_inter && cpi->sf.tx_type_search.use_skip_flag_prediction &&
+        predict_skip_flag_8bit(x, bsize)) {
+      set_skip_flag(cpi, x, rd_stats, bsize);
+      return;
+    }
+  }
+
   if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
 #if CONFIG_EXT_TX
     prune = prune_tx_types(cpi, bsize, x, xd, ext_tx_set);
@@ -5056,10 +5519,7 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
     prune = prune_tx_types(cpi, bsize, x, xd, 0);
 #endif  // CONFIG_EXT_TX
 
-  av1_invalid_rd_stats(rd_stats);
-
-  for (idx = 0; idx < count32; ++idx)
-    av1_invalid_rd_stats(&rd_stats_stack[idx]);
+  int found = 0;
 
   for (tx_type = txk_start; tx_type < txk_end; ++tx_type) {
     RD_STATS this_rd_stats;
@@ -5067,11 +5527,14 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
 #if CONFIG_MRC_TX
     // MRC_DCT only implemented for TX_32X32 so only include this tx in
     // the search for TX_32X32
-    if (tx_type == MRC_DCT && max_tx_size != TX_32X32) continue;
+    if (tx_type == MRC_DCT &&
+        (max_tx_size != TX_32X32 || (is_inter && !USE_MRC_INTER) ||
+         (!is_inter && !USE_MRC_INTRA)))
+      continue;
 #endif  // CONFIG_MRC_TX
 #if CONFIG_EXT_TX
+    if (!av1_ext_tx_used[tx_set_type][tx_type]) continue;
     if (is_inter) {
-      if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
       if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
         if (!do_tx_type_search(tx_type, prune)) continue;
       }
@@ -5079,7 +5542,6 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
       if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8) {
         if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
       }
-      if (!ext_tx_used_intra[ext_tx_set][tx_type]) continue;
     }
 #else   // CONFIG_EXT_TX
     if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
@@ -5094,8 +5556,8 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
       if (tx_type != DCT_DCT) continue;
 
     rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
-                                 tx_type, rd_stats_stack);
-
+                                 tx_type);
+    ref_best_rd = AOMMIN(rd, ref_best_rd);
     if (rd < best_rd) {
       best_rd = rd;
       *rd_stats = this_rd_stats;
@@ -5103,12 +5565,41 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
       best_tx = mbmi->tx_size;
       best_min_tx_size = mbmi->min_tx_size;
       memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
+      found = 1;
       for (idy = 0; idy < xd->n8_h; ++idy)
         for (idx = 0; idx < xd->n8_w; ++idx)
           best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
     }
   }
 
+  // We should always find at least one candidate unless ref_best_rd is less
+  // than INT64_MAX (in which case, all the calls to select_tx_size_fix_type
+  // might have failed to find something better)
+  assert(IMPLIES(!found, ref_best_rd != INT64_MAX));
+  if (!found) return;
+
+#if CONFIG_LGT_FROM_PRED
+  if (search_lgt && is_lgt_allowed(mbmi->mode, max_tx_size) &&
+      !cm->reduced_tx_set_used) {
+    RD_STATS this_rd_stats;
+    mbmi->use_lgt = 1;
+    rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd, 0);
+    if (rd < best_rd) {
+      best_rd = rd;
+      *rd_stats = this_rd_stats;
+      best_tx = mbmi->tx_size;
+      best_min_tx_size = mbmi->min_tx_size;
+      memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
+      for (idy = 0; idy < xd->n8_h; ++idy)
+        for (idx = 0; idx < xd->n8_w; ++idx)
+          best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
+    } else {
+      mbmi->use_lgt = 0;
+    }
+  }
+#endif  // CONFIG_LGT_FROM_PRED
+  // We found a candidate transform to use. Copy our results from the "best"
+  // array into mbmi.
   mbmi->tx_type = best_tx_type;
   for (idy = 0; idy < xd->n8_h; ++idy)
     for (idx = 0; idx < xd->n8_w; ++idx)
@@ -5116,6 +5607,19 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
   mbmi->tx_size = best_tx;
   mbmi->min_tx_size = best_min_tx_size;
   memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+
+  // Save the RD search results into tx_rd_record.
+  int index;
+  if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
+    index =
+        (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
+    ++tx_rd_record->num;
+  } else {
+    index = tx_rd_record->index_start;
+    tx_rd_record->index_start =
+        (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
+  }
+  save_tx_rd_info(n4, hash, x, rd_stats, &tx_rd_record->tx_rd_info[index]);
 }
 
 static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
@@ -5145,7 +5649,9 @@ static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
     ENTROPY_CONTEXT *tl = left_ctx + blk_row;
     av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
                       plane_bsize, ta, tl, rd_stats);
+#if !CONFIG_PVQ
     av1_set_txb_context(x, plane, block, tx_size, ta, tl);
+#endif  // !CONFIG_PVQ
   } else {
     const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
     const int bsl = tx_size_wide_unit[sub_txs];
@@ -5250,7 +5756,6 @@ static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
 }
 #endif  // CONFIG_VAR_TX
 
-#if CONFIG_PALETTE
 static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
                                        int dc_mode_cost,
                                        uint8_t *best_palette_color_map,
@@ -5263,6 +5768,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
   assert(!is_inter_block(mbmi));
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const BLOCK_SIZE bsize = mbmi->sb_type;
+  assert(bsize >= BLOCK_8X8);
   int this_rate;
   int64_t this_rd;
   int colors_u, colors_v, colors;
@@ -5296,17 +5802,14 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
 #endif  // CONFIG_HIGHBITDEPTH
 
 #if CONFIG_PALETTE_DELTA_ENCODING
-  const MODE_INFO *above_mi = xd->above_mi;
-  const MODE_INFO *left_mi = xd->left_mi;
   uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-  const int n_cache = av1_get_palette_cache(above_mi, left_mi, 1, color_cache);
+  const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
 #endif  // CONFIG_PALETTE_DELTA_ENCODING
 
   colors = colors_u > colors_v ? colors_u : colors_v;
   if (colors > 1 && colors <= 64) {
     int r, c, n, i, j;
     const int max_itr = 50;
-    uint8_t color_order[PALETTE_MAX_SIZE];
     float lb_u, ub_u, val_u;
     float lb_v, ub_v, val_v;
     float *const data = x->palette_buffer->kmeans_data_buf;
@@ -5402,7 +5905,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
       if (tokenonly_rd_stats.rate == INT_MAX) continue;
       this_rate =
           tokenonly_rd_stats.rate + dc_mode_cost +
-          cpi->palette_uv_size_cost[bsize - BLOCK_8X8][n - PALETTE_MIN_SIZE] +
+          x->palette_uv_size_cost[bsize - BLOCK_8X8][n - PALETTE_MIN_SIZE] +
           write_uniform_cost(n, color_map[0]) +
           av1_cost_bit(
               av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 1);
@@ -5411,17 +5914,8 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
                                              color_cache, n_cache,
 #endif  // CONFIG_PALETTE_DELTA_ENCODING
                                              cpi->common.bit_depth);
-      for (i = 0; i < rows; ++i) {
-        for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
-          int color_idx;
-          const int color_ctx = av1_get_palette_color_index_context(
-              color_map, plane_block_width, i, j, n, color_order, &color_idx);
-          assert(color_idx >= 0 && color_idx < n);
-          this_rate += cpi->palette_uv_color_cost[n - PALETTE_MIN_SIZE]
-                                                 [color_ctx][color_idx];
-        }
-      }
-
+      this_rate +=
+          av1_cost_color_map(x, 1, 0, bsize, mbmi->tx_size, PALETTE_MAP);
       this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
       if (this_rd < *best_rd) {
         *best_rd = this_rd;
@@ -5438,10 +5932,10 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
   }
   if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
     memcpy(color_map, best_palette_color_map,
-           rows * cols * sizeof(best_palette_color_map[0]));
+           plane_block_width * plane_block_height *
+               sizeof(best_palette_color_map[0]));
   }
 }
-#endif  // CONFIG_PALETTE
 
 #if CONFIG_FILTER_INTRA
 // Return 1 if an filter intra mode is selected; return 0 otherwise.
@@ -5461,9 +5955,7 @@ static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
   av1_zero(filter_intra_mode_info);
   mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 1;
   mbmi->uv_mode = UV_DC_PRED;
-#if CONFIG_PALETTE
   mbmi->palette_mode_info.palette_size[1] = 0;
-#endif  // CONFIG_PALETTE
 
   for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
     mbmi->filter_intra_mode_info.filter_intra_mode[1] = mode;
@@ -5472,7 +5964,7 @@ static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
 
     this_rate = tokenonly_rd_stats.rate +
                 av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 1) +
-                cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] +
+                x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] +
                 write_uniform_cost(FILTER_INTRA_MODES, mode);
     this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
     if (this_rd < *best_rd) {
@@ -5586,11 +6078,10 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
 #endif  // CONFIG_EXT_INTRA
 
 #if CONFIG_CFL
-static int64_t cfl_alpha_dist(const uint8_t *y_pix, int y_stride,
-                              const int y_averages_q3[MAX_NUM_TXB],
-                              const uint8_t *src, int src_stride, int width,
-                              int height, TX_SIZE tx_size, int dc_pred,
-                              int alpha_q3, int64_t *dist_neg_out) {
+static int64_t cfl_alpha_dist_lbd(const int16_t *pred_buf_q3,
+                                  const uint8_t *src, int src_stride, int width,
+                                  int height, int dc_pred, int alpha_q3,
+                                  int64_t *dist_neg_out) {
   int64_t dist = 0;
   int diff;
 
@@ -5609,63 +6100,87 @@ static int64_t cfl_alpha_dist(const uint8_t *y_pix, int y_stride,
   }
 
   int64_t dist_neg = 0;
-  const int tx_height = tx_size_high[tx_size];
-  const int tx_width = tx_size_wide[tx_size];
-  const int y_block_row_off = y_stride * tx_height;
-  const int src_block_row_off = src_stride * tx_height;
-  const uint8_t *t_y_pix;
-  const uint8_t *t_src;
-  int a = 0;
-  for (int b_j = 0; b_j < height; b_j += tx_height) {
-    const int h = b_j + tx_height;
-    for (int b_i = 0; b_i < width; b_i += tx_width) {
-      const int w = b_i + tx_width;
-      const int tx_avg_q3 = y_averages_q3[a++];
-      t_y_pix = y_pix;
-      t_src = src;
-      for (int t_j = b_j; t_j < h; t_j++) {
-        for (int t_i = b_i; t_i < w; t_i++) {
-          const int uv = t_src[t_i];
-
-          const int scaled_luma =
-              get_scaled_luma_q0(alpha_q3, t_y_pix[t_i], tx_avg_q3);
-
-          // TODO(ltrudeau) add support for HBD.
-          diff = uv - clamp(scaled_luma + dc_pred, 0, 255);
-          dist += diff * diff;
-
-          // TODO(ltrudeau) add support for HBD.
-          diff = uv - clamp(-scaled_luma + dc_pred, 0, 255);
-          dist_neg += diff * diff;
-        }
-        t_y_pix += y_stride;
-        t_src += src_stride;
-      }
+  for (int j = 0; j < height; j++) {
+    for (int i = 0; i < width; i++) {
+      const int uv = src[i];
+      const int scaled_luma = get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]);
+
+      diff = uv - clip_pixel(scaled_luma + dc_pred);
+      dist += diff * diff;
+
+      diff = uv - clip_pixel(-scaled_luma + dc_pred);
+      dist_neg += diff * diff;
     }
-    y_pix += y_block_row_off;
-    src += src_block_row_off;
+    pred_buf_q3 += MAX_SB_SIZE;
+    src += src_stride;
   }
 
   if (dist_neg_out) *dist_neg_out = dist_neg;
 
   return dist;
 }
+#if CONFIG_HIGHBITDEPTH
+static int64_t cfl_alpha_dist_hbd(const int16_t *pred_buf_q3,
+                                  const uint16_t *src, int src_stride,
+                                  int width, int height, int dc_pred,
+                                  int alpha_q3, int bit_depth,
+                                  int64_t *dist_neg_out) {
+  const int shift = 2 * (bit_depth - 8);
+  const int rounding = shift > 0 ? (1 << shift) >> 1 : 0;
+  int64_t dist = 0;
+  int diff;
 
-static inline void cfl_update_costs(CFL_CTX *cfl, FRAME_CONTEXT *ec_ctx) {
-  assert(ec_ctx->cfl_alpha_cdf[CFL_ALPHABET_SIZE - 1] ==
-         AOM_ICDF(CDF_PROB_TOP));
+  if (alpha_q3 == 0) {
+    for (int j = 0; j < height; j++) {
+      for (int i = 0; i < width; i++) {
+        diff = src[i] - dc_pred;
+        dist += diff * diff;
+      }
+      src += src_stride;
+    }
+    dist = (dist + rounding) >> shift;
 
-  aom_cdf_prob prev_cdf = 0;
+    if (dist_neg_out) *dist_neg_out = dist;
 
-  for (int c = 0; c < CFL_ALPHABET_SIZE; c++) {
-    const int sign_bit_cost = (cfl_alpha_codes[c][CFL_PRED_U] != 0) +
-                              (cfl_alpha_codes[c][CFL_PRED_V] != 0);
+    return dist;
+  }
+
+  int64_t dist_neg = 0;
+  for (int j = 0; j < height; j++) {
+    for (int i = 0; i < width; i++) {
+      const int uv = src[i];
+      const int scaled_luma = get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]);
 
-    aom_cdf_prob prob = AOM_ICDF(ec_ctx->cfl_alpha_cdf[c]) - prev_cdf;
-    prev_cdf = AOM_ICDF(ec_ctx->cfl_alpha_cdf[c]);
+      diff = uv - clip_pixel_highbd(scaled_luma + dc_pred, bit_depth);
+      dist += diff * diff;
 
-    cfl->costs[c] = av1_cost_symbol(prob) + av1_cost_literal(sign_bit_cost);
+      diff = uv - clip_pixel_highbd(-scaled_luma + dc_pred, bit_depth);
+      dist_neg += diff * diff;
+    }
+    pred_buf_q3 += MAX_SB_SIZE;
+    src += src_stride;
+  }
+
+  if (dist_neg_out) *dist_neg_out = (dist_neg + rounding) >> shift;
+
+  return (dist + rounding) >> shift;
+}
+#endif  // CONFIG_HIGHBITDEPTH
+static int64_t cfl_alpha_dist(const int16_t *pred_buf_q3, const uint8_t *src,
+                              int src_stride, int width, int height,
+                              int dc_pred, int alpha_q3, int use_hbd,
+                              int bit_depth, int64_t *dist_neg_out) {
+#if CONFIG_HIGHBITDEPTH
+  if (use_hbd) {
+    const uint16_t *src_16 = CONVERT_TO_SHORTPTR(src);
+    return cfl_alpha_dist_hbd(pred_buf_q3, src_16, src_stride, width, height,
+                              dc_pred, alpha_q3, bit_depth, dist_neg_out);
   }
+#endif  // CONFIG_HIGHBITDEPTH
+  (void)use_hbd;
+  (void)bit_depth;
+  return cfl_alpha_dist_lbd(pred_buf_q3, src, src_stride, width, height,
+                            dc_pred, alpha_q3, dist_neg_out);
 }
 
 static int cfl_rd_pick_alpha(MACROBLOCK *const x, TX_SIZE tx_size) {
@@ -5677,7 +6192,6 @@ static int cfl_rd_pick_alpha(MACROBLOCK *const x, TX_SIZE tx_size) {
   const int src_stride_v = p_v->src.stride;
 
   MACROBLOCKD *const xd = &x->e_mbd;
-  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
 
   CFL_CTX *const cfl = xd->cfl;
@@ -5686,74 +6200,71 @@ static int cfl_rd_pick_alpha(MACROBLOCK *const x, TX_SIZE tx_size) {
   const int height = cfl->uv_height;
   const int dc_pred_u = cfl->dc_pred[CFL_PRED_U];
   const int dc_pred_v = cfl->dc_pred[CFL_PRED_V];
-  const int *y_averages_q3 = cfl->y_averages_q3;
-  const uint8_t *y_pix = cfl->y_down_pix;
-
-  CFL_SIGN_TYPE *signs = mbmi->cfl_alpha_signs;
-
-  cfl_update_costs(cfl, ec_ctx);
+  const int16_t *pred_buf_q3 = cfl->pred_buf_q3;
+  const int use_hbd = get_bitdepth_data_path_index(xd);
 
   int64_t sse[CFL_PRED_PLANES][CFL_MAGS_SIZE];
   sse[CFL_PRED_U][0] =
-      cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u,
-                     width, height, tx_size, dc_pred_u, 0, NULL);
+      cfl_alpha_dist(pred_buf_q3, src_u, src_stride_u, width, height, dc_pred_u,
+                     0, use_hbd, xd->bd, NULL);
   sse[CFL_PRED_V][0] =
-      cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v,
-                     width, height, tx_size, dc_pred_v, 0, NULL);
+      cfl_alpha_dist(pred_buf_q3, src_v, src_stride_v, width, height, dc_pred_v,
+                     0, use_hbd, xd->bd, NULL);
 
-  for (int m = 1; m < CFL_MAGS_SIZE; m += 2) {
-    assert(cfl_alpha_mags_q3[m + 1] == -cfl_alpha_mags_q3[m]);
+  for (int c = 0; c < CFL_ALPHABET_SIZE; c++) {
+    const int m = c * 2 + 1;
+    const int abs_alpha_q3 = c + 1;
     sse[CFL_PRED_U][m] = cfl_alpha_dist(
-        y_pix, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, width, height,
-        tx_size, dc_pred_u, cfl_alpha_mags_q3[m], &sse[CFL_PRED_U][m + 1]);
+        pred_buf_q3, src_u, src_stride_u, width, height, dc_pred_u,
+        abs_alpha_q3, use_hbd, xd->bd, &sse[CFL_PRED_U][m + 1]);
     sse[CFL_PRED_V][m] = cfl_alpha_dist(
-        y_pix, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, width, height,
-        tx_size, dc_pred_v, cfl_alpha_mags_q3[m], &sse[CFL_PRED_V][m + 1]);
+        pred_buf_q3, src_v, src_stride_v, width, height, dc_pred_v,
+        abs_alpha_q3, use_hbd, xd->bd, &sse[CFL_PRED_V][m + 1]);
   }
 
   int64_t dist;
   int64_t cost;
-  int64_t best_cost;
+  int64_t best_cost = INT64_MAX;
+  int best_rate = 0;
 
   // Compute least squares parameter of the entire block
-  // IMPORTANT: We assume that the first code is 0,0
   int ind = 0;
-  signs[CFL_PRED_U] = CFL_SIGN_POS;
-  signs[CFL_PRED_V] = CFL_SIGN_POS;
-
-  dist = sse[CFL_PRED_U][0] + sse[CFL_PRED_V][0];
-  dist *= 16;
-  best_cost = RDCOST(x->rdmult, cfl->costs[0], dist);
-
-  for (int c = 1; c < CFL_ALPHABET_SIZE; c++) {
-    const int idx_u = cfl_alpha_codes[c][CFL_PRED_U];
-    const int idx_v = cfl_alpha_codes[c][CFL_PRED_V];
-    for (CFL_SIGN_TYPE sign_u = idx_u == 0; sign_u < CFL_SIGNS; sign_u++) {
-      for (CFL_SIGN_TYPE sign_v = idx_v == 0; sign_v < CFL_SIGNS; sign_v++) {
+  int signs = 0;
+
+  for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
+    const int sign_u = CFL_SIGN_U(joint_sign);
+    const int sign_v = CFL_SIGN_V(joint_sign);
+    const int size_u = (sign_u == CFL_SIGN_ZERO) ? 1 : CFL_ALPHABET_SIZE;
+    const int size_v = (sign_v == CFL_SIGN_ZERO) ? 1 : CFL_ALPHABET_SIZE;
+    for (int u = 0; u < size_u; u++) {
+      const int idx_u = (sign_u == CFL_SIGN_ZERO) ? 0 : u * 2 + 1;
+      for (int v = 0; v < size_v; v++) {
+        const int idx_v = (sign_v == CFL_SIGN_ZERO) ? 0 : v * 2 + 1;
         dist = sse[CFL_PRED_U][idx_u + (sign_u == CFL_SIGN_NEG)] +
                sse[CFL_PRED_V][idx_v + (sign_v == CFL_SIGN_NEG)];
         dist *= 16;
-        cost = RDCOST(x->rdmult, cfl->costs[c], dist);
+        const int rate = x->cfl_cost[joint_sign][CFL_PRED_U][u] +
+                         x->cfl_cost[joint_sign][CFL_PRED_V][v];
+        cost = RDCOST(x->rdmult, rate, dist);
         if (cost < best_cost) {
           best_cost = cost;
-          ind = c;
-          signs[CFL_PRED_U] = sign_u;
-          signs[CFL_PRED_V] = sign_v;
+          best_rate = rate;
+          ind = (u << CFL_ALPHABET_SIZE_LOG2) + v;
+          signs = joint_sign;
         }
       }
     }
   }
 
   mbmi->cfl_alpha_idx = ind;
-  return cfl->costs[ind];
+  mbmi->cfl_alpha_signs = signs;
+  return best_rate;
 }
 #endif  // CONFIG_CFL
 
 static void init_sbuv_mode(MB_MODE_INFO *const mbmi) {
   mbmi->uv_mode = UV_DC_PRED;
-#if CONFIG_PALETTE
   mbmi->palette_mode_info.palette_size[1] = 0;
-#endif  // CONFIG_PALETTE
 #if CONFIG_FILTER_INTRA
   mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
 #endif  // CONFIG_FILTER_INTRA
@@ -5772,9 +6283,9 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   od_rollback_buffer buf;
   od_encode_checkpoint(&x->daala_enc, &buf);
 #endif  // CONFIG_PVQ
-#if CONFIG_PALETTE
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-#endif  // CONFIG_PALETTE
+  const int try_palette =
+      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
 
   for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
     int this_rate;
@@ -5782,7 +6293,7 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx];
 #if CONFIG_EXT_INTRA
     const int is_directional_mode =
-        av1_is_directional_mode(mode, mbmi->sb_type);
+        av1_is_directional_mode(get_uv_mode(mode), mbmi->sb_type);
 #endif  // CONFIG_EXT_INTRA
     if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
           (1 << mode)))
@@ -5791,7 +6302,8 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     mbmi->uv_mode = mode;
 #if CONFIG_CFL
     int cfl_alpha_rate = 0;
-    if (mode == UV_DC_PRED) {
+    if (mode == UV_CFL_PRED) {
+      assert(!is_directional_mode);
       const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]);
       cfl_alpha_rate = cfl_rd_pick_alpha(x, uv_tx_size);
     }
@@ -5799,7 +6311,7 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
 #if CONFIG_EXT_INTRA
     mbmi->angle_delta[1] = 0;
     if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) {
-      const int rate_overhead = cpi->intra_uv_mode_cost[mbmi->mode][mode] +
+      const int rate_overhead = x->intra_uv_mode_cost[mbmi->mode][mode] +
                                 write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, 0);
       if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
                                     &this_rate, &tokenonly_rd_stats))
@@ -5816,10 +6328,10 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     }
 #endif  // CONFIG_EXT_INTRA
     this_rate =
-        tokenonly_rd_stats.rate + cpi->intra_uv_mode_cost[mbmi->mode][mode];
+        tokenonly_rd_stats.rate + x->intra_uv_mode_cost[mbmi->mode][mode];
 
 #if CONFIG_CFL
-    if (mode == UV_DC_PRED) {
+    if (mode == UV_CFL_PRED) {
       this_rate += cfl_alpha_rate;
     }
 #endif
@@ -5830,15 +6342,12 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     }
 #endif  // CONFIG_EXT_INTRA
 #if CONFIG_FILTER_INTRA
-    if (mbmi->sb_type >= BLOCK_8X8 && mode == DC_PRED)
+    if (mbmi->sb_type >= BLOCK_8X8 && mode == UV_DC_PRED)
       this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 0);
 #endif  // CONFIG_FILTER_INTRA
-#if CONFIG_PALETTE
-    if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8 &&
-        mode == UV_DC_PRED)
+    if (try_palette && mode == UV_DC_PRED)
       this_rate += av1_cost_bit(
           av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 0);
-#endif  // CONFIG_PALETTE
 
 #if CONFIG_PVQ
     od_encode_rollback(&x->daala_enc, &buf);
@@ -5855,15 +6364,13 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     }
   }
 
-#if CONFIG_PALETTE
-  if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8) {
+  if (try_palette) {
     uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
     rd_pick_palette_intra_sbuv(cpi, x,
-                               cpi->intra_uv_mode_cost[mbmi->mode][UV_DC_PRED],
+                               x->intra_uv_mode_cost[mbmi->mode][UV_DC_PRED],
                                best_palette_color_map, &best_mbmi, &best_rd,
                                rate, rate_tokenonly, distortion, skippable);
   }
-#endif  // CONFIG_PALETTE
 
 #if CONFIG_FILTER_INTRA
   if (mbmi->sb_type >= BLOCK_8X8) {
@@ -5880,19 +6387,17 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
 }
 
 static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                                 PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize,
-                                 TX_SIZE max_tx_size, int *rate_uv,
-                                 int *rate_uv_tokenonly, int64_t *dist_uv,
-                                 int *skip_uv, UV_PREDICTION_MODE *mode_uv) {
+                                 BLOCK_SIZE bsize, TX_SIZE max_tx_size,
+                                 int *rate_uv, int *rate_uv_tokenonly,
+                                 int64_t *dist_uv, int *skip_uv,
+                                 UV_PREDICTION_MODE *mode_uv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   // Use an estimated rd for uv_intra based on DC_PRED if the
   // appropriate speed flag is set.
-  (void)ctx;
-  init_sbuv_mode(&x->e_mbd.mi[0]->mbmi);
+  init_sbuv_mode(mbmi);
 #if CONFIG_CB4X4
-#if CONFIG_CHROMA_2X2
-  rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
-                          bsize, max_tx_size);
-#else
+#if !CONFIG_CHROMA_2X2
   if (x->skip_chroma_rd) {
     *rate_uv = 0;
     *rate_uv_tokenonly = 0;
@@ -5901,32 +6406,47 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
     *mode_uv = UV_DC_PRED;
     return;
   }
-  BLOCK_SIZE bs = scale_chroma_bsize(bsize, x->e_mbd.plane[1].subsampling_x,
-                                     x->e_mbd.plane[1].subsampling_y);
-  rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
-                          bs, max_tx_size);
-#endif  // CONFIG_CHROMA_2X2
+  bsize = scale_chroma_bsize(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
+                             xd->plane[AOM_PLANE_U].subsampling_y);
+#endif  // !CONFIG_CHROMA_2X2
+#if CONFIG_CFL
+  // Only store reconstructed luma when there's chroma RDO. When there's no
+  // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+  xd->cfl->store_y = !x->skip_chroma_rd;
+#endif  // CONFIG_CFL
 #else
-  rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
-                          bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
+  bsize = bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize;
+#if CONFIG_CFL
+  xd->cfl->store_y = 1;
+#endif  // CONFIG_CFL
 #endif  // CONFIG_CB4X4
-  *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
+#if CONFIG_CFL
+  if (xd->cfl->store_y) {
+    // Perform one extra call to txfm_rd_in_plane(), with the values chosen
+    // during luma RDO, so we can store reconstructed luma values.
+    RD_STATS this_rd_stats;
+    txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y,
+                     mbmi->sb_type, mbmi->tx_size,
+                     cpi->sf.use_fast_coef_costing);
+    xd->cfl->store_y = 0;
+  }
+#endif  // CONFIG_CFL
+  rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
+                          bsize, max_tx_size);
+  *mode_uv = mbmi->uv_mode;
 }
 
-static int cost_mv_ref(const AV1_COMP *const cpi, PREDICTION_MODE mode,
+static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
                        int16_t mode_context) {
-#if CONFIG_EXT_INTER
   if (is_inter_compound_mode(mode)) {
-    return cpi
+    return x
         ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
 #if CONFIG_COMPOUND_SINGLEREF
   } else if (is_inter_singleref_comp_mode(mode)) {
-    return cpi
-        ->inter_singleref_comp_mode_cost[mode_context]
-                                        [INTER_SINGLEREF_COMP_OFFSET(mode)];
+    return x->inter_singleref_comp_mode_cost[mode_context]
+                                            [INTER_SINGLEREF_COMP_OFFSET(mode)];
 #endif  // CONFIG_COMPOUND_SINGLEREF
   }
-#endif
 
   int mode_cost = 0;
   int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
@@ -5935,32 +6455,32 @@ static int cost_mv_ref(const AV1_COMP *const cpi, PREDICTION_MODE mode,
   assert(is_inter_mode(mode));
 
   if (mode == NEWMV) {
-    mode_cost = cpi->newmv_mode_cost[mode_ctx][0];
+    mode_cost = x->newmv_mode_cost[mode_ctx][0];
     return mode_cost;
   } else {
-    mode_cost = cpi->newmv_mode_cost[mode_ctx][1];
+    mode_cost = x->newmv_mode_cost[mode_ctx][1];
     mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
 
     if (is_all_zero_mv) return mode_cost;
 
     if (mode == ZEROMV) {
-      mode_cost += cpi->zeromv_mode_cost[mode_ctx][0];
+      mode_cost += x->zeromv_mode_cost[mode_ctx][0];
       return mode_cost;
     } else {
-      mode_cost += cpi->zeromv_mode_cost[mode_ctx][1];
+      mode_cost += x->zeromv_mode_cost[mode_ctx][1];
       mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
 
       if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6;
       if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7;
       if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8;
 
-      mode_cost += cpi->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
+      mode_cost += x->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
       return mode_cost;
     }
   }
 }
 
-#if CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
+#if (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
 static int get_interinter_compound_type_bits(BLOCK_SIZE bsize,
                                              COMPOUND_TYPE comp_type) {
   (void)bsize;
@@ -5975,7 +6495,7 @@ static int get_interinter_compound_type_bits(BLOCK_SIZE bsize,
     default: assert(0); return 0;
   }
 }
-#endif  // CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
+#endif  // (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
 
 typedef struct {
   int eobs;
@@ -5986,9 +6506,7 @@ typedef struct {
   int64_t brdcost;
   int_mv mvs[2];
   int_mv pred_mv[2];
-#if CONFIG_EXT_INTER
   int_mv ref_mv[2];
-#endif  // CONFIG_EXT_INTER
 
 #if CONFIG_CHROMA_2X2
   ENTROPY_CONTEXT ta[4];
@@ -6009,16 +6527,12 @@ typedef struct {
   int64_t sse;
   int segment_yrate;
   PREDICTION_MODE modes[4];
-#if CONFIG_EXT_INTER
 #if CONFIG_COMPOUND_SINGLEREF
   SEG_RDSTAT rdstat[4][INTER_MODES + INTER_SINGLEREF_COMP_MODES +
                        INTER_COMPOUND_MODES];
 #else   // !CONFIG_COMPOUND_SINGLEREF
   SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES];
 #endif  // CONFIG_COMPOUND_SINGLEREF
-#else   // !CONFIG_EXT_INTER
-  SEG_RDSTAT rdstat[4][INTER_MODES];
-#endif  // CONFIG_EXT_INTER
   int mvthresh;
 } BEST_SEG_INFO;
 
@@ -6032,10 +6546,9 @@ static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
 // Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way encode zero motion.
 // TODO(aconverse): Find out if this is still productive then clean up or remove
 static int check_best_zero_mv(
-    const AV1_COMP *const cpi, const int16_t mode_context[TOTAL_REFS_PER_FRAME],
-#if CONFIG_EXT_INTER
+    const AV1_COMP *const cpi, const MACROBLOCK *const x,
+    const int16_t mode_context[TOTAL_REFS_PER_FRAME],
     const int16_t compound_mode_context[TOTAL_REFS_PER_FRAME],
-#endif  // CONFIG_EXT_INTER
     int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], int this_mode,
     const MV_REFERENCE_FRAME ref_frames[2], const BLOCK_SIZE bsize, int block,
     int mi_row, int mi_col) {
@@ -6045,34 +6558,33 @@ static int check_best_zero_mv(
 #endif
   (void)mi_row;
   (void)mi_col;
+  (void)cpi;
 #if CONFIG_GLOBAL_MOTION
-  if (this_mode == ZEROMV
-#if CONFIG_EXT_INTER
-      || this_mode == ZERO_ZEROMV
-#endif  // CONFIG_EXT_INTER
-      ) {
+  if (this_mode == ZEROMV || this_mode == ZERO_ZEROMV) {
     for (int cur_frm = 0; cur_frm < 1 + comp_pred_mode; cur_frm++) {
       zeromv[cur_frm].as_int =
           gm_get_motion_vector(&cpi->common.global_motion[ref_frames[cur_frm]],
                                cpi->common.allow_high_precision_mv, bsize,
-                               mi_col, mi_row, block)
+                               mi_col, mi_row, block
+#if CONFIG_AMVR
+                               ,
+                               cpi->common.cur_frame_mv_precision_level
+#endif
+                               )
               .as_int;
     }
   }
 #endif  // CONFIG_GLOBAL_MOTION
 
-#if !CONFIG_EXT_INTER
-  assert(ref_frames[1] != INTRA_FRAME);  // Just sanity check
-#endif                                   // !CONFIG_EXT_INTER
   if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
       frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int &&
       (ref_frames[1] <= INTRA_FRAME ||
        frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int)) {
     int16_t rfc =
         av1_mode_context_analyzer(mode_context, ref_frames, bsize, block);
-    int c1 = cost_mv_ref(cpi, NEARMV, rfc);
-    int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
-    int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
+    int c1 = cost_mv_ref(x, NEARMV, rfc);
+    int c2 = cost_mv_ref(x, NEARESTMV, rfc);
+    int c3 = cost_mv_ref(x, ZEROMV, rfc);
 
     if (this_mode == NEARMV) {
       if (c1 > c3) return 0;
@@ -6092,16 +6604,14 @@ static int check_best_zero_mv(
           return 0;
       }
     }
-  }
-#if CONFIG_EXT_INTER
-  else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
-            this_mode == ZERO_ZEROMV) &&
-           frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int &&
-           frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int) {
+  } else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
+              this_mode == ZERO_ZEROMV) &&
+             frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int &&
+             frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int) {
     int16_t rfc = compound_mode_context[ref_frames[0]];
-    int c2 = cost_mv_ref(cpi, NEAREST_NEARESTMV, rfc);
-    int c3 = cost_mv_ref(cpi, ZERO_ZEROMV, rfc);
-    int c5 = cost_mv_ref(cpi, NEAR_NEARMV, rfc);
+    int c2 = cost_mv_ref(x, NEAREST_NEARESTMV, rfc);
+    int c3 = cost_mv_ref(x, ZERO_ZEROMV, rfc);
+    int c5 = cost_mv_ref(x, NEAR_NEARMV, rfc);
 
     if (this_mode == NEAREST_NEARESTMV) {
       if (c2 > c3) return 0;
@@ -6116,45 +6626,42 @@ static int check_best_zero_mv(
         return 0;
     }
   }
-#endif  // CONFIG_EXT_INTER
   return 1;
 }
 
 static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
                                 BLOCK_SIZE bsize, int_mv *frame_mv,
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
                                 int_mv *frame_comp_mv,
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
                                 int mi_row, int mi_col,
-#if CONFIG_EXT_INTER
                                 int_mv *ref_mv_sub8x8[2], const uint8_t *mask,
-                                int mask_stride,
-#endif  // CONFIG_EXT_INTER
-                                int *rate_mv, const int block) {
+                                int mask_stride, int *rate_mv,
+                                const int block) {
   const AV1_COMMON *const cm = &cpi->common;
   const int pw = block_size_wide[bsize];
   const int ph = block_size_high[bsize];
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
 // This function should only ever be called for compound modes
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
   if (!has_second_ref(mbmi)) {
     assert(is_inter_singleref_comp_mode(mbmi->mode));
     assert(frame_comp_mv);
   }
   assert(has_second_ref(mbmi) || is_inter_singleref_comp_mode(mbmi->mode));
-  const int refs[2] = { mbmi->ref_frame[0], has_second_ref(mbmi)
-                                                ? mbmi->ref_frame[1]
-                                                : mbmi->ref_frame[0] };
+  const int refs[2] = { mbmi->ref_frame[0],
+                        has_second_ref(mbmi) ? mbmi->ref_frame[1]
+                                             : mbmi->ref_frame[0] };
 #else
   assert(has_second_ref(mbmi));
   const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
   int_mv ref_mv[2];
   int ite, ref;
   struct scale_factors sf;
 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-  // ic and ir are the 4x4 coordiantes of the sub8x8 at index "block"
+  // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
   const int ic = block & 1;
   const int ir = (block - ic) >> 1;
   struct macroblockd_plane *const pd = &xd->plane[0];
@@ -6162,18 +6669,19 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
   const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
 #if CONFIG_GLOBAL_MOTION
   int is_global[2];
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+#if CONFIG_COMPOUND_SINGLEREF
+  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
 #else
-  for (ref = 0; ref < 2; ++ref) {
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+  for (ref = 0; ref < 2; ++ref)
+#endif  // CONFIG_COMPOUND_SINGLEREF
+  {
     WarpedMotionParams *const wm =
         &xd->global_motion[xd->mi[0]->mbmi.ref_frame[ref]];
     is_global[ref] = is_global_mv_block(xd->mi[0], block, wm->wmtype);
   }
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
   if (!has_second_ref(mbmi)) is_global[1] = is_global[0];
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
 #endif  // CONFIG_GLOBAL_MOTION
 #else   // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
   (void)block;
@@ -6195,20 +6703,21 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
   DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]);
 #endif  // CONFIG_HIGHBITDEPTH
 
-#if CONFIG_EXT_INTER && CONFIG_CB4X4
+#if CONFIG_CB4X4
   (void)ref_mv_sub8x8;
-#endif  // CONFIG_EXT_INTER && CONFIG_CB4X4
+#endif  // CONFIG_CB4X4
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+#if CONFIG_COMPOUND_SINGLEREF
+  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
 #else
-  for (ref = 0; ref < 2; ++ref) {
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_EXT_INTER && !CONFIG_CB4X4
+  for (ref = 0; ref < 2; ++ref)
+#endif  // CONFIG_COMPOUND_SINGLEREF
+  {
+#if !CONFIG_CB4X4
     if (bsize < BLOCK_8X8 && ref_mv_sub8x8 != NULL)
       ref_mv[ref].as_int = ref_mv_sub8x8[ref]->as_int;
     else
-#endif  // CONFIG_EXT_INTER && !CONFIG_CB4X4
+#endif  // !CONFIG_CB4X4
       ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
 
     if (scaled_ref_frame[ref]) {
@@ -6223,7 +6732,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
   if (!has_second_ref(mbmi)) {
     assert(is_inter_singleref_comp_mode(mbmi->mode));
     // NOTE: For single ref comp mode, set up the 2nd set of ref_mv/pre_planes
@@ -6239,7 +6748,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
       av1_setup_pre_planes(xd, 1, scaled_ref_frame[0], mi_row, mi_col, NULL);
     }
   }
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
 
 // Since we have scaled the reference frames to match the size of the current
 // frame we must use a unit scaling factor during mode selection.
@@ -6253,14 +6762,15 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
 
 // Allow joint search multiple times iteratively for each reference frame
 // and break out of the search loop if it couldn't find a better mv.
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
   const int num_ites =
       (has_second_ref(mbmi) || mbmi->mode == SR_NEW_NEWMV) ? 4 : 1;
   const int start_ite = has_second_ref(mbmi) ? 0 : 1;
-  for (ite = start_ite; ite < (start_ite + num_ites); ite++) {
+  for (ite = start_ite; ite < (start_ite + num_ites); ite++)
 #else
-  for (ite = 0; ite < 4; ite++) {
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+  for (ite = 0; ite < 4; ite++)
+#endif  // CONFIG_COMPOUND_SINGLEREF
+  {
     struct buf_2d ref_yv12[2];
     int bestsme = INT_MAX;
     int sadpb = x->sadperbit16;
@@ -6288,23 +6798,23 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
     ref_yv12[1] = xd->plane[plane].pre[1];
 
 // Get the prediction block from the 'other' reference frame.
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
     MV *const the_other_mv = (has_second_ref(mbmi) || id)
                                  ? &frame_mv[refs[!id]].as_mv
                                  : &frame_comp_mv[refs[0]].as_mv;
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
 
 #if CONFIG_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
       av1_highbd_build_inter_predictor(
           ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
           the_other_mv,
-#else   // !(CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF)
+#else   // !(CONFIG_COMPOUND_SINGLEREF)
           &frame_mv[refs[!id]].as_mv,
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-          &sf, pw, ph, 0, mbmi->interp_filter,
+#endif  // CONFIG_COMPOUND_SINGLEREF
+          &sf, pw, ph, 0, mbmi->interp_filters,
 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
           &warp_types, p_col, p_row,
 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
@@ -6314,12 +6824,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
 #endif  // CONFIG_HIGHBITDEPTH
       av1_build_inter_predictor(
           ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
           the_other_mv,
-#else   // !(CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF)
+#else   // !(CONFIG_COMPOUND_SINGLEREF)
         &frame_mv[refs[!id]].as_mv,
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-          &sf, pw, ph, &conv_params, mbmi->interp_filter,
+#endif  // CONFIG_COMPOUND_SINGLEREF
+          &sf, pw, ph, &conv_params, mbmi->interp_filters,
 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
           &warp_types, p_col, p_row, plane, !id,
 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
@@ -6334,74 +6844,75 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
 
 // Use the mv result from the single mode as mv predictor.
 // Use the mv result from the single mode as mv predictor.
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
     if (!has_second_ref(mbmi) && id)
       *best_mv = frame_comp_mv[refs[0]].as_mv;
     else
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
       *best_mv = frame_mv[refs[id]].as_mv;
 
     best_mv->col >>= 3;
     best_mv->row >>= 3;
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
     if (!has_second_ref(mbmi))
       av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx);
     else
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
       av1_set_mvcost(x, refs[id], id, mbmi->ref_mv_idx);
 
     // Small-range full-pixel motion search.
-    bestsme =
-        av1_refining_search_8p_c(x, sadpb, search_range, &cpi->fn_ptr[bsize],
-#if CONFIG_EXT_INTER
-                                 mask, mask_stride, id,
-#endif
-                                 &ref_mv[id].as_mv, second_pred);
+    bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
+                                       &cpi->fn_ptr[bsize], mask, mask_stride,
+                                       id, &ref_mv[id].as_mv, second_pred);
     if (bestsme < INT_MAX) {
-#if CONFIG_EXT_INTER
       if (mask)
         bestsme = av1_get_mvpred_mask_var(x, best_mv, &ref_mv[id].as_mv,
                                           second_pred, mask, mask_stride, id,
                                           &cpi->fn_ptr[bsize], 1);
       else
-#endif
         bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv,
                                         second_pred, &cpi->fn_ptr[bsize], 1);
     }
 
     x->mv_limits = tmp_mv_limits;
 
-    if (bestsme < INT_MAX) {
+#if CONFIG_AMVR
+    if (cpi->common.cur_frame_mv_precision_level) {
+      x->best_mv.as_mv.row *= 8;
+      x->best_mv.as_mv.col *= 8;
+    }
+    if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0)
+#else
+    if (bestsme < INT_MAX)
+#endif
+    {
       int dis; /* TODO: use dis in distortion calculation later. */
       unsigned int sse;
       bestsme = cpi->find_fractional_mv_step(
           x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
           x->errorperbit, &cpi->fn_ptr[bsize], 0,
           cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
-          &dis, &sse, second_pred,
-#if CONFIG_EXT_INTER
-          mask, mask_stride, id,
-#endif
-          pw, ph, cpi->sf.use_upsampled_references);
+          &dis, &sse, second_pred, mask, mask_stride, id, pw, ph,
+          cpi->sf.use_upsampled_references);
     }
 
     // Restore the pointer to the first (possibly scaled) prediction buffer.
     if (id) xd->plane[plane].pre[0] = ref_yv12[0];
 
     if (bestsme < last_besterr[id]) {
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
       // NOTE: For single ref comp mode, frame_mv stores the first mv and
       //       frame_comp_mv stores the second mv.
       if (!has_second_ref(mbmi) && id)
         frame_comp_mv[refs[0]].as_mv = *best_mv;
       else
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
         frame_mv[refs[id]].as_mv = *best_mv;
       last_besterr[id] = bestsme;
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
       if (!has_second_ref(mbmi)) last_besterr[!id] = last_besterr[id];
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
     } else {
       break;
     }
@@ -6409,11 +6920,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
 
   *rate_mv = 0;
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+#if CONFIG_COMPOUND_SINGLEREF
+  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
 #else
-  for (ref = 0; ref < 2; ++ref) {
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+  for (ref = 0; ref < 2; ++ref)
+#endif  // CONFIG_COMPOUND_SINGLEREF
+  {
     if (scaled_ref_frame[ref]) {
       // Restore the prediction frame pointers to their unscaled versions.
       int i;
@@ -6421,14 +6933,14 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
         xd->plane[i].pre[ref] = backup_yv12[ref][i];
     }
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
     if (!has_second_ref(mbmi))
       av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx);
     else
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
       av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx);
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
     if (!has_second_ref(mbmi)) {
       // NOTE: For single ref comp mode, i.e. !has_second_ref(mbmi) is true, the
       //       first mv is stored in frame_mv[] and the second mv is stored in
@@ -6442,25 +6954,25 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
                                   &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
     } else {
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-#if CONFIG_EXT_INTER && !CONFIG_CB4X4
+#endif  // CONFIG_COMPOUND_SINGLEREF
+#if !CONFIG_CB4X4
       if (bsize >= BLOCK_8X8)
-#endif  // CONFIG_EXT_INTER && !CONFIG_CB4X4
+#endif  // !CONFIG_CB4X4
         *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
                                     &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv,
                                     x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-#if CONFIG_EXT_INTER && !CONFIG_CB4X4
+#if !CONFIG_CB4X4
       else
         *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
                                     &ref_mv_sub8x8[ref]->as_mv, x->nmvjointcost,
                                     x->mvcost, MV_COST_WEIGHT);
-#endif  // CONFIG_EXT_INTER && !CONFIG_CB4X4
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // !CONFIG_CB4X4
+#if CONFIG_COMPOUND_SINGLEREF
     }
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
   }
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
   if (!has_second_ref(mbmi)) {
     if (scaled_ref_frame[0]) {
       // Restore the prediction frame pointers to their unscaled versions.
@@ -6469,7 +6981,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
         xd->plane[i].pre[1] = backup_yv12[1][i];
     }
   }
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
 }
 
 static void estimate_ref_frame_costs(
@@ -6516,6 +7028,7 @@ static void estimate_ref_frame_costs(
       aom_prob ref_single_p3 = av1_get_pred_prob_single_ref_p3(cm, xd);
       aom_prob ref_single_p4 = av1_get_pred_prob_single_ref_p4(cm, xd);
       aom_prob ref_single_p5 = av1_get_pred_prob_single_ref_p5(cm, xd);
+      aom_prob ref_single_p6 = av1_get_pred_prob_single_ref_p6(cm, xd);
 #endif  // CONFIG_EXT_REFS
 
       unsigned int base_cost = av1_cost_bit(intra_inter_p, 1);
@@ -6523,7 +7036,7 @@ static void estimate_ref_frame_costs(
       ref_costs_single[LAST_FRAME] =
 #if CONFIG_EXT_REFS
           ref_costs_single[LAST2_FRAME] = ref_costs_single[LAST3_FRAME] =
-              ref_costs_single[BWDREF_FRAME] =
+              ref_costs_single[BWDREF_FRAME] = ref_costs_single[ALTREF2_FRAME] =
 #endif  // CONFIG_EXT_REFS
                   ref_costs_single[GOLDEN_FRAME] =
                       ref_costs_single[ALTREF_FRAME] = base_cost;
@@ -6534,6 +7047,7 @@ static void estimate_ref_frame_costs(
       ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p1, 0);
       ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 0);
       ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
+      ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p1, 1);
       ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
 
       ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p3, 0);
@@ -6542,6 +7056,7 @@ static void estimate_ref_frame_costs(
       ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p3, 1);
 
       ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p2, 0);
+      ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p2, 0);
       ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1);
 
       ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p4, 0);
@@ -6549,6 +7064,9 @@ static void estimate_ref_frame_costs(
 
       ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p5, 0);
       ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p5, 1);
+
+      ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p6, 0);
+      ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p6, 1);
 #else   // !CONFIG_EXT_REFS
       ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0);
       ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 1);
@@ -6563,6 +7081,7 @@ static void estimate_ref_frame_costs(
       ref_costs_single[LAST2_FRAME] = 512;
       ref_costs_single[LAST3_FRAME] = 512;
       ref_costs_single[BWDREF_FRAME] = 512;
+      ref_costs_single[ALTREF2_FRAME] = 512;
 #endif  // CONFIG_EXT_REFS
       ref_costs_single[GOLDEN_FRAME] = 512;
       ref_costs_single[ALTREF_FRAME] = 512;
@@ -6574,6 +7093,7 @@ static void estimate_ref_frame_costs(
       aom_prob ref_comp_p1 = av1_get_pred_prob_comp_ref_p1(cm, xd);
       aom_prob ref_comp_p2 = av1_get_pred_prob_comp_ref_p2(cm, xd);
       aom_prob bwdref_comp_p = av1_get_pred_prob_comp_bwdref_p(cm, xd);
+      aom_prob bwdref_comp_p1 = av1_get_pred_prob_comp_bwdref_p1(cm, xd);
 #endif  // CONFIG_EXT_REFS
 
       unsigned int base_cost = av1_cost_bit(intra_inter_p, 1);
@@ -6589,7 +7109,8 @@ static void estimate_ref_frame_costs(
 #else
               base_cost;
 #endif  // USE_UNI_COMP_REFS
-      ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF_FRAME] = 0;
+      ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0;
+      ref_bicomp_costs[ALTREF_FRAME] = 0;
 
       ref_bicomp_costs[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
       ref_bicomp_costs[LAST2_FRAME] += av1_cost_bit(ref_comp_p, 0);
@@ -6603,14 +7124,18 @@ static void estimate_ref_frame_costs(
       ref_bicomp_costs[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1);
 
       ref_bicomp_costs[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
+      ref_bicomp_costs[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
       ref_bicomp_costs[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1);
 
-      int ref0;
+      ref_bicomp_costs[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p1, 0);
+      ref_bicomp_costs[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p1, 1);
+
+      int ref0, ref1;
       for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
-        ref_costs_comp[ref0][BWDREF_FRAME] =
-            ref_bicomp_costs[ref0] + ref_bicomp_costs[BWDREF_FRAME];
-        ref_costs_comp[ref0][ALTREF_FRAME] =
-            ref_bicomp_costs[ref0] + ref_bicomp_costs[ALTREF_FRAME];
+        for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) {
+          ref_costs_comp[ref0][ref1] =
+              ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1];
+        }
       }
 
       aom_prob uni_comp_ref_p = av1_get_pred_prob_uni_comp_ref_p(cm, xd);
@@ -6642,7 +7167,8 @@ static void estimate_ref_frame_costs(
               ref_costs_comp[GOLDEN_FRAME] = base_cost;
 
 #if CONFIG_EXT_REFS
-      ref_costs_comp[BWDREF_FRAME] = ref_costs_comp[ALTREF_FRAME] = 0;
+      ref_costs_comp[BWDREF_FRAME] = ref_costs_comp[ALTREF2_FRAME] =
+          ref_costs_comp[ALTREF_FRAME] = 0;
 #endif  // CONFIG_EXT_REFS
 
 #if CONFIG_EXT_REFS
@@ -6660,7 +7186,11 @@ static void estimate_ref_frame_costs(
       // NOTE(zoeliu): BWDREF and ALTREF each add an extra cost by coding 1
       //               more bit.
       ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
+      ref_costs_comp[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
       ref_costs_comp[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1);
+
+      ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p1, 0);
+      ref_costs_comp[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p1, 1);
 #else   // !CONFIG_EXT_REFS
       ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
       ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1);
@@ -6668,10 +7198,10 @@ static void estimate_ref_frame_costs(
 #endif  // CONFIG_EXT_COMP_REFS
     } else {
 #if CONFIG_EXT_COMP_REFS
-      int ref0;
+      int ref0, ref1;
       for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
-        ref_costs_comp[ref0][BWDREF_FRAME] = 512;
-        ref_costs_comp[ref0][ALTREF_FRAME] = 512;
+        for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1)
+          ref_costs_comp[ref0][ref1] = 512;
       }
       ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512;
       ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512;
@@ -6683,6 +7213,7 @@ static void estimate_ref_frame_costs(
       ref_costs_comp[LAST2_FRAME] = 512;
       ref_costs_comp[LAST3_FRAME] = 512;
       ref_costs_comp[BWDREF_FRAME] = 512;
+      ref_costs_comp[ALTREF2_FRAME] = 512;
       ref_costs_comp[ALTREF_FRAME] = 512;
 #endif  // CONFIG_EXT_REFS
       ref_costs_comp[GOLDEN_FRAME] = 512;
@@ -6732,17 +7263,19 @@ static void setup_buffer_inter(
   // Gets an initial list of candidate vectors from neighbours and orders them
   av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
                    mbmi_ext->ref_mv_stack[ref_frame],
-#if CONFIG_EXT_INTER
-                   mbmi_ext->compound_mode_context,
-#endif  // CONFIG_EXT_INTER
-                   candidates, mi_row, mi_col, NULL, NULL,
-                   mbmi_ext->mode_context);
+                   mbmi_ext->compound_mode_context, candidates, mi_row, mi_col,
+                   NULL, NULL, mbmi_ext->mode_context);
 
-  // Candidate refinement carried out at encoder and decoder
+// Candidate refinement carried out at encoder and decoder
+#if CONFIG_AMVR
+  av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates,
+                        &frame_nearest_mv[ref_frame], &frame_near_mv[ref_frame],
+                        cm->cur_frame_mv_precision_level);
+#else
   av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates,
                         &frame_nearest_mv[ref_frame],
                         &frame_near_mv[ref_frame]);
-
+#endif
 // Further refinement that is encode side only to test the top few candidates
 // in full and choose the best as the centre point for subsequent searches.
 // The current implementation doesn't support scaling.
@@ -6758,10 +7291,7 @@ static void setup_buffer_inter(
 
 static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
-#if CONFIG_EXT_INTER
-                                 int ref_idx,
-#endif  // CONFIG_EXT_INTER
-                                 int *rate_mv) {
+                                 int ref_idx, int *rate_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
   const AV1_COMMON *cm = &cpi->common;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
@@ -6770,17 +7300,12 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
   int step_param;
   int sadpb = x->sadperbit16;
   MV mvp_full;
-#if CONFIG_EXT_INTER
 #if CONFIG_COMPOUND_SINGLEREF
   int ref =
       has_second_ref(mbmi) ? mbmi->ref_frame[ref_idx] : mbmi->ref_frame[0];
 #else   // !CONFIG_COMPOUND_SINGLEREF
   int ref = mbmi->ref_frame[ref_idx];
 #endif  // CONFIG_COMPOUND_SINGLEREF
-#else   // !CONFIG_EXT_INTER
-  int ref = mbmi->ref_frame[0];
-  int ref_idx = 0;
-#endif  // CONFIG_EXT_INTER
   MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
 
   MvLimits tmp_mv_limits = x->mv_limits;
@@ -6812,7 +7337,7 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
   // Work out the size of the first step in the mv step search.
   // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc.
   if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
-    // Take wtd average of the step_params based on the last frame's
+    // Take the weighted average of the step_params based on the last frame's
     // max mv magnitude and that based on the best ref mvs of the current
     // block for the given reference.
     step_param =
@@ -6834,10 +7359,13 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
     int bhl = b_height_log2_lookup[bsize];
     int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
 
-    if (tlevel < 5) step_param += 2;
+    if (tlevel < 5) {
+      step_param += 2;
+      step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 1);
+    }
 
     // prev_mv_sad is not setup for dynamically scaled frames.
-    if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
+    if (cpi->oxcf.resize_mode != RESIZE_RANDOM) {
       int i;
       for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
         if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
@@ -6874,9 +7402,16 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
   switch (mbmi->motion_mode) {
     case SIMPLE_TRANSLATION:
 #endif  // CONFIG_MOTION_VAR
+#if CONFIG_HASH_ME
       bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
                                       sadpb, cond_cost_list(cpi, cost_list),
-                                      &ref_mv, INT_MAX, 1);
+                                      &ref_mv, INT_MAX, 1, (MI_SIZE * mi_col),
+                                      (MI_SIZE * mi_row), 0);
+#else
+  bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+                                  cond_cost_list(cpi, cost_list), &ref_mv,
+                                  INT_MAX, 1);
+#endif
 #if CONFIG_MOTION_VAR
       break;
     case OBMC_CAUSAL:
@@ -6891,7 +7426,15 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
 
   x->mv_limits = tmp_mv_limits;
 
+#if CONFIG_AMVR
+  if (cpi->common.cur_frame_mv_precision_level) {
+    x->best_mv.as_mv.row *= 8;
+    x->best_mv.as_mv.col *= 8;
+  }
+  if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0) {
+#else
   if (bestsme < INT_MAX) {
+#endif
     int dis; /* TODO: use dis in distortion calculation later. */
 #if CONFIG_MOTION_VAR
     switch (mbmi->motion_mode) {
@@ -6908,11 +7451,8 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
               x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
               &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
               cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
-              x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL,
-#if CONFIG_EXT_INTER
-              NULL, 0, 0,
-#endif
-              pw, ph, 1);
+              x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL,
+              0, 0, pw, ph, 1);
 
           if (try_second) {
             const int minc =
@@ -6936,11 +7476,7 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                   &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
                   cpi->sf.mv.subpel_iters_per_step,
                   cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
-                  &dis, &x->pred_sse[ref], NULL,
-#if CONFIG_EXT_INTER
-                  NULL, 0, 0,
-#endif
-                  pw, ph, 1);
+                  &dis, &x->pred_sse[ref], NULL, NULL, 0, 0, pw, ph, 1);
               if (this_var < best_mv_var) best_mv = x->best_mv.as_mv;
               x->best_mv.as_mv = best_mv;
             }
@@ -6950,11 +7486,8 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
               x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
               &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
               cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
-              x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL,
-#if CONFIG_EXT_INTER
-              NULL, 0, 0,
-#endif
-              0, 0, 0);
+              x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL,
+              0, 0, 0, 0, 0);
         }
 #if CONFIG_MOTION_VAR
         break;
@@ -6994,7 +7527,6 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst) {
   }
 }
 
-#if CONFIG_EXT_INTER
 static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
                                     BLOCK_SIZE bsize, const MV *other_mv,
                                     int mi_row, int mi_col, const int block,
@@ -7013,7 +7545,7 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
   struct scale_factors sf;
 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
   struct macroblockd_plane *const pd = &xd->plane[0];
-  // ic and ir are the 4x4 coordiantes of the sub8x8 at index "block"
+  // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
   const int ic = block & 1;
   const int ir = (block - ic) >> 1;
   const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
@@ -7079,7 +7611,7 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     av1_highbd_build_inter_predictor(
         ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
-        0, mbmi->interp_filter,
+        0, mbmi->interp_filters,
 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
         &warp_types, p_col, p_row,
 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
@@ -7088,7 +7620,7 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
 #endif  // CONFIG_HIGHBITDEPTH
     av1_build_inter_predictor(
         ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
-        &conv_params, mbmi->interp_filter,
+        &conv_params, mbmi->interp_filters,
 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
         &warp_types, p_col, p_row, plane, !ref_idx,
 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
@@ -7197,7 +7729,15 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
 
   x->mv_limits = tmp_mv_limits;
 
+#if CONFIG_AMVR
+  if (cpi->common.cur_frame_mv_precision_level) {
+    x->best_mv.as_mv.row *= 8;
+    x->best_mv.as_mv.col *= 8;
+  }
+  if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0) {
+#else
   if (bestsme < INT_MAX) {
+#endif
     int dis; /* TODO: use dis in distortion calculation later. */
     unsigned int sse;
     bestsme = cpi->find_fractional_mv_step(
@@ -7339,9 +7879,8 @@ static void do_masked_motion_search_indexed(
     tmp_mv[1].as_int = frame_mv[rf[1]].as_int;
 }
 #endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
-#endif  // CONFIG_EXT_INTER
 
-// In some situations we want to discount tha pparent cost of a new motion
+// In some situations we want to discount the apparent cost of a new motion
 // vector. Where there is a subtle motion field and especially where there is
 // low spatial complexity then it can be hard to cover the cost of a new motion
 // vector in a single block, even if that motion vector reduces distortion.
@@ -7371,7 +7910,6 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
            xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
 }
 
-#if CONFIG_EXT_INTER
 #if CONFIG_WEDGE
 static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
                                const BLOCK_SIZE bsize, const uint8_t *pred0,
@@ -7416,7 +7954,6 @@ static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
   return (tl + br > 0);
 }
 #endif  // CONFIG_WEDGE
-#endif  // CONFIG_EXT_INTER
 
 #if !CONFIG_DUAL_FILTER
 static InterpFilter predict_interp_filter(
@@ -7440,19 +7977,17 @@ static InterpFilter predict_interp_filter(
                   (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   if (pred_filter_search) {
     InterpFilter af = SWITCHABLE, lf = SWITCHABLE;
-    if (xd->up_available) af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
-    if (xd->left_available) lf = xd->mi[-1]->mbmi.interp_filter;
+    if (xd->up_available)
+      af = av1_extract_interp_filter(
+          xd->mi[-xd->mi_stride]->mbmi.interp_filters, 0);
+    if (xd->left_available)
+      lf = av1_extract_interp_filter(xd->mi[-1]->mbmi.interp_filters, 0);
 
-#if CONFIG_EXT_INTER
     if ((this_mode != NEWMV && this_mode != NEW_NEWMV) || (af == lf))
-#else
-    if ((this_mode != NEWMV) || (af == lf))
-#endif  // CONFIG_EXT_INTER
       best_filter = af;
   }
   if (is_comp_pred) {
     if (cpi->sf.adaptive_mode_search) {
-#if CONFIG_EXT_INTER
       switch (this_mode) {
         case NEAREST_NEARESTMV:
           if (single_filter[NEARESTMV][refs[0]] ==
@@ -7495,11 +8030,6 @@ static InterpFilter predict_interp_filter(
             best_filter = single_filter[this_mode][refs[0]];
           break;
       }
-#else
-      if (single_filter[this_mode][refs[0]] ==
-          single_filter[this_mode][refs[1]])
-        best_filter = single_filter[this_mode][refs[0]];
-#endif  // CONFIG_EXT_INTER
     }
   }
   if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
@@ -7509,7 +8039,6 @@ static InterpFilter predict_interp_filter(
 }
 #endif  // !CONFIG_DUAL_FILTER
 
-#if CONFIG_EXT_INTER
 // Choose the best wedge index and sign
 #if CONFIG_WEDGE
 static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
@@ -7924,7 +8453,6 @@ static int64_t build_and_cost_compound_type(
   return best_rd_cur;
 }
 #endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
-#endif  // CONFIG_EXT_INTER
 
 typedef struct {
 #if CONFIG_MOTION_VAR
@@ -7935,23 +8463,21 @@ typedef struct {
   int left_pred_stride[MAX_MB_PLANE];
 #endif  // CONFIG_MOTION_VAR
   int_mv *single_newmv;
-#if CONFIG_EXT_INTER
   // Pointer to array of motion vectors to use for each ref and their rates
   // Should point to first of 2 arrays in 2D array
   int *single_newmv_rate;
   // Pointer to array of predicted rate-distortion
   // Should point to first of 2 arrays in 2D array
   int64_t (*modelled_rd)[TOTAL_REFS_PER_FRAME];
-#endif  // CONFIG_EXT_INTER
   InterpFilter single_filter[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
 } HandleInterModeArgs;
 
 static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
                             const BLOCK_SIZE bsize,
                             int_mv (*const mode_mv)[TOTAL_REFS_PER_FRAME],
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
                             int_mv (*const mode_comp_mv)[TOTAL_REFS_PER_FRAME],
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
                             const int mi_row, const int mi_col,
                             int *const rate_mv, int_mv *const single_newmv,
                             HandleInterModeArgs *const args) {
@@ -7960,13 +8486,11 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
   const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const int is_comp_pred = has_second_ref(mbmi);
   const PREDICTION_MODE this_mode = mbmi->mode;
-#if CONFIG_EXT_INTER
   const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
-#endif  // CONFIG_EXT_INTER
   int_mv *const frame_mv = mode_mv[this_mode];
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
   int_mv *const frame_comp_mv = mode_comp_mv[this_mode];
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
   const int refs[2] = { mbmi->ref_frame[0],
                         mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
   int i;
@@ -7974,7 +8498,6 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
   (void)args;
 
   if (is_comp_pred) {
-#if CONFIG_EXT_INTER
     for (i = 0; i < 2; ++i) {
       single_newmv[refs[i]].as_int = args->single_newmv[refs[i]].as_int;
     }
@@ -7985,9 +8508,9 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
 
       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
         joint_motion_search(cpi, x, bsize, frame_mv,
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
                             NULL,  // int_mv *frame_comp_mv
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif                             // CONFIG_COMPOUND_SINGLEREF
                             mi_row, mi_col, NULL, NULL, 0, rate_mv, 0);
       } else {
         *rate_mv = 0;
@@ -8034,24 +8557,7 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
       }
     }
-#else   // !CONFIG_EXT_INTER
-    // Initialize mv using single prediction mode result.
-    frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
-    frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
-
-    if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
-      joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, rate_mv, 0);
-    } else {
-      *rate_mv = 0;
-      for (i = 0; i < 2; ++i) {
-        av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx);
-        *rate_mv += av1_mv_bit_cost(&frame_mv[refs[i]].as_mv,
-                                    &mbmi_ext->ref_mvs[refs[i]][0].as_mv,
-                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-      }
-    }
-#endif  // CONFIG_EXT_INTER
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
   } else if (is_inter_singleref_comp_mode(this_mode)) {
     // Single ref comp mode
     const int mode0 = compound_ref0_mode(this_mode);
@@ -8085,9 +8591,8 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
                                   &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
     }
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
   } else {
-#if CONFIG_EXT_INTER
     if (is_comp_interintra_pred) {
       x->best_mv = args->single_newmv[refs[0]];
       *rate_mv = args->single_newmv_rate[refs[0]];
@@ -8096,10 +8601,6 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
       args->single_newmv[refs[0]] = x->best_mv;
       args->single_newmv_rate[refs[0]] = *rate_mv;
     }
-#else
-    single_motion_search(cpi, x, bsize, mi_row, mi_col, rate_mv);
-    single_newmv[refs[0]] = x->best_mv;
-#endif  // CONFIG_EXT_INTER
 
     if (x->best_mv.as_int == INVALID_MV) return INT64_MAX;
 
@@ -8149,7 +8650,7 @@ int64_t interpolation_filter_search(
 
   set_default_interp_filters(mbmi, assign_filter);
 
-  *switchable_rate = av1_get_switchable_rate(cpi, xd);
+  *switchable_rate = av1_get_switchable_rate(cm, x, xd);
   av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
   model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist,
                   skip_txfm_sb, skip_sse_sb);
@@ -8164,12 +8665,7 @@ int64_t interpolation_filter_search(
       const int filter_set_size = SWITCHABLE_FILTERS;
 #endif  // CONFIG_DUAL_FILTER
       int best_in_temp = 0;
-#if CONFIG_DUAL_FILTER
-      InterpFilter best_filter[4];
-      av1_copy(best_filter, mbmi->interp_filter);
-#else
-      InterpFilter best_filter = mbmi->interp_filter;
-#endif  // CONFIG_DUAL_FILTER
+      InterpFilters best_filters = mbmi->interp_filters;
       restore_dst_buf(xd, *tmp_dst);
       // EIGHTTAP_REGULAR mode is calculated beforehand
       for (i = 1; i < filter_set_size; ++i) {
@@ -8178,14 +8674,12 @@ int64_t interpolation_filter_search(
         int tmp_rs;
         int64_t tmp_rd;
 #if CONFIG_DUAL_FILTER
-        mbmi->interp_filter[0] = filter_sets[i][0];
-        mbmi->interp_filter[1] = filter_sets[i][1];
-        mbmi->interp_filter[2] = filter_sets[i][0];
-        mbmi->interp_filter[3] = filter_sets[i][1];
+        mbmi->interp_filters =
+            av1_make_interp_filters(filter_sets[i][0], filter_sets[i][1]);
 #else
-        mbmi->interp_filter = (InterpFilter)i;
+        mbmi->interp_filters = av1_broadcast_interp_filter((InterpFilter)i);
 #endif  // CONFIG_DUAL_FILTER
-        tmp_rs = av1_get_switchable_rate(cpi, xd);
+        tmp_rs = av1_get_switchable_rate(cm, x, xd);
         av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
         model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
                         &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
@@ -8193,12 +8687,8 @@ int64_t interpolation_filter_search(
 
         if (tmp_rd < *rd) {
           *rd = tmp_rd;
-          *switchable_rate = av1_get_switchable_rate(cpi, xd);
-#if CONFIG_DUAL_FILTER
-          av1_copy(best_filter, mbmi->interp_filter);
-#else
-          best_filter = mbmi->interp_filter;
-#endif  // CONFIG_DUAL_FILTER
+          *switchable_rate = av1_get_switchable_rate(cm, x, xd);
+          best_filters = mbmi->interp_filters;
           *skip_txfm_sb = tmp_skip_sb;
           *skip_sse_sb = tmp_skip_sse;
           best_in_temp = !best_in_temp;
@@ -8214,24 +8704,29 @@ int64_t interpolation_filter_search(
       } else {
         restore_dst_buf(xd, *orig_dst);
       }
-#if CONFIG_DUAL_FILTER
-      av1_copy(mbmi->interp_filter, best_filter);
-#else
-      mbmi->interp_filter = best_filter;
-#endif  // CONFIG_DUAL_FILTER
+      mbmi->interp_filters = best_filters;
     } else {
-#if CONFIG_DUAL_FILTER
-      for (i = 0; i < 4; ++i)
-        assert(mbmi->interp_filter[i] == EIGHTTAP_REGULAR);
-#else
-      assert(mbmi->interp_filter == EIGHTTAP_REGULAR);
-#endif  // CONFIG_DUAL_FILTER
+      assert(mbmi->interp_filters ==
+             av1_broadcast_interp_filter(EIGHTTAP_REGULAR));
     }
   }
 
   return 0;
 }
 
+#if CONFIG_DUAL_FILTER
+// Returns |interp_filters| with each of its two filters replaced by
+// EIGHTTAP_REGULAR unless has_subpel_mv_component(xd->mi[0], xd, dir) != 0.
+static InterpFilters condition_interp_filters_on_mv(
+    InterpFilters interp_filters, const MACROBLOCKD *xd) {
+  InterpFilter kept[2] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR };
+  for (int dir = 0; dir < 2; ++dir)
+    if (has_subpel_mv_component(xd->mi[0], xd, dir))
+      kept[dir] = av1_extract_interp_filter(interp_filters, dir);
+  return av1_make_interp_filters(kept[0], kept[1]);
+}
+#endif  // CONFIG_DUAL_FILTER
+
 // TODO(afergs): Refactor the MBMI references in here - there's four
 // TODO(afergs): Refactor optional args - add them to a struct or remove
 static int64_t motion_mode_rd(
@@ -8242,10 +8737,8 @@ static int64_t motion_mode_rd(
     const int *refs, int rate_mv,
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
     // only used when WARPED_MOTION is on?
-    int_mv *const single_newmv,
-#if CONFIG_EXT_INTER
-    int rate2_bmc_nocoeff, MB_MODE_INFO *best_bmc_mbmi, int rate_mv_bmc,
-#endif  // CONFIG_EXT_INTER
+    int_mv *const single_newmv, int rate2_bmc_nocoeff,
+    MB_MODE_INFO *best_bmc_mbmi, int rate_mv_bmc,
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
     int rs, int *skip_txfm_sb, int64_t *skip_sse_sb, BUFFER_SET *orig_dst) {
   const AV1_COMMON *const cm = &cpi->common;
@@ -8263,6 +8756,9 @@ static int64_t motion_mode_rd(
   (void)rate_mv;
   (void)is_comp_pred;
   (void)this_mode;
+#if !CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
+  (void)single_newmv;
+#endif
 
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
   MOTION_MODE motion_mode, last_motion_mode_allowed;
@@ -8298,23 +8794,10 @@ static int64_t motion_mode_rd(
 #else
   mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
 #endif  // WARPED_MOTION_SORT_SAMPLES
-#if CONFIG_EXT_INTER
   best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0];
-#endif  // CONFIG_EXT_INTER
 #endif  // CONFIG_WARPED_MOTION
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
   rate2_nocoeff = rd_stats->rate;
-#if CONFIG_NCOBMC_ADAPT_WEIGHT
-  // We cannot estimate the rd cost for the motion mode NCOBMC_ADAPT_WEIGHT
-  // right now since it requires mvs from all neighboring blocks. We will
-  // check if this mode is beneficial after all the mv's in the current
-  // superblock are selected.
-  last_motion_mode_allowed = motion_mode_allowed_wrapper(1,
-#if CONFIG_GLOBAL_MOTION
-                                                         0, xd->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION
-                                                         mi);
-#else
   last_motion_mode_allowed = motion_mode_allowed(
 #if CONFIG_GLOBAL_MOTION
       0, xd->global_motion,
@@ -8323,7 +8806,6 @@ static int64_t motion_mode_rd(
       xd,
 #endif
       mi);
-#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
   base_mbmi = *mbmi;
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 
@@ -8334,54 +8816,44 @@ static int64_t motion_mode_rd(
     int64_t tmp_rd = INT64_MAX;
     int tmp_rate;
     int64_t tmp_dist;
-#if CONFIG_EXT_INTER
     int tmp_rate2 =
         motion_mode != SIMPLE_TRANSLATION ? rate2_bmc_nocoeff : rate2_nocoeff;
-#else
-    int tmp_rate2 = rate2_nocoeff;
-#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+    // We cannot estimate the rd cost for the motion mode NCOBMC_ADAPT_WEIGHT
+    // right now since it requires MVs from all neighboring blocks. We will
+    // check if this mode is beneficial after all the MVs in the current
+    // superblock are selected.
+    if (motion_mode == NCOBMC_ADAPT_WEIGHT) continue;
+#endif
 
     *mbmi = base_mbmi;
     mbmi->motion_mode = motion_mode;
 #if CONFIG_MOTION_VAR
     if (mbmi->motion_mode == OBMC_CAUSAL) {
-#if CONFIG_EXT_INTER
       *mbmi = *best_bmc_mbmi;
       mbmi->motion_mode = OBMC_CAUSAL;
-#endif  // CONFIG_EXT_INTER
       if (!is_comp_pred &&
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
           !is_inter_singleref_comp_mode(this_mode) &&
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
           have_newmv_in_inter_mode(this_mode)) {
         int tmp_rate_mv = 0;
 
-        single_motion_search(cpi, x, bsize, mi_row, mi_col,
-#if CONFIG_EXT_INTER
-                             0,
-#endif  // CONFIG_EXT_INTER
-                             &tmp_rate_mv);
+        single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv);
         mbmi->mv[0].as_int = x->best_mv.as_int;
         if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv,
                                 refs[0])) {
           tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
         }
-#if CONFIG_EXT_INTER
         tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv;
-#else
-        tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
-#endif  // CONFIG_EXT_INTER
 #if CONFIG_DUAL_FILTER
-        if (!has_subpel_mv_component(xd->mi[0], xd, 0))
-          mbmi->interp_filter[0] = EIGHTTAP_REGULAR;
-        if (!has_subpel_mv_component(xd->mi[0], xd, 1))
-          mbmi->interp_filter[1] = EIGHTTAP_REGULAR;
+        mbmi->interp_filters =
+            condition_interp_filters_on_mv(mbmi->interp_filters, xd);
 #endif  // CONFIG_DUAL_FILTER
         av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
-#if CONFIG_EXT_INTER
       } else {
         av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
-#endif  // CONFIG_EXT_INTER
       }
       av1_build_obmc_inter_prediction(
           cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride,
@@ -8396,20 +8868,11 @@ static int64_t motion_mode_rd(
 #if WARPED_MOTION_SORT_SAMPLES
       int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
 #endif  // WARPED_MOTION_SORT_SAMPLES
-#if CONFIG_EXT_INTER
       *mbmi = *best_bmc_mbmi;
       mbmi->motion_mode = WARPED_CAUSAL;
-#endif  // CONFIG_EXT_INTER
       mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE;
-#if CONFIG_DUAL_FILTER
-      for (int dir = 0; dir < 4; ++dir)
-        mbmi->interp_filter[dir] = cm->interp_filter == SWITCHABLE
-                                       ? EIGHTTAP_REGULAR
-                                       : cm->interp_filter;
-#else
-      mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR
-                                                            : cm->interp_filter;
-#endif  // CONFIG_DUAL_FILTER
+      mbmi->interp_filters = av1_broadcast_interp_filter(
+          av1_unswitchable_filter(cm->interp_filter));
 
 #if WARPED_MOTION_SORT_SAMPLES
       memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
@@ -8418,9 +8881,7 @@ static int64_t motion_mode_rd(
       if (mbmi->num_proj_ref[0] > 1) {
         mbmi->num_proj_ref[0] = sortSamples(pts_mv0, &mbmi->mv[0].as_mv, pts,
                                             pts_inref, mbmi->num_proj_ref[0]);
-#if CONFIG_EXT_INTER
         best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0];
-#endif  // CONFIG_EXT_INTER
       }
 #endif  // WARPED_MOTION_SORT_SAMPLES
 
@@ -8461,19 +8922,13 @@ static int64_t motion_mode_rd(
                                     refs[0])) {
               tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
             }
-#if CONFIG_EXT_INTER
 #if WARPED_MOTION_SORT_SAMPLES
             best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0];
 #endif  // WARPED_MOTION_SORT_SAMPLES
             tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv;
-#else
-            tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
-#endif  // CONFIG_EXT_INTER
 #if CONFIG_DUAL_FILTER
-            if (!has_subpel_mv_component(xd->mi[0], xd, 0))
-              mbmi->interp_filter[0] = EIGHTTAP_REGULAR;
-            if (!has_subpel_mv_component(xd->mi[0], xd, 1))
-              mbmi->interp_filter[1] = EIGHTTAP_REGULAR;
+            mbmi->interp_filters =
+                condition_interp_filters_on_mv(mbmi->interp_filters, xd);
 #endif  // CONFIG_DUAL_FILTER
           } else {
             // Restore the old MV and WM parameters.
@@ -8503,10 +8958,10 @@ static int64_t motion_mode_rd(
 #if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
       if (last_motion_mode_allowed == WARPED_CAUSAL)
 #endif  // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
-        rd_stats->rate += cpi->motion_mode_cost[bsize][mbmi->motion_mode];
+        rd_stats->rate += x->motion_mode_cost[bsize][mbmi->motion_mode];
 #if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
       else
-        rd_stats->rate += cpi->motion_mode_cost1[bsize][mbmi->motion_mode];
+        rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode];
 #endif  // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
     }
 #if CONFIG_WARPED_MOTION
@@ -8629,25 +9084,11 @@ static int64_t motion_mode_rd(
     }
 
 #if CONFIG_GLOBAL_MOTION
-    if (this_mode == ZEROMV
-#if CONFIG_EXT_INTER
-        || this_mode == ZERO_ZEROMV
-#endif  // CONFIG_EXT_INTER
-        ) {
+    if (this_mode == ZEROMV || this_mode == ZERO_ZEROMV) {
       if (is_nontrans_global_motion(xd)) {
         rd_stats->rate -= rs;
-#if CONFIG_DUAL_FILTER
-        mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE
-                                     ? EIGHTTAP_REGULAR
-                                     : cm->interp_filter;
-        mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE
-                                     ? EIGHTTAP_REGULAR
-                                     : cm->interp_filter;
-#else
-        mbmi->interp_filter = cm->interp_filter == SWITCHABLE
-                                  ? EIGHTTAP_REGULAR
-                                  : cm->interp_filter;
-#endif  // CONFIG_DUAL_FILTER
+        mbmi->interp_filters = av1_broadcast_interp_filter(
+            av1_unswitchable_filter(cm->interp_filter));
       }
     }
 #endif  // CONFIG_GLOBAL_MOTION
@@ -8697,48 +9138,43 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
                                  RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
                                  int *disable_skip,
                                  int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME],
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
                                  int_mv (*mode_comp_mv)[TOTAL_REFS_PER_FRAME],
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
                                  int mi_row, int mi_col,
                                  HandleInterModeArgs *args,
                                  const int64_t ref_best_rd) {
   const AV1_COMMON *cm = &cpi->common;
-  (void)cm;
   MACROBLOCKD *xd = &x->e_mbd;
   MODE_INFO *mi = xd->mi[0];
   MB_MODE_INFO *mbmi = &mi->mbmi;
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const int is_comp_pred = has_second_ref(mbmi);
   const int this_mode = mbmi->mode;
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
   const int is_singleref_comp_mode = is_inter_singleref_comp_mode(this_mode);
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
   int_mv *frame_mv = mode_mv[this_mode];
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
   // The comp mv for the compound mode in single ref
   int_mv *frame_comp_mv = mode_comp_mv[this_mode];
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
   int i;
   int refs[2] = { mbmi->ref_frame[0],
                   (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   int_mv cur_mv[2];
   int rate_mv = 0;
-#if CONFIG_EXT_INTER
   int pred_exists = 1;
 #if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA
   const int bw = block_size_wide[bsize];
-#endif  // ONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
+#endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
   int_mv single_newmv[TOTAL_REFS_PER_FRAME];
 #if CONFIG_INTERINTRA
-  const unsigned int *const interintra_mode_cost =
-      cpi->interintra_mode_cost[size_group_lookup[bsize]];
+  const int *const interintra_mode_cost =
+      x->interintra_mode_cost[size_group_lookup[bsize]];
 #endif  // CONFIG_INTERINTRA
   const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
   uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-#else
-  int_mv *const single_newmv = args->single_newmv;
-#endif  // CONFIG_EXT_INTER
 #if CONFIG_HIGHBITDEPTH
   DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
 #else
@@ -8747,11 +9183,9 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   uint8_t *tmp_buf;
 
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-#if CONFIG_EXT_INTER
   int rate2_bmc_nocoeff;
   MB_MODE_INFO best_bmc_mbmi;
   int rate_mv_bmc;
-#endif  // CONFIG_EXT_INTER
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
   int64_t rd = INT64_MAX;
   BUFFER_SET orig_dst, tmp_dst;
@@ -8766,7 +9200,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   mbmi->ncobmc_mode[1] = NO_OVERLAP;
 #endif
 
-#if CONFIG_EXT_INTER
 #if CONFIG_INTERINTRA
   int compmode_interintra_cost = 0;
   mbmi->use_wedge_interintra = 0;
@@ -8775,6 +9208,9 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   int compmode_interinter_cost = 0;
   mbmi->interinter_compound_type = COMPOUND_AVERAGE;
 #endif
+#if CONFIG_LGT_FROM_PRED
+  mbmi->use_lgt = 0;
+#endif
 
 #if CONFIG_INTERINTRA
   if (!cm->allow_interintra_compound && is_comp_interintra_pred)
@@ -8785,9 +9221,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   assert(!is_comp_interintra_pred || (!is_comp_pred));
   // is_comp_interintra_pred implies is_interintra_allowed(mbmi->sb_type)
   assert(!is_comp_interintra_pred || is_interintra_allowed(mbmi));
-#endif  // CONFIG_EXT_INTER
 
-#if CONFIG_EXT_INTER
 #if CONFIG_COMPOUND_SINGLEREF
   if (is_comp_pred || is_singleref_comp_mode)
 #else   // !CONFIG_COMPOUND_SINGLEREF
@@ -8795,7 +9229,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
 #endif  // CONFIG_COMPOUND_SINGLEREF
     mode_ctx = mbmi_ext->compound_mode_context[refs[0]];
   else
-#endif  // CONFIG_EXT_INTER
     mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
                                          mbmi->ref_frame, bsize, -1);
 
@@ -8818,21 +9251,21 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     if (frame_mv[refs[0]].as_int == INVALID_MV ||
         frame_mv[refs[1]].as_int == INVALID_MV)
       return INT64_MAX;
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
   } else if (is_singleref_comp_mode) {
     if (frame_mv[refs[0]].as_int == INVALID_MV ||
         frame_comp_mv[refs[0]].as_int == INVALID_MV)
       return INT64_MAX;
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
   }
 
   mbmi->motion_mode = SIMPLE_TRANSLATION;
   if (have_newmv_in_inter_mode(this_mode)) {
     const int64_t ret_val =
         handle_newmv(cpi, x, bsize, mode_mv,
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
                      mode_comp_mv,
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
                      mi_row, mi_col, &rate_mv, single_newmv, args);
     if (ret_val != 0)
       return ret_val;
@@ -8847,7 +9280,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     mbmi->mv[i].as_int = cur_mv[i].as_int;
   }
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
   if (!is_comp_pred && is_singleref_comp_mode) {
     cur_mv[1] = frame_comp_mv[refs[0]];
     // Clip "next_nearest" so that it does not extend to far out of image
@@ -8855,17 +9288,9 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX;
     mbmi->mv[1].as_int = cur_mv[1].as_int;
   }
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
 
-#if CONFIG_EXT_INTER
-  if (this_mode == NEAREST_NEARESTMV)
-#else
-  if (this_mode == NEARESTMV && is_comp_pred)
-#endif  // CONFIG_EXT_INTER
-  {
-#if !CONFIG_EXT_INTER
-    uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-#endif  // !CONFIG_EXT_INTER
+  if (this_mode == NEAREST_NEARESTMV) {
     if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
       cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
       cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
@@ -8878,7 +9303,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     }
   }
 
-#if CONFIG_EXT_INTER
   if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
 #if CONFIG_COMPOUND_SINGLEREF
     if (this_mode == NEAREST_NEWMV ||  // this_mode == SR_NEAREST_NEWMV ||
@@ -8889,7 +9313,12 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     {
       cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
 
+#if CONFIG_AMVR
+      lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv,
+                         cm->cur_frame_mv_precision_level);
+#else
       lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
+#endif
       clamp_mv2(&cur_mv[0].as_mv, xd);
       if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX;
       mbmi->mv[0].as_int = cur_mv[0].as_int;
@@ -8898,7 +9327,12 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     if (this_mode == NEW_NEARESTMV) {
       cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
 
+#if CONFIG_AMVR
+      lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv,
+                         cm->cur_frame_mv_precision_level);
+#else
       lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
+#endif
       clamp_mv2(&cur_mv[1].as_mv, xd);
       if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX;
       mbmi->mv[1].as_int = cur_mv[1].as_int;
@@ -8914,7 +9348,12 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
         this_mode == NEAR_NEARMV) {
       cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
 
+#if CONFIG_AMVR
+      lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv,
+                         cm->cur_frame_mv_precision_level);
+#else
       lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
+#endif
       clamp_mv2(&cur_mv[0].as_mv, xd);
       if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX;
       mbmi->mv[0].as_int = cur_mv[0].as_int;
@@ -8932,28 +9371,17 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
 #endif  // CONFIG_COMPOUND_SINGLEREF
         cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
 
+#if CONFIG_AMVR
+      lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv,
+                         cm->cur_frame_mv_precision_level);
+#else
       lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
+#endif
       clamp_mv2(&cur_mv[1].as_mv, xd);
       if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX;
       mbmi->mv[1].as_int = cur_mv[1].as_int;
     }
   }
-#else   // !CONFIG_EXT_INTER
-  if (this_mode == NEARMV && is_comp_pred) {
-    uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
-    if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
-      int ref_mv_idx = mbmi->ref_mv_idx + 1;
-      cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
-      cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
-
-      for (i = 0; i < 2; ++i) {
-        clamp_mv2(&cur_mv[i].as_mv, xd);
-        if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX;
-        mbmi->mv[i].as_int = cur_mv[i].as_int;
-      }
-    }
-  }
-#endif  // CONFIG_EXT_INTER
 
   // do first prediction into the destination buffer. Do the next
   // prediction into a temporary buffer. Then keep track of which one
@@ -8978,26 +9406,15 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
   // initiation of a motion field.
   if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv,
                           refs[0])) {
-#if CONFIG_EXT_INTER
-    rd_stats->rate +=
-        AOMMIN(cost_mv_ref(cpi, this_mode, mode_ctx),
-               cost_mv_ref(cpi, is_comp_pred ? NEAREST_NEARESTMV : NEARESTMV,
-                           mode_ctx));
-#else
-    rd_stats->rate += AOMMIN(cost_mv_ref(cpi, this_mode, mode_ctx),
-                             cost_mv_ref(cpi, NEARESTMV, mode_ctx));
-#endif  // CONFIG_EXT_INTER
+    rd_stats->rate += AOMMIN(
+        cost_mv_ref(x, this_mode, mode_ctx),
+        cost_mv_ref(x, is_comp_pred ? NEAREST_NEARESTMV : NEARESTMV, mode_ctx));
   } else {
-    rd_stats->rate += cost_mv_ref(cpi, this_mode, mode_ctx);
+    rd_stats->rate += cost_mv_ref(x, this_mode, mode_ctx);
   }
 
   if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
-#if CONFIG_EXT_INTER
-      mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV
-#else
-      mbmi->mode != NEARESTMV
-#endif  // CONFIG_EXT_INTER
-      )
+      mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV)
     return INT64_MAX;
 
   int64_t ret_val = interpolation_filter_search(
@@ -9005,7 +9422,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
       &rd, &rs, &skip_txfm_sb, &skip_sse_sb);
   if (ret_val != 0) return ret_val;
 
-#if CONFIG_EXT_INTER
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
   best_bmc_mbmi = *mbmi;
   rate2_bmc_nocoeff = rd_stats->rate;
@@ -9028,7 +9444,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     int best_tmp_rate_mv = rate_mv;
     int tmp_skip_txfm_sb;
     int64_t tmp_skip_sse_sb;
-    int compound_type_cost[COMPOUND_TYPES];
     DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]);
     DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]);
     uint8_t *preds0[1] = { pred0 };
@@ -9040,6 +9455,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     masked_compound_used = masked_compound_used && cm->allow_masked_compound;
 #endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
     COMPOUND_TYPE cur_type;
+    int best_compmode_interinter_cost = 0;
 
     best_mv[0].as_int = cur_mv[0].as_int;
     best_mv[1].as_int = cur_mv[1].as_int;
@@ -9049,7 +9465,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     best_compound_data.seg_mask = tmp_mask_buf;
 #endif  // CONFIG_COMPOUND_SEGMENT
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
     // TODO(zoeliu): To further check whether the following setups are needed.
     // Single ref compound mode: Prepare the 2nd ref frame predictor the same as
     // the 1st one.
@@ -9058,11 +9474,9 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
       for (i = 0; i < MAX_MB_PLANE; i++)
         xd->plane[i].pre[1] = xd->plane[i].pre[0];
     }
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
 
     if (masked_compound_used) {
-      av1_cost_tokens(compound_type_cost, cm->fc->compound_type_prob[bsize],
-                      av1_compound_type_tree);
       // get inter predictors to use for masked compound modes
       av1_build_inter_predictors_for_planes_single_buf(
           xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides);
@@ -9076,11 +9490,19 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
       tmp_rate_mv = rate_mv;
       best_rd_cur = INT64_MAX;
       mbmi->interinter_compound_type = cur_type;
+      int masked_type_cost = 0;
+      if (masked_compound_used) {
+#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
+        if (!is_interinter_compound_used(COMPOUND_WEDGE, bsize))
+          masked_type_cost += av1_cost_literal(1);
+        else
+#endif  // CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT
+          masked_type_cost +=
+              x->compound_type_cost[bsize][mbmi->interinter_compound_type];
+      }
       rs2 = av1_cost_literal(get_interinter_compound_type_bits(
                 bsize, mbmi->interinter_compound_type)) +
-            (masked_compound_used
-                 ? compound_type_cost[mbmi->interinter_compound_type]
-                 : 0);
+            masked_type_cost;
 
       switch (cur_type) {
         case COMPOUND_AVERAGE:
@@ -9130,6 +9552,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
 #endif  // CONFIG_COMPOUND_SEGMENT
         best_compound_data.interinter_compound_type =
             mbmi->interinter_compound_type;
+        best_compmode_interinter_cost = rs2;
         if (have_newmv_in_inter_mode(this_mode)) {
           if (use_masked_motion_search(cur_type)) {
             best_tmp_rate_mv = tmp_rate_mv;
@@ -9174,12 +9597,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
 
     pred_exists = 0;
 
-    compmode_interinter_cost =
-        av1_cost_literal(get_interinter_compound_type_bits(
-            bsize, mbmi->interinter_compound_type)) +
-        (masked_compound_used
-             ? compound_type_cost[mbmi->interinter_compound_type]
-             : 0);
+    compmode_interinter_cost = best_compmode_interinter_cost;
   }
 #endif  // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
 
@@ -9216,7 +9634,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     for (j = 0; j < INTERINTRA_MODES; ++j) {
       mbmi->interintra_mode = (INTERINTRA_MODE)j;
       rmode = interintra_mode_cost[mbmi->interintra_mode];
-      av1_build_intra_predictors_for_interintra(xd, bsize, 0, &orig_dst,
+      av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, &orig_dst,
                                                 intrapred, bw);
       av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
       model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
@@ -9229,7 +9647,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     }
     mbmi->interintra_mode = best_interintra_mode;
     rmode = interintra_mode_cost[mbmi->interintra_mode];
-    av1_build_intra_predictors_for_interintra(xd, bsize, 0, &orig_dst,
+    av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, &orig_dst,
                                               intrapred, bw);
     av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
     av1_subtract_plane(x, bsize, 0);
@@ -9340,16 +9758,11 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
                     &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
     rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist);
   }
-#endif  // CONFIG_EXT_INTER
 
   if (!is_comp_pred)
-#if CONFIG_DUAL_FILTER
-    args->single_filter[this_mode][refs[0]] = mbmi->interp_filter[0];
-#else
-    args->single_filter[this_mode][refs[0]] = mbmi->interp_filter;
-#endif  // CONFIG_DUAL_FILTER
+    args->single_filter[this_mode][refs[0]] =
+        av1_extract_interp_filter(mbmi->interp_filters, 0);
 
-#if CONFIG_EXT_INTER
   if (args->modelled_rd != NULL) {
     if (is_comp_pred) {
       const int mode0 = compound_ref0_mode(this_mode);
@@ -9364,7 +9777,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
       args->modelled_rd[this_mode][refs[0]] = rd;
     }
   }
-#endif  // CONFIG_EXT_INTER
 
   if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
     // if current pred_error modeled rd is substantially more than the best
@@ -9375,7 +9787,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
     }
   }
 
-#if CONFIG_EXT_INTER
 #if CONFIG_INTERINTRA
   rd_stats->rate += compmode_interintra_cost;
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
@@ -9385,18 +9796,14 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
 #if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
   rd_stats->rate += compmode_interinter_cost;
 #endif
-#endif
 
-  ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
-                           disable_skip, mode_mv, mi_row, mi_col, args,
-                           ref_best_rd, refs, rate_mv,
+  ret_val = motion_mode_rd(
+      cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip, mode_mv,
+      mi_row, mi_col, args, ref_best_rd, refs, rate_mv,
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-                           single_newmv,
-#if CONFIG_EXT_INTER
-                           rate2_bmc_nocoeff, &best_bmc_mbmi, rate_mv_bmc,
-#endif  // CONFIG_EXT_INTER
+      single_newmv, rate2_bmc_nocoeff, &best_bmc_mbmi, rate_mv_bmc,
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-                           rs, &skip_txfm_sb, &skip_sse_sb, &orig_dst);
+      rs, &skip_txfm_sb, &skip_sse_sb, &orig_dst);
   if (ret_val != 0) return ret_val;
 
   return 0;  // The rate-distortion cost will be re-calculated by caller.
@@ -9407,11 +9814,10 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
                                        RD_STATS *rd_cost, BLOCK_SIZE bsize,
                                        int64_t best_rd) {
   const AV1_COMMON *const cm = &cpi->common;
-  if (bsize < BLOCK_8X8 || !cm->allow_screen_content_tools) return INT64_MAX;
+  if (!av1_allow_intrabc(bsize, cm)) return INT64_MAX;
 
   MACROBLOCKD *const xd = &x->e_mbd;
   const TileInfo *tile = &xd->tile;
-  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
   MODE_INFO *const mi = xd->mi[0];
   const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE);
   const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE);
@@ -9425,11 +9831,8 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
   int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
   av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
                    mbmi_ext->ref_mv_stack[ref_frame],
-#if CONFIG_EXT_INTER
-                   mbmi_ext->compound_mode_context,
-#endif  // CONFIG_EXT_INTER
-                   candidates, mi_row, mi_col, NULL, NULL,
-                   mbmi_ext->mode_context);
+                   mbmi_ext->compound_mode_context, candidates, mi_row, mi_col,
+                   NULL, NULL, mbmi_ext->mode_context);
 
   int_mv nearestmv, nearmv;
   av1_find_best_ref_mvs(0, candidates, &nearestmv, &nearmv);
@@ -9495,9 +9898,16 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
     mvp_full.row >>= 3;
     int sadpb = x->sadperbit16;
     int cost_list[5];
+#if CONFIG_HASH_ME
+    int bestsme = av1_full_pixel_search(
+        cpi, x, bsize, &mvp_full, step_param, sadpb,
+        cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
+        (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1);
+#else
     int bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
                                         sadpb, cond_cost_list(cpi, cost_list),
                                         &dv_ref.as_mv, INT_MAX, 1);
+#endif
 
     x->mv_limits = tmp_mv_limits;
     if (bestsme == INT_MAX) continue;
@@ -9506,18 +9916,12 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
     if (mv_check_bounds(&x->mv_limits, &dv)) continue;
     if (!is_dv_valid(dv, tile, mi_row, mi_col, bsize)) continue;
 
-#if CONFIG_PALETTE
     memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info));
-#endif
     mbmi->use_intrabc = 1;
     mbmi->mode = DC_PRED;
     mbmi->uv_mode = UV_DC_PRED;
     mbmi->mv[0].as_mv = dv;
-#if CONFIG_DUAL_FILTER
-    for (int idx = 0; idx < 4; ++idx) mbmi->interp_filter[idx] = BILINEAR;
-#else
-    mbmi->interp_filter = BILINEAR;
-#endif
+    mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
     mbmi->skip = 0;
     x->skip = 0;
     av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
@@ -9527,8 +9931,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
     // in MV_COST_WEIGHT is too large. Explore other values.
     int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, x->nmvjointcost,
                                   x->mvcost, MV_COST_WEIGHT_SUB);
-    const int rate_mode = av1_cost_bit(ec_ctx->intrabc_prob, 1);
-
+    const int rate_mode = x->intrabc_cost[1];
     RD_STATS rd_stats, rd_stats_uv;
     av1_subtract_plane(x, bsize, 0);
     super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
@@ -9605,6 +10008,9 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
   mbmi->use_intrabc = 0;
   mbmi->mv[0].as_int = 0;
 #endif  // CONFIG_INTRABC
+#if CONFIG_LGT_FROM_PRED
+  mbmi->use_lgt = 0;
+#endif
 
   const int64_t intra_yrd =
       (bsize >= BLOCK_8X8 || unify_bsize)
@@ -9615,25 +10021,23 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
 
   if (intra_yrd < best_rd) {
 #if CONFIG_CFL
-    // Perform one extra txfm_rd_in_plane() call, this time with the best value
-    // so we can store reconstructed luma values
-    RD_STATS this_rd_stats;
-
 #if CONFIG_CB4X4
-    // Don't store the luma value if no chroma is associated.
-    // Don't worry, we will store this reconstructed luma in the following
-    // encode dry-run the chroma plane will never know.
-    x->cfl_store_y = !x->skip_chroma_rd;
+    // Only store reconstructed luma when there's chroma RDO. When there's no
+    // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+    xd->cfl->store_y = !x->skip_chroma_rd;
 #else
-    x->cfl_store_y = 1;
-#endif
-
-    txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y,
-                     mbmi->sb_type, mbmi->tx_size,
-                     cpi->sf.use_fast_coef_costing);
-
-    x->cfl_store_y = 0;
-#endif
+    xd->cfl->store_y = 1;
+#endif  // CONFIG_CB4X4
+    if (xd->cfl->store_y) {
+      // Perform one extra call to txfm_rd_in_plane(), with the values chosen
+      // during luma RDO, so we can store reconstructed luma values
+      RD_STATS this_rd_stats;
+      txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y,
+                       mbmi->sb_type, mbmi->tx_size,
+                       cpi->sf.use_fast_coef_costing);
+      xd->cfl->store_y = 0;
+    }
+#endif  // CONFIG_CFL
     max_uv_tx_size = uv_txsize_lookup[bsize][mbmi->tx_size][pd[1].subsampling_x]
                                      [pd[1].subsampling_y];
     init_sbuv_mode(mbmi);
@@ -9646,7 +10050,7 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
                             &uv_skip, AOMMAX(BLOCK_8X8, bsize), max_uv_tx_size);
 #endif  // CONFIG_CB4X4
 
-    if (y_skip && uv_skip) {
+    if (y_skip && (uv_skip || x->skip_chroma_rd)) {
       rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
                       av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
       rd_cost->dist = dist_y + dist_uv;
@@ -9656,9 +10060,6 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
       rd_cost->dist = dist_y + dist_uv;
     }
     rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-    rd_cost->dist_y = dist_y;
-#endif
   } else {
     rd_cost->rate = INT_MAX;
   }
@@ -9747,12 +10148,12 @@ int av1_active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) {
          av1_active_v_edge(cpi, mi_col, cpi->common.mib_size);
 }
 
-#if CONFIG_PALETTE
 static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const BLOCK_SIZE bsize = mbmi->sb_type;
+  assert(bsize >= BLOCK_8X8);
   int src_stride = x->plane[1].src.stride;
   const uint8_t *const src_u = x->plane[1].src.buf;
   const uint8_t *const src_v = x->plane[2].src.buf;
@@ -9796,24 +10197,20 @@ static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
   extend_palette_color_map(color_map, cols, rows, plane_block_width,
                            plane_block_height);
 }
-#endif  // CONFIG_PALETTE
 
 #if CONFIG_FILTER_INTRA
 static void pick_filter_intra_interframe(
-    const AV1_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
-    BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_uv_intra,
-    int *rate_uv_tokenonly, int64_t *dist_uv, int *skip_uv,
-    UV_PREDICTION_MODE *mode_uv,
+    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+    int mi_col, int *rate_uv_intra, int *rate_uv_tokenonly, int64_t *dist_uv,
+    int *skip_uv, UV_PREDICTION_MODE *mode_uv,
     FILTER_INTRA_MODE_INFO *filter_intra_mode_info_uv,
 #if CONFIG_EXT_INTRA
     int8_t *uv_angle_delta,
 #endif  // CONFIG_EXT_INTRA
-#if CONFIG_PALETTE
-    PALETTE_MODE_INFO *pmi_uv, int palette_ctx,
-#endif  // CONFIG_PALETTE
-    int skip_mask, unsigned int *ref_costs_single, int64_t *best_rd,
-    int64_t *best_intra_rd, PREDICTION_MODE *best_intra_mode,
-    int *best_mode_index, int *best_skip2, int *best_mode_skippable,
+    PALETTE_MODE_INFO *pmi_uv, int palette_ctx, int skip_mask,
+    unsigned int *ref_costs_single, int64_t *best_rd, int64_t *best_intra_rd,
+    PREDICTION_MODE *best_intra_mode, int *best_mode_index, int *best_skip2,
+    int *best_mode_skippable,
 #if CONFIG_SUPERTX
     int *returnrate_nocoef,
 #endif  // CONFIG_SUPERTX
@@ -9821,12 +10218,12 @@ static void pick_filter_intra_interframe(
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-#if CONFIG_PALETTE
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-#endif  // CONFIG_PALETTE
+  const int try_palette =
+      av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
   int rate2 = 0, rate_y = INT_MAX, skippable = 0, rate_uv, rate_dummy, i;
   int dc_mode_index;
-  const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]];
+  const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
   int64_t distortion2 = 0, distortion_y = 0, this_rd = *best_rd;
   int64_t distortion_uv, model_rd = INT64_MAX;
   TX_SIZE uv_tx;
@@ -9854,12 +10251,10 @@ static void pick_filter_intra_interframe(
   uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x]
                           [xd->plane[1].subsampling_y];
   if (rate_uv_intra[uv_tx] == INT_MAX) {
-    choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
+    choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx],
                          &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
                          &skip_uv[uv_tx], &mode_uv[uv_tx]);
-#if CONFIG_PALETTE
     if (cm->allow_screen_content_tools) pmi_uv[uv_tx] = *pmi;
-#endif  // CONFIG_PALETTE
     filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
 #if CONFIG_EXT_INTRA
     uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
@@ -9870,14 +10265,12 @@ static void pick_filter_intra_interframe(
   distortion_uv = dist_uv[uv_tx];
   skippable = skippable && skip_uv[uv_tx];
   mbmi->uv_mode = mode_uv[uv_tx];
-#if CONFIG_PALETTE
   if (cm->allow_screen_content_tools) {
     pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
     memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
            pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
            2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
   }
-#endif  // CONFIG_PALETTE
 #if CONFIG_EXT_INTRA
   mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
 #endif  // CONFIG_EXT_INTRA
@@ -9889,13 +10282,10 @@ static void pick_filter_intra_interframe(
   }
 
   rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
-          cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
-#if CONFIG_PALETTE
-  if (cpi->common.allow_screen_content_tools && mbmi->mode == DC_PRED &&
-      bsize >= BLOCK_8X8)
+          x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
+  if (try_palette && mbmi->mode == DC_PRED)
     rate2 += av1_cost_bit(
         av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0);
-#endif  // CONFIG_PALETTE
 
   if (!xd->lossless[mbmi->segment_id]) {
     // super_block_yrd above includes the cost of the tx_size in the
@@ -9910,7 +10300,7 @@ static void pick_filter_intra_interframe(
   rate2 += write_uniform_cost(
       FILTER_INTRA_MODES, mbmi->filter_intra_mode_info.filter_intra_mode[0]);
 #if CONFIG_EXT_INTRA
-  if (av1_is_directional_mode(mbmi->uv_mode, bsize) &&
+  if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize) &&
       av1_use_angle_delta(bsize)) {
     rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
                                 MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
@@ -9992,11 +10382,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
   const SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-#if CONFIG_PALETTE
   const int try_palette =
-      cpi->common.allow_screen_content_tools && bsize >= BLOCK_8X8;
+      av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-#endif  // CONFIG_PALETTE
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const struct segmentation *const seg = &cm->seg;
   PREDICTION_MODE this_mode;
@@ -10004,15 +10392,13 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
   unsigned char segment_id = mbmi->segment_id;
   int comp_pred, i, k;
   int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
   int_mv frame_comp_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
   struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE];
   int_mv single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } };
-#if CONFIG_EXT_INTER
   int single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 };
   int64_t modelled_rd[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
-#endif  // CONFIG_EXT_INTER
   static const int flag_list[TOTAL_REFS_PER_FRAME] = {
     0,
     AOM_LAST_FLAG,
@@ -10023,6 +10409,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     AOM_GOLD_FLAG,
 #if CONFIG_EXT_REFS
     AOM_BWD_FLAG,
+    AOM_ALT2_FLAG,
 #endif  // CONFIG_EXT_REFS
     AOM_ALT_FLAG
   };
@@ -10049,9 +10436,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
   int64_t dist_uvs[TX_SIZES_ALL];
   int skip_uvs[TX_SIZES_ALL];
   UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL];
-#if CONFIG_PALETTE
   PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL];
-#endif  // CONFIG_PALETTE
 #if CONFIG_EXT_INTRA
   int8_t uv_angle_delta[TX_SIZES_ALL];
   int is_directional_mode, angle_stats_ready = 0;
@@ -10063,14 +10448,14 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 #endif  // CONFIG_FILTER_INTRA
   const int intra_cost_penalty = av1_get_intra_cost_penalty(
       cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
-  const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]];
+  const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
   int best_skip2 = 0;
   uint16_t ref_frame_skip_mask[2] = { 0 };
   uint32_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 };
-#if CONFIG_EXT_INTER && CONFIG_INTERINTRA
+#if CONFIG_INTERINTRA
   MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME;
   int64_t best_single_inter_rd = INT64_MAX;
-#endif  // CONFIG_EXT_INTER && CONFIG_INTERINTRA
+#endif  // CONFIG_INTERINTRA
   int mode_skip_start = sf->mode_skip_start + 1;
   const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
   const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
@@ -10088,25 +10473,17 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     { NULL },
     { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
 #endif  // CONFIG_MOTION_VAR
-#if CONFIG_EXT_INTER
     NULL,
     NULL,
     NULL,
-#else   // CONFIG_EXT_INTER
-    NULL,
-#endif  // CONFIG_EXT_INTER
     { { 0 } },
   };
 
-#if CONFIG_PALETTE || CONFIG_EXT_INTRA
   const int rows = block_size_high[bsize];
   const int cols = block_size_wide[bsize];
-#endif  // CONFIG_PALETTE || CONFIG_EXT_INTRA
-#if CONFIG_PALETTE
   int palette_ctx = 0;
   const MODE_INFO *above_mi = xd->above_mi;
   const MODE_INFO *left_mi = xd->left_mi;
-#endif  // CONFIG_PALETTE
 #if CONFIG_MOTION_VAR
   int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
@@ -10141,7 +10518,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 
   av1_zero(best_mbmode);
 
-#if CONFIG_PALETTE
   av1_zero(pmi_uv);
   if (try_palette) {
     if (above_mi)
@@ -10149,7 +10525,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     if (left_mi)
       palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
   }
-#endif  // CONFIG_PALETTE
 
   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
                            &comp_mode_p);
@@ -10168,16 +10543,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
   *returnrate_nocoef = INT_MAX;
 #endif  // CONFIG_SUPERTX
 
-#if CONFIG_SPEED_REFS
-  memset(x->mbmi_ext->ref_mvs, 0, sizeof(x->mbmi_ext->ref_mvs));
-#endif  // CONFIG_SPEED_REFS
-
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     x->mbmi_ext->mode_context[ref_frame] = 0;
-#if CONFIG_EXT_INTER
     x->mbmi_ext->compound_mode_context[ref_frame] = 0;
-#endif  // CONFIG_EXT_INTER
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
       setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
@@ -10188,12 +10557,16 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     frame_mv[ZEROMV][ref_frame].as_int =
         gm_get_motion_vector(&cm->global_motion[ref_frame],
                              cm->allow_high_precision_mv, bsize, mi_col, mi_row,
-                             0)
+                             0
+#if CONFIG_AMVR
+                             ,
+                             cm->cur_frame_mv_precision_level
+#endif
+                             )
             .as_int;
 #else   // CONFIG_GLOBAL_MOTION
     frame_mv[ZEROMV][ref_frame].as_int = 0;
 #endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_EXT_INTER
     frame_mv[NEW_NEWMV][ref_frame].as_int = INVALID_MV;
 #if CONFIG_COMPOUND_SINGLEREF
     frame_mv[SR_NEW_NEWMV][ref_frame].as_int = INVALID_MV;
@@ -10203,12 +10576,16 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     frame_mv[ZERO_ZEROMV][ref_frame].as_int =
         gm_get_motion_vector(&cm->global_motion[ref_frame],
                              cm->allow_high_precision_mv, bsize, mi_col, mi_row,
-                             0)
+                             0
+#if CONFIG_AMVR
+                             ,
+                             cm->cur_frame_mv_precision_level
+#endif
+                             )
             .as_int;
 #else   // CONFIG_GLOBAL_MOTION
     frame_mv[ZERO_ZEROMV][ref_frame].as_int = 0;
 #endif  // CONFIG_GLOBAL_MOTION
-#endif  // CONFIG_EXT_INTER
   }
 
   for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
@@ -10217,11 +10594,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     x->mbmi_ext->mode_context[ref_frame] = 0;
     av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
                      mbmi_ext->ref_mv_stack[ref_frame],
-#if CONFIG_EXT_INTER
-                     mbmi_ext->compound_mode_context,
-#endif  // CONFIG_EXT_INTER
-                     candidates, mi_row, mi_col, NULL, NULL,
-                     mbmi_ext->mode_context);
+                     mbmi_ext->compound_mode_context, candidates, mi_row,
+                     mi_col, NULL, NULL, mbmi_ext->mode_context);
     if (mbmi_ext->ref_mv_count[ref_frame] < 2) {
       MV_REFERENCE_FRAME rf[2];
       av1_set_ref_frame(rf, ref_frame);
@@ -10257,25 +10631,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
-// Skip checking missing references in both single and compound reference
-// modes. Note that a mode will be skipped iff both reference frames
-// are masked out.
-#if CONFIG_EXT_COMP_REFS
+      // Skip checking missing references in both single and compound reference
+      // modes. Note that a mode will be skipped iff both reference frames
+      // are masked out.
       ref_frame_skip_mask[0] |= (1 << ref_frame);
       ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
-#else  // !CONFIG_EXT_COMP_REFS
-#if CONFIG_EXT_REFS
-      if (ref_frame == BWDREF_FRAME || ref_frame == ALTREF_FRAME) {
-        ref_frame_skip_mask[0] |= (1 << ref_frame);
-        ref_frame_skip_mask[1] |= ((1 << ref_frame) | 0x01);
-      } else {
-#endif  // CONFIG_EXT_REFS
-        ref_frame_skip_mask[0] |= (1 << ref_frame);
-        ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
-#if CONFIG_EXT_REFS
-      }
-#endif  // CONFIG_EXT_REFS
-#endif  // CONFIG_EXT_COMP_REFS
     } else {
       for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
         // Skip fixed mv modes for poor references
@@ -10307,7 +10667,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
       ref_frame_skip_mask[0] = (1 << LAST_FRAME) |
 #if CONFIG_EXT_REFS
                                (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |
-                               (1 << BWDREF_FRAME) |
+                               (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) |
 #endif  // CONFIG_EXT_REFS
                                (1 << GOLDEN_FRAME);
       ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
@@ -10317,7 +10677,12 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 #if CONFIG_GLOBAL_MOTION
       zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ALTREF_FRAME],
                                            cm->allow_high_precision_mv, bsize,
-                                           mi_col, mi_row, 0)
+                                           mi_col, mi_row, 0
+#if CONFIG_AMVR
+                                           ,
+                                           cm->cur_frame_mv_precision_level
+#endif
+                                           )
                           .as_int;
 #else
       zeromv.as_int = 0;
@@ -10326,7 +10691,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
         mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
       if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
         mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
-#if CONFIG_EXT_INTER
       if (frame_mv[NEAREST_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
         mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARESTMV);
       if (frame_mv[NEAR_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
@@ -10337,7 +10701,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
               zeromv.as_int)
         mode_skip_mask[ALTREF_FRAME] |= (1 << SR_NEAREST_NEARMV);
 #endif  // CONFIG_COMPOUND_SINGLEREF
-#endif  // CONFIG_EXT_INTER
     }
   }
 
@@ -10400,11 +10763,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 #if CONFIG_PVQ
   od_encode_checkpoint(&x->daala_enc, &pre_buf);
 #endif  // CONFIG_PVQ
-#if CONFIG_EXT_INTER
   for (i = 0; i < MB_MODE_COUNT; ++i)
     for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame)
       modelled_rd[i][ref_frame] = INT64_MAX;
-#endif  // CONFIG_EXT_INTER
 
   for (midx = 0; midx < MAX_MODES; ++midx) {
     int mode_index;
@@ -10414,10 +10775,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     int compmode_cost = 0;
     int rate2 = 0, rate_y = 0, rate_uv = 0;
     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-    int64_t distortion2_y = 0;
-    int64_t total_sse_y = INT64_MAX;
-#endif
     int skippable = 0;
     int this_skip2 = 0;
     int64_t total_sse = INT64_MAX;
@@ -10431,7 +10788,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     second_ref_frame = av1_mode_order[mode_index].ref_frame[1];
     mbmi->ref_mv_idx = 0;
 
-#if CONFIG_EXT_INTER
     if (ref_frame > INTRA_FRAME && second_ref_frame == INTRA_FRAME) {
       // Mode must by compatible
       if (!is_interintra_allowed_mode(this_mode)) continue;
@@ -10451,7 +10807,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
           frame_mv[compound_ref1_mode(this_mode)][ref_frame].as_int;
 #endif  // CONFIG_COMPOUND_SINGLEREF
     }
-#endif  // CONFIG_EXT_INTER
 
     // Look at the reference frame of the best mode so far and set the
     // skip mask to look at a subset of the remaining modes.
@@ -10481,6 +10836,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
           ref_frame_skip_mask[0] |= BWDREF_FRAME_MODE_MASK;
           ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
           break;
+        case ALTREF2_FRAME:
+          ref_frame_skip_mask[0] |= ALTREF2_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+          break;
 #endif  // CONFIG_EXT_REFS
         case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALTREF_FRAME_MODE_MASK;
 #if CONFIG_EXT_REFS
@@ -10537,7 +10896,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     // This is only used in motion vector unit test.
     if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue;
 
-#if CONFIG_ONE_SIDED_COMPOUND  // Changes LL bitstream
+#if CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS  // Changes LL bitstream
 #if CONFIG_EXT_REFS
     if (cpi->oxcf.pass == 0) {
       // Complexity-compression trade-offs
@@ -10546,8 +10905,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
       if (second_ref_frame == ALTREF_FRAME) continue;
       // if (second_ref_frame == BWDREF_FRAME) continue;
     }
-#endif
-#endif
+#endif  // CONFIG_EXT_REFS
+#endif  // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
       if (!cpi->allow_comp_inter_inter) continue;
@@ -10601,12 +10960,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     } else {
 #endif  // CONFIG_GLOBAL_MOTION
       const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, second_ref_frame };
-      if (!check_best_zero_mv(cpi, mbmi_ext->mode_context,
-#if CONFIG_EXT_INTER
-                              mbmi_ext->compound_mode_context,
-#endif  // CONFIG_EXT_INTER
-                              frame_mv, this_mode, ref_frames, bsize, -1,
-                              mi_row, mi_col))
+      if (!check_best_zero_mv(cpi, x, mbmi_ext->mode_context,
+                              mbmi_ext->compound_mode_context, frame_mv,
+                              this_mode, ref_frames, bsize, -1, mi_row, mi_col))
         continue;
     }
 
@@ -10614,10 +10970,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     mbmi->uv_mode = UV_DC_PRED;
     mbmi->ref_frame[0] = ref_frame;
     mbmi->ref_frame[1] = second_ref_frame;
-#if CONFIG_PALETTE
     pmi->palette_size[0] = 0;
     pmi->palette_size[1] = 0;
-#endif  // CONFIG_PALETTE
 #if CONFIG_FILTER_INTRA
     mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
     mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
@@ -10639,18 +10993,18 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
       if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
     }
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
     // Single ref compound mode
     if (!comp_pred && is_inter_singleref_comp_mode(mbmi->mode)) {
       xd->block_refs[1] = xd->block_refs[0];
       for (i = 0; i < MAX_MB_PLANE; i++)
         xd->plane[i].pre[1] = xd->plane[i].pre[0];
     }
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
 
-#if CONFIG_EXT_INTER && CONFIG_INTERINTRA
+#if CONFIG_INTERINTRA
     mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
-#endif  // CONFIG_EXT_INTER && CONFIG_INTERINTRA
+#endif  // CONFIG_INTERINTRA
 
     if (ref_frame == INTRA_FRAME) {
       RD_STATS rd_stats_y;
@@ -10699,12 +11053,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
       uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][pd->subsampling_x]
                               [pd->subsampling_y];
       if (rate_uv_intra[uv_tx] == INT_MAX) {
-        choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
+        choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx],
                              &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx],
                              &skip_uvs[uv_tx], &mode_uv[uv_tx]);
-#if CONFIG_PALETTE
         if (try_palette) pmi_uv[uv_tx] = *pmi;
-#endif  // CONFIG_PALETTE
 
 #if CONFIG_EXT_INTRA
         uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
@@ -10718,14 +11070,12 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
       distortion_uv = dist_uvs[uv_tx];
       skippable = skippable && skip_uvs[uv_tx];
       mbmi->uv_mode = mode_uv[uv_tx];
-#if CONFIG_PALETTE
       if (try_palette) {
         pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
         memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
                pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
                2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
       }
-#endif  // CONFIG_PALETTE
 
 #if CONFIG_EXT_INTRA
       mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
@@ -10742,20 +11092,18 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 #if CONFIG_CB4X4
       rate2 = rate_y + intra_mode_cost[mbmi->mode];
       if (!x->skip_chroma_rd)
-        rate2 += rate_uv + cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
+        rate2 += rate_uv + x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
 #else
       rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
-              cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
+              x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
 #endif  // CONFIG_CB4X4
 
-#if CONFIG_PALETTE
       if (try_palette && mbmi->mode == DC_PRED) {
         rate2 += av1_cost_bit(
             av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0);
       }
-#endif  // CONFIG_PALETTE
 
-      if (!xd->lossless[mbmi->segment_id] && bsize >= BLOCK_8X8) {
+      if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
         // super_block_yrd above includes the cost of the tx_size in the
         // tokenonly rate, but for intra blocks, tx_size is always coded
         // (prediction granularity), so we account for it in the full rate,
@@ -10769,14 +11117,14 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
         const int p_angle =
             mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
         if (av1_is_intra_filter_switchable(p_angle))
-          rate2 += cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
+          rate2 += x->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
 #endif  // CONFIG_INTRA_INTERP
         if (av1_use_angle_delta(bsize)) {
           rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
                                       MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
         }
       }
-      if (av1_is_directional_mode(mbmi->uv_mode, bsize) &&
+      if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize) &&
           av1_use_angle_delta(bsize)) {
         rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
                                     MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
@@ -10806,19 +11154,15 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
       if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (bsize < BLOCK_8X8) distortion2_y = distortion_y;
-#endif
     } else {
       int_mv backup_ref_mv[2];
 
-#if !SUB8X8_COMP_REF
-      if (bsize == BLOCK_4X4 && mbmi->ref_frame[1] > INTRA_FRAME) continue;
-#endif  // !SUB8X8_COMP_REF
+      if (!is_comp_ref_allowed(bsize) && mbmi->ref_frame[1] > INTRA_FRAME)
+        continue;
 
       backup_ref_mv[0] = mbmi_ext->ref_mvs[ref_frame][0];
       if (comp_pred) backup_ref_mv[1] = mbmi_ext->ref_mvs[second_ref_frame][0];
-#if CONFIG_EXT_INTER && CONFIG_INTERINTRA
+#if CONFIG_INTERINTRA
       if (second_ref_frame == INTRA_FRAME) {
         if (best_single_inter_ref != ref_frame) continue;
         mbmi->interintra_mode = intra_to_interintra_mode[best_intra_mode];
@@ -10836,11 +11180,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
         mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
 #endif  // CONFIG_FILTER_INTRA
       }
-#endif  // CONFIG_EXT_INTER && CONFIG_INTERINTRA
+#endif  // CONFIG_INTERINTRA
       mbmi->ref_mv_idx = 0;
       ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
 
-#if CONFIG_EXT_INTER
       if (comp_pred) {
         if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
           int ref_mv_idx = 0;
@@ -10887,7 +11230,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
         }
 #endif  // CONFIG_COMPOUND_SINGLEREF
       } else {
-#endif  // CONFIG_EXT_INTER
         if (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
           int ref;
           for (ref = 0; ref < 1 + comp_pred; ++ref) {
@@ -10899,38 +11241,21 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
             mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
           }
         }
-#if CONFIG_EXT_INTER
       }
-#endif  // CONFIG_EXT_INTER
       {
         RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
         av1_init_rd_stats(&rd_stats);
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        // While av1 master uses rd_stats_y.rate through out the codebase,
-        // which is set when handle_inter_moden is called, the daala-dist code
-        // in rd_pick_partition() for cb4x4 and sub8x8 blocks need to know
-        // .dist_y which comes from rd_stats_y.dist and rd_stats_y.sse.
-        // The problem is rd_stats_y.dist and rd_stats_y.sse are sometimes not
-        // initialized when rd_stats.skip = 1,
-        // then instead rd_stats.dist and rd_stats.sse have the
-        // combined luma and chroma dist and sse.
-        // This can be seen inside motion_mode_rd(), which is called by
-        // handle_inter_mode().
-        if (bsize < BLOCK_8X8) av1_init_rd_stats(&rd_stats_y);
-#endif
         rd_stats.rate = rate2;
 
         // Point to variables that are maintained between loop iterations
         args.single_newmv = single_newmv;
-#if CONFIG_EXT_INTER
         args.single_newmv_rate = single_newmv_rate;
         args.modelled_rd = modelled_rd;
-#endif  // CONFIG_EXT_INTER
         this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
                                     &rd_stats_uv, &disable_skip, frame_mv,
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
                                     frame_comp_mv,
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
                                     mi_row, mi_col, &args, best_rd);
 
         rate2 = rd_stats.rate;
@@ -10939,21 +11264,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
         total_sse = rd_stats.sse;
         rate_y = rd_stats_y.rate;
         rate_uv = rd_stats_uv.rate;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        if (bsize < BLOCK_8X8) {
-          if (rd_stats_y.rate != INT_MAX) {
-            assert(rd_stats_y.sse < INT64_MAX);
-            assert(rd_stats_y.dist < INT64_MAX);
-          }
-          total_sse_y = rd_stats_y.sse;
-          distortion2_y = rd_stats_y.dist;
-        }
-#endif
       }
 
 // TODO(jingning): This needs some refactoring to improve code quality
 // and reduce redundant steps.
-#if CONFIG_EXT_INTER
 #if CONFIG_COMPOUND_SINGLEREF
       if ((have_nearmv_in_inter_mode(mbmi->mode) &&
            mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
@@ -10966,11 +11280,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
           ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) &&
            mbmi_ext->ref_mv_count[ref_frame_type] > 1))
 #endif  // CONFIG_COMPOUND_SINGLEREF
-#else   // !CONFIG_EXT_INTER
-      if ((mbmi->mode == NEARMV &&
-           mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
-          (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1))
-#endif  // CONFIG_EXT_INTER
       {
         int_mv backup_mv = frame_mv[NEARMV][ref_frame];
         MB_MODE_INFO backup_mbmi = *mbmi;
@@ -10978,12 +11287,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
         int64_t tmp_ref_rd = this_rd;
         int ref_idx;
 
-// TODO(jingning): This should be deprecated shortly.
-#if CONFIG_EXT_INTER
+        // TODO(jingning): This should be deprecated shortly.
         int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
-#else
-        int idx_offset = (mbmi->mode == NEARMV) ? 1 : 0;
-#endif  // CONFIG_EXT_INTER
         int ref_set =
             AOMMIN(2, mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset);
 
@@ -10994,7 +11299,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
         backup_fmv[0] = frame_mv[NEWMV][ref_frame];
         if (comp_pred) backup_fmv[1] = frame_mv[NEWMV][second_ref_frame];
 
-        rate2 += (rate2 < INT_MAX ? cpi->drl_mode_cost0[drl_ctx][0] : 0);
+        rate2 += (rate2 < INT_MAX ? x->drl_mode_cost0[drl_ctx][0] : 0);
 
         if (this_rd < INT64_MAX) {
           if (RDCOST(x->rdmult, rate_y + rate_uv, distortion2) <
@@ -11003,10 +11308,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
                 x->rdmult, rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
                 distortion2);
           else
-            tmp_ref_rd = RDCOST(
-                x->rdmult, rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
-                               rate_y - rate_uv,
-                total_sse);
+            tmp_ref_rd =
+                RDCOST(x->rdmult,
+                       rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
+                           rate_y - rate_uv,
+                       total_sse);
         }
 #if CONFIG_VAR_TX
         for (i = 0; i < MAX_MB_PLANE; ++i)
@@ -11027,7 +11333,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 
           mbmi->ref_mv_idx = 1 + ref_idx;
 
-#if CONFIG_EXT_INTER
           if (comp_pred) {
             int ref_mv_idx = mbmi->ref_mv_idx;
             // Special case: NEAR_NEWMV and NEW_NEARMV modes use
@@ -11092,7 +11397,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
             }
 #endif  // CONFIG_COMPOUND_SINGLEREF
           } else {
-#endif  // CONFIG_EXT_INTER
             for (ref = 0; ref < 1 + comp_pred; ++ref) {
               int_mv this_mv =
                   (ref == 0)
@@ -11104,9 +11408,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
                            xd->n8_h << MI_SIZE_LOG2, xd);
               mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
             }
-#if CONFIG_EXT_INTER
           }
-#endif
 
           cur_mv =
               mbmi_ext->ref_mv_stack[ref_frame][mbmi->ref_mv_idx + idx_offset]
@@ -11115,39 +11417,25 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 
           if (!mv_check_bounds(&x->mv_limits, &cur_mv.as_mv)) {
             int_mv dummy_single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } };
-#if CONFIG_EXT_INTER
             int dummy_single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 };
-#endif  // CONFIG_EXT_INTER
 
             frame_mv[NEARMV][ref_frame] = cur_mv;
             av1_init_rd_stats(&tmp_rd_stats);
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-            // With the same reason as 'rd_stats_y' passed to above
-            // handle_inter_mode(), tmp_rd_stats_y.dist and
-            // tmp_rd_stats_y.sse are sometimes not initialized, esp. when
-            // tmp_rd_stats.skip = 1 and tmp_rd_stats.dist and .sse
-            // represent combined luma and chroma .dist and .sse,
-            // we should initialized tmp_rd_stats_y.
-            if (bsize < BLOCK_8X8) av1_init_rd_stats(&tmp_rd_stats_y);
-#endif
+
             // Point to variables that are not maintained between iterations
             args.single_newmv = dummy_single_newmv;
-#if CONFIG_EXT_INTER
             args.single_newmv_rate = dummy_single_newmv_rate;
             args.modelled_rd = NULL;
-#endif  // CONFIG_EXT_INTER
             tmp_alt_rd = handle_inter_mode(cpi, x, bsize, &tmp_rd_stats,
                                            &tmp_rd_stats_y, &tmp_rd_stats_uv,
                                            &dummy_disable_skip, frame_mv,
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
                                            frame_comp_mv,
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
                                            mi_row, mi_col, &args, best_rd);
             // Prevent pointers from escaping local scope
             args.single_newmv = NULL;
-#if CONFIG_EXT_INTER
             args.single_newmv_rate = NULL;
-#endif  // CONFIG_EXT_INTER
           }
 
           for (i = 0; i < mbmi->ref_mv_idx; ++i) {
@@ -11155,7 +11443,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
             drl1_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
                                    i + idx_offset);
             tmp_rd_stats.rate +=
-                (tmp_rd_stats.rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][1]
+                (tmp_rd_stats.rate < INT_MAX ? x->drl_mode_cost0[drl1_ctx][1]
                                              : 0);
           }
 
@@ -11166,7 +11454,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
                 av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
                             mbmi->ref_mv_idx + idx_offset);
             tmp_rd_stats.rate +=
-                (tmp_rd_stats.rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][0]
+                (tmp_rd_stats.rate < INT_MAX ? x->drl_mode_cost0[drl1_ctx][0]
                                              : 0);
           }
 
@@ -11178,16 +11466,18 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
             if (RDCOST(x->rdmult, tmp_rd_stats_y.rate + tmp_rd_stats_uv.rate,
                        tmp_rd_stats.dist) <
                 RDCOST(x->rdmult, 0, tmp_rd_stats.sse))
-              tmp_alt_rd = RDCOST(
-                  x->rdmult, tmp_rd_stats.rate +
-                                 av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
-                  tmp_rd_stats.dist);
+              tmp_alt_rd =
+                  RDCOST(x->rdmult,
+                         tmp_rd_stats.rate +
+                             av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
+                         tmp_rd_stats.dist);
             else
-              tmp_alt_rd = RDCOST(
-                  x->rdmult, tmp_rd_stats.rate +
-                                 av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
-                                 tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate,
-                  tmp_rd_stats.sse);
+              tmp_alt_rd =
+                  RDCOST(x->rdmult,
+                         tmp_rd_stats.rate +
+                             av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
+                             tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate,
+                         tmp_rd_stats.sse);
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
           }
 
@@ -11203,16 +11493,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
             tmp_ref_rd = tmp_alt_rd;
             backup_mbmi = *mbmi;
             backup_skip = x->skip;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-            if (bsize < BLOCK_8X8) {
-              if (tmp_rd_stats_y.rate != INT_MAX) {
-                assert(tmp_rd_stats_y.sse < INT64_MAX);
-                assert(tmp_rd_stats_y.dist < INT64_MAX);
-              }
-              total_sse_y = tmp_rd_stats_y.sse;
-              distortion2_y = tmp_rd_stats_y.dist;
-            }
-#endif
 #if CONFIG_VAR_TX
             for (i = 0; i < MAX_MB_PLANE; ++i)
               memcpy(x->blk_skip_drl[i], x->blk_skip[i],
@@ -11238,12 +11518,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 
       if (this_rd == INT64_MAX) continue;
 
-#if SUB8X8_COMP_REF
-      compmode_cost = av1_cost_bit(comp_mode_p, comp_pred);
-#else
-      if (mbmi->sb_type != BLOCK_4X4)
+      if (is_comp_ref_allowed(mbmi->sb_type))
         compmode_cost = av1_cost_bit(comp_mode_p, comp_pred);
-#endif  // SUB8X8_COMP_REF
 
       if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost;
     }
@@ -11263,14 +11539,14 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
       rate2 += ref_costs_single[ref_frame];
     }
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
     // Add the cost to signal single/comp mode in single ref.
     if (!comp_pred && cm->reference_mode != COMPOUND_REFERENCE) {
       aom_prob singleref_comp_mode_p = av1_get_inter_mode_prob(cm, xd);
       rate2 += av1_cost_bit(singleref_comp_mode_p,
                             is_inter_singleref_comp_mode(mbmi->mode));
     }
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
 
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
     if (ref_frame == INTRA_FRAME)
@@ -11299,12 +11575,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
           this_skip2 = 1;
           rate_y = 0;
           rate_uv = 0;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-          if (bsize < BLOCK_8X8) {
-            assert(total_sse_y < INT64_MAX);
-            distortion2_y = total_sse_y;
-          }
-#endif
         }
       } else {
         // Add in the cost of the no skip flag.
@@ -11324,25 +11594,19 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
     }
 
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-    if ((bsize < BLOCK_8X8) && (rate2 != INT_MAX)) {
-      assert(distortion2_y < INT64_MAX);
-    }
-#endif
-
     if (ref_frame == INTRA_FRAME) {
       // Keep record of best intra rd
       if (this_rd < best_intra_rd) {
         best_intra_rd = this_rd;
         best_intra_mode = mbmi->mode;
       }
-#if CONFIG_EXT_INTER && CONFIG_INTERINTRA
+#if CONFIG_INTERINTRA
     } else if (second_ref_frame == NONE_FRAME) {
       if (this_rd < best_single_inter_rd) {
         best_single_inter_rd = this_rd;
         best_single_inter_ref = mbmi->ref_frame[0];
       }
-#endif  // CONFIG_EXT_INTER && CONFIG_INTERINTRA
+#endif  // CONFIG_INTERINTRA
     }
 
     if (!disable_skip && ref_frame == INTRA_FRAME) {
@@ -11388,12 +11652,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 #endif
             mi);
         if (motion_allowed == WARPED_CAUSAL)
-          *returnrate_nocoef -= cpi->motion_mode_cost[bsize][mbmi->motion_mode];
+          *returnrate_nocoef -= x->motion_mode_cost[bsize][mbmi->motion_mode];
         else if (motion_allowed == OBMC_CAUSAL)
-          *returnrate_nocoef -=
-              cpi->motion_mode_cost1[bsize][mbmi->motion_mode];
+          *returnrate_nocoef -= x->motion_mode_cost1[bsize][mbmi->motion_mode];
 #else
-        *returnrate_nocoef -= cpi->motion_mode_cost[bsize][mbmi->motion_mode];
+        *returnrate_nocoef -= x->motion_mode_cost[bsize][mbmi->motion_mode];
 #endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 #endif  // CONFIG_SUPERTX
@@ -11406,12 +11669,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
         best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd),
                                             this_skip2 || skippable);
         best_rate_uv = rate_uv;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        if (bsize < BLOCK_8X8) {
-          assert(distortion2_y < INT64_MAX);
-          rd_cost->dist_y = distortion2_y;
-        }
-#endif
 #if CONFIG_VAR_TX
         for (i = 0; i < MAX_MB_PLANE; ++i)
           memcpy(ctx->blk_skip[i], x->blk_skip[i],
@@ -11419,11 +11676,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 #endif  // CONFIG_VAR_TX
       }
     }
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-    if ((bsize < BLOCK_8X8) && (rd_cost->rate != INT_MAX)) {
-      assert(rd_cost->dist_y < INT64_MAX);
-    }
-#endif
+
     /* keep record of best compound/single-only prediction */
     if (!disable_skip && ref_frame != INTRA_FRAME) {
       int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
@@ -11475,14 +11728,14 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
         xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
     }
 
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
     // Single ref compound mode
     if (!has_second_ref(mbmi) && is_inter_singleref_comp_mode(mbmi->mode)) {
       xd->block_refs[1] = xd->block_refs[0];
       for (i = 0; i < MAX_MB_PLANE; i++)
         xd->plane[i].pre[1] = xd->plane[i].pre[0];
     }
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#endif  // CONFIG_COMPOUND_SINGLEREF
 
     if (is_inter_mode(mbmi->mode)) {
       av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
@@ -11497,6 +11750,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 #if CONFIG_VAR_TX
       if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) {
         select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+        assert(rd_stats_y.rate != INT_MAX);
       } else {
         int idx, idy;
         super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
@@ -11538,6 +11792,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
 #endif  // CONFIG_VAR_TX
       best_mbmode.tx_type = mbmi->tx_type;
       best_mbmode.tx_size = mbmi->tx_size;
+#if CONFIG_LGT_FROM_PRED
+      best_mbmode.use_lgt = mbmi->use_lgt;
+#endif
 #if CONFIG_VAR_TX
       for (idy = 0; idy < xd->n8_h; ++idy)
         for (idx = 0; idx < xd->n8_w; ++idx)
@@ -11554,23 +11811,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
       rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
       rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
       best_skip2 = skip_blk;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (bsize < BLOCK_8X8) {
-        assert(rd_cost->rate != INT_MAX);
-        assert(rd_cost->dist_y < INT64_MAX);
-        rd_cost->dist_y = rd_stats_y.dist;
-      }
-#endif
     }
   }
 
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  if ((bsize < BLOCK_8X8) && (rd_cost->rate != INT_MAX)) {
-    assert(rd_cost->dist_y < INT64_MAX);
-  }
-#endif
-
-#if CONFIG_PALETTE
   // Only try palette mode when the best mode so far is an intra mode.
   if (try_palette && !is_inter_mode(best_mbmode.mode)) {
     int rate2 = 0;
@@ -11603,7 +11846,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x]
                             [xd->plane[1].subsampling_y];
     if (rate_uv_intra[uv_tx] == INT_MAX) {
-      choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
+      choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx],
                            &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx],
                            &skip_uvs[uv_tx], &mode_uv[uv_tx]);
       pmi_uv[uv_tx] = *pmi;
@@ -11666,28 +11909,21 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
     }
   }
 PALETTE_EXIT:
-#endif  // CONFIG_PALETTE
 
 #if CONFIG_FILTER_INTRA
   // TODO(huisu): filter-intra is turned off in lossless mode for now to
   // avoid a unit test failure
-  if (!xd->lossless[mbmi->segment_id] &&
-#if CONFIG_PALETTE
-      pmi->palette_size[0] == 0 &&
-#endif  // CONFIG_PALETTE
+  if (!xd->lossless[mbmi->segment_id] && pmi->palette_size[0] == 0 &&
       !dc_skipped && best_mode_index >= 0 &&
       best_intra_rd < (best_rd + (best_rd >> 3))) {
     pick_filter_intra_interframe(
-        cpi, x, ctx, bsize, mi_row, mi_col, rate_uv_intra, rate_uv_tokenonly,
+        cpi, x, bsize, mi_row, mi_col, rate_uv_intra, rate_uv_tokenonly,
         dist_uvs, skip_uvs, mode_uv, filter_intra_mode_info_uv,
 #if CONFIG_EXT_INTRA
         uv_angle_delta,
 #endif  // CONFIG_EXT_INTRA
-#if CONFIG_PALETTE
-        pmi_uv, palette_ctx,
-#endif  // CONFIG_PALETTE
-        0, ref_costs_single, &best_rd, &best_intra_rd, &best_intra_mode,
-        &best_mode_index, &best_skip2, &best_mode_skippable,
+        pmi_uv, palette_ctx, 0, ref_costs_single, &best_rd, &best_intra_rd,
+        &best_intra_mode, &best_mode_index, &best_skip2, &best_mode_skippable,
 #if CONFIG_SUPERTX
         returnrate_nocoef,
 #endif  // CONFIG_SUPERTX
@@ -11699,15 +11935,11 @@ PALETTE_EXIT:
 // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and
 // ZEROMV. Here, checks are added for those cases, and the mode decisions
 // are corrected.
-#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+#if CONFIG_COMPOUND_SINGLEREF
 // NOTE: For SR_NEW_NEWMV, no need to check as the two mvs from the same ref
 //       are surely different from each other.
-#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
-  if (best_mbmode.mode == NEWMV
-#if CONFIG_EXT_INTER
-      || best_mbmode.mode == NEW_NEWMV
-#endif  // CONFIG_EXT_INTER
-      ) {
+#endif  // CONFIG_COMPOUND_SINGLEREF
+  if (best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV) {
     const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0],
                                          best_mbmode.ref_frame[1] };
     int comp_pred_mode = refs[1] > INTRA_FRAME;
@@ -11716,14 +11948,25 @@ PALETTE_EXIT:
 #if CONFIG_GLOBAL_MOTION
     zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]],
                                             cm->allow_high_precision_mv, bsize,
-                                            mi_col, mi_row, 0)
+                                            mi_col, mi_row, 0
+#if CONFIG_AMVR
+                                            ,
+                                            cm->cur_frame_mv_precision_level
+#endif
+                                            )
                            .as_int;
-    zeromv[1].as_int = comp_pred_mode
-                           ? gm_get_motion_vector(&cm->global_motion[refs[1]],
-                                                  cm->allow_high_precision_mv,
-                                                  bsize, mi_col, mi_row, 0)
-                                 .as_int
-                           : 0;
+    zeromv[1].as_int =
+        comp_pred_mode
+            ? gm_get_motion_vector(&cm->global_motion[refs[1]],
+                                   cm->allow_high_precision_mv, bsize, mi_col,
+                                   mi_row, 0
+#if CONFIG_AMVR
+                                   ,
+                                   cm->cur_frame_mv_precision_level
+#endif
+                                   )
+                  .as_int
+            : 0;
 #else
     zeromv[0].as_int = 0;
     zeromv[1].as_int = 0;
@@ -11749,7 +11992,6 @@ PALETTE_EXIT:
       int_mv nearestmv[2];
       int_mv nearmv[2];
 
-#if CONFIG_EXT_INTER
       if (mbmi_ext->ref_mv_count[rf_type] > 1) {
         nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][1].this_mv;
         nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][1].comp_mv;
@@ -11757,22 +11999,6 @@ PALETTE_EXIT:
         nearmv[0] = frame_mv[NEARMV][refs[0]];
         nearmv[1] = frame_mv[NEARMV][refs[1]];
       }
-#else
-      int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
-                        ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
-                        : INT_MAX;
-
-      for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
-        nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
-        nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv;
-
-        if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
-            nearmv[1].as_int == best_mbmode.mv[1].as_int) {
-          best_mbmode.mode = NEARMV;
-          best_mbmode.ref_mv_idx = i;
-        }
-      }
-#endif  // CONFIG_EXT_INTER
       if (mbmi_ext->ref_mv_count[rf_type] >= 1) {
         nearestmv[0] = mbmi_ext->ref_mv_stack[rf_type][0].this_mv;
         nearestmv[1] = mbmi_ext->ref_mv_stack[rf_type][0].comp_mv;
@@ -11782,9 +12008,7 @@ PALETTE_EXIT:
       }
 
       if (nearestmv[0].as_int == best_mbmode.mv[0].as_int &&
-          nearestmv[1].as_int == best_mbmode.mv[1].as_int)
-#if CONFIG_EXT_INTER
-      {
+          nearestmv[1].as_int == best_mbmode.mv[1].as_int) {
         best_mbmode.mode = NEAREST_NEARESTMV;
       } else {
         int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
@@ -11808,21 +12032,12 @@ PALETTE_EXIT:
             best_mbmode.mv[1].as_int == zeromv[1].as_int)
           best_mbmode.mode = ZERO_ZEROMV;
       }
-#else
-      {
-        best_mbmode.mode = NEARESTMV;
-      } else if (best_mbmode.mv[0].as_int == zeromv[0].as_int &&
-                 best_mbmode.mv[1].as_int == zeromv[1].as_int) {
-        best_mbmode.mode = ZEROMV;
-      }
-#endif  // CONFIG_EXT_INTER
     }
   }
 
   // Make sure that the ref_mv_idx is only nonzero when we're
   // using a mode which can support ref_mv_idx
   if (best_mbmode.ref_mv_idx != 0 &&
-#if CONFIG_EXT_INTER
 #if CONFIG_COMPOUND_SINGLEREF
       !(best_mbmode.mode == NEWMV || best_mbmode.mode == SR_NEW_NEWMV ||
         best_mbmode.mode == NEW_NEWMV ||
@@ -11831,45 +12046,31 @@ PALETTE_EXIT:
       !(best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV ||
         have_nearmv_in_inter_mode(best_mbmode.mode)))
 #endif  // CONFIG_COMPOUND_SINGLEREF
-#else   // !CONFIG_EXT_INTER
-      !(best_mbmode.mode == NEARMV || best_mbmode.mode == NEWMV))
-#endif  // CONFIG_EXT_INTER
   {
     best_mbmode.ref_mv_idx = 0;
   }
 
-  {
+  if (best_mbmode.ref_frame[0] > INTRA_FRAME &&
+      best_mbmode.ref_frame[1] <= INTRA_FRAME) {
     int8_t ref_frame_type = av1_ref_frame_type(best_mbmode.ref_frame);
     int16_t mode_ctx = mbmi_ext->mode_context[ref_frame_type];
     if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) {
-      int_mv zeromv[2];
+      int_mv zeromv;
 #if CONFIG_GLOBAL_MOTION
-      const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0],
-                                           best_mbmode.ref_frame[1] };
-      zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]],
-                                              cm->allow_high_precision_mv,
-                                              bsize, mi_col, mi_row, 0)
-                             .as_int;
-      zeromv[1].as_int = (refs[1] != NONE_FRAME)
-                             ? gm_get_motion_vector(&cm->global_motion[refs[1]],
-                                                    cm->allow_high_precision_mv,
-                                                    bsize, mi_col, mi_row, 0)
-                                   .as_int
-                             : 0;
-      lower_mv_precision(&zeromv[0].as_mv, cm->allow_high_precision_mv);
-      lower_mv_precision(&zeromv[1].as_mv, cm->allow_high_precision_mv);
+      const MV_REFERENCE_FRAME ref = best_mbmode.ref_frame[0];
+      zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ref],
+                                           cm->allow_high_precision_mv, bsize,
+                                           mi_col, mi_row, 0
+#if CONFIG_AMVR
+                                           ,
+                                           cm->cur_frame_mv_precision_level
+#endif
+                                           )
+                          .as_int;
 #else
-      zeromv[0].as_int = zeromv[1].as_int = 0;
+      zeromv.as_int = 0;
 #endif  // CONFIG_GLOBAL_MOTION
-      if (best_mbmode.ref_frame[0] > INTRA_FRAME &&
-          best_mbmode.mv[0].as_int == zeromv[0].as_int &&
-#if CONFIG_EXT_INTER
-          (best_mbmode.ref_frame[1] <= INTRA_FRAME)
-#else
-          (best_mbmode.ref_frame[1] == NONE_FRAME ||
-           best_mbmode.mv[1].as_int == zeromv[1].as_int)
-#endif  // CONFIG_EXT_INTER
-              ) {
+      if (best_mbmode.mv[0].as_int == zeromv.as_int) {
         best_mbmode.mode = ZEROMV;
       }
     }
@@ -11881,24 +12082,14 @@ PALETTE_EXIT:
     return;
   }
 
-#if CONFIG_DUAL_FILTER
   assert((cm->interp_filter == SWITCHABLE) ||
-         (cm->interp_filter == best_mbmode.interp_filter[0]) ||
+         (cm->interp_filter ==
+          av1_extract_interp_filter(best_mbmode.interp_filters, 0)) ||
          !is_inter_block(&best_mbmode));
+#if CONFIG_DUAL_FILTER
   assert((cm->interp_filter == SWITCHABLE) ||
-         (cm->interp_filter == best_mbmode.interp_filter[1]) ||
-         !is_inter_block(&best_mbmode));
-  if (best_mbmode.ref_frame[1] > INTRA_FRAME) {
-    assert((cm->interp_filter == SWITCHABLE) ||
-           (cm->interp_filter == best_mbmode.interp_filter[2]) ||
-           !is_inter_block(&best_mbmode));
-    assert((cm->interp_filter == SWITCHABLE) ||
-           (cm->interp_filter == best_mbmode.interp_filter[3]) ||
-           !is_inter_block(&best_mbmode));
-  }
-#else
-  assert((cm->interp_filter == SWITCHABLE) ||
-         (cm->interp_filter == best_mbmode.interp_filter) ||
+         (cm->interp_filter ==
+          av1_extract_interp_filter(best_mbmode.interp_filters, 1)) ||
          !is_inter_block(&best_mbmode));
 #endif  // CONFIG_DUAL_FILTER
 
@@ -11913,11 +12104,7 @@ PALETTE_EXIT:
 // Note: this section is needed since the mode may have been forced to
 // ZEROMV by the all-zero mode handling of ref-mv.
 #if CONFIG_GLOBAL_MOTION
-  if (mbmi->mode == ZEROMV
-#if CONFIG_EXT_INTER
-      || mbmi->mode == ZERO_ZEROMV
-#endif  // CONFIG_EXT_INTER
-      ) {
+  if (mbmi->mode == ZEROMV || mbmi->mode == ZERO_ZEROMV) {
 #if CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR
     // Correct the motion mode for ZEROMV
     const MOTION_MODE last_motion_mode_allowed =
@@ -11932,17 +12119,8 @@ PALETTE_EXIT:
 
     // Correct the interpolation filter for ZEROMV
     if (is_nontrans_global_motion(xd)) {
-#if CONFIG_DUAL_FILTER
-      mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE
-                                   ? EIGHTTAP_REGULAR
-                                   : cm->interp_filter;
-      mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE
-                                   ? EIGHTTAP_REGULAR
-                                   : cm->interp_filter;
-#else
-      mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR
-                                                            : cm->interp_filter;
-#endif  // CONFIG_DUAL_FILTER
+      mbmi->interp_filters = av1_broadcast_interp_filter(
+          av1_unswitchable_filter(cm->interp_filter));
     }
   }
 #endif  // CONFIG_GLOBAL_MOTION
@@ -11968,11 +12146,10 @@ PALETTE_EXIT:
   store_coding_context(x, ctx, best_mode_index, best_pred_diff,
                        best_mode_skippable);
 
-#if CONFIG_PALETTE
-  if (cm->allow_screen_content_tools && pmi->palette_size[1] > 0) {
+  if (pmi->palette_size[1] > 0) {
+    assert(try_palette);
     restore_uv_color_map(cpi, x);
   }
-#endif  // CONFIG_PALETTE
 }
 
 void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
@@ -12013,10 +12190,8 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
 
   assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
 
-#if CONFIG_PALETTE
   mbmi->palette_mode_info.palette_size[0] = 0;
   mbmi->palette_mode_info.palette_size[1] = 0;
-#endif  // CONFIG_PALETTE
 
 #if CONFIG_FILTER_INTRA
   mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
@@ -12030,8 +12205,12 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
 #if CONFIG_GLOBAL_MOTION
   mbmi->mv[0].as_int =
       gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]],
-                           cm->allow_high_precision_mv, bsize, mi_col, mi_row,
-                           0)
+                           cm->allow_high_precision_mv, bsize, mi_col, mi_row, 0
+#if CONFIG_AMVR
+                           ,
+                           cm->cur_frame_mv_precision_level
+#endif
+                           )
           .as_int;
 #else   // CONFIG_GLOBAL_MOTION
   mbmi->mv[0].as_int = 0;
@@ -12041,6 +12220,9 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
 
   mbmi->ref_mv_idx = 0;
   mbmi->pred_mv[0].as_int = 0;
+#if CONFIG_LGT_FROM_PRED
+  mbmi->use_lgt = 0;
+#endif
 
   mbmi->motion_mode = SIMPLE_TRANSLATION;
 #if CONFIG_MOTION_VAR
@@ -12074,31 +12256,18 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
       int rs;
       int best_rs = INT_MAX;
       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
-#if CONFIG_DUAL_FILTER
-        int k;
-        for (k = 0; k < 4; ++k) mbmi->interp_filter[k] = i;
-#else
-        mbmi->interp_filter = i;
-#endif  // CONFIG_DUAL_FILTER
-        rs = av1_get_switchable_rate(cpi, xd);
+        mbmi->interp_filters = av1_broadcast_interp_filter(i);
+        rs = av1_get_switchable_rate(cm, x, xd);
         if (rs < best_rs) {
           best_rs = rs;
-#if CONFIG_DUAL_FILTER
-          best_filter = mbmi->interp_filter[0];
-#else
-          best_filter = mbmi->interp_filter;
-#endif  // CONFIG_DUAL_FILTER
+          best_filter = av1_extract_interp_filter(mbmi->interp_filters, 0);
         }
       }
     }
   }
-// Set the appropriate filter
-#if CONFIG_DUAL_FILTER
-  for (i = 0; i < 4; ++i) mbmi->interp_filter[i] = best_filter;
-#else
-  mbmi->interp_filter = best_filter;
-#endif  // CONFIG_DUAL_FILTER
-  rate2 += av1_get_switchable_rate(cpi, xd);
+  // Set the appropriate filter
+  mbmi->interp_filters = av1_broadcast_interp_filter(best_filter);
+  rate2 += av1_get_switchable_rate(cm, x, xd);
 
   if (cm->reference_mode == REFERENCE_MODE_SELECT)
     rate2 += av1_cost_bit(comp_mode_p, comp_pred);
@@ -12111,22 +12280,16 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
   rd_cost->rate = rate2;
   rd_cost->dist = distortion2;
   rd_cost->rdcost = this_rd;
-#if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  if (bsize < BLOCK_8X8) rd_cost->dist_y = distortion2;
-#endif
+
   if (this_rd >= best_rd_so_far) {
     rd_cost->rate = INT_MAX;
     rd_cost->rdcost = INT64_MAX;
     return;
   }
 
-#if CONFIG_DUAL_FILTER
   assert((cm->interp_filter == SWITCHABLE) ||
-         (cm->interp_filter == mbmi->interp_filter[0]));
-#else
-  assert((cm->interp_filter == SWITCHABLE) ||
-         (cm->interp_filter == mbmi->interp_filter));
-#endif  // CONFIG_DUAL_FILTER
+         (cm->interp_filter ==
+          av1_extract_interp_filter(mbmi->interp_filters, 0)));
 
   av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
                             cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
@@ -12137,6 +12300,124 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
 }
 
 #if CONFIG_MOTION_VAR
+
+struct calc_target_weighted_pred_ctxt {
+  const MACROBLOCK *x;
+  const uint8_t *tmp;
+  int tmp_stride;
+  int overlap;
+};
+
+static INLINE void calc_target_weighted_pred_above(MACROBLOCKD *xd,
+                                                   int rel_mi_col,
+                                                   uint8_t nb_mi_width,
+                                                   MODE_INFO *nb_mi,
+                                                   void *fun_ctxt) {
+  (void)nb_mi;
+
+  struct calc_target_weighted_pred_ctxt *ctxt =
+      (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+
+#if CONFIG_HIGHBITDEPTH
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#else
+  const int is_hbd = 0;
+#endif  // CONFIG_HIGHBITDEPTH
+
+  const int bw = xd->n8_w << MI_SIZE_LOG2;
+  const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
+
+  int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE);
+  int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE);
+  const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE;
+
+  if (!is_hbd) {
+    for (int row = 0; row < ctxt->overlap; ++row) {
+      const uint8_t m0 = mask1d[row];
+      const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+      for (int col = 0; col < nb_mi_width * MI_SIZE; ++col) {
+        wsrc[col] = m1 * tmp[col];
+        mask[col] = m0;
+      }
+      wsrc += bw;
+      mask += bw;
+      tmp += ctxt->tmp_stride;
+    }
+#if CONFIG_HIGHBITDEPTH
+  } else {
+    const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
+
+    for (int row = 0; row < ctxt->overlap; ++row) {
+      const uint8_t m0 = mask1d[row];
+      const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+      for (int col = 0; col < nb_mi_width * MI_SIZE; ++col) {
+        wsrc[col] = m1 * tmp16[col];
+        mask[col] = m0;
+      }
+      wsrc += bw;
+      mask += bw;
+      tmp16 += ctxt->tmp_stride;
+    }
+#endif  // CONFIG_HIGHBITDEPTH
+  }
+}
+
+static INLINE void calc_target_weighted_pred_left(MACROBLOCKD *xd,
+                                                  int rel_mi_row,
+                                                  uint8_t nb_mi_height,
+                                                  MODE_INFO *nb_mi,
+                                                  void *fun_ctxt) {
+  (void)nb_mi;
+
+  struct calc_target_weighted_pred_ctxt *ctxt =
+      (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+
+#if CONFIG_HIGHBITDEPTH
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#else
+  const int is_hbd = 0;
+#endif  // CONFIG_HIGHBITDEPTH
+
+  const int bw = xd->n8_w << MI_SIZE_LOG2;
+  const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
+
+  int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
+  int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw);
+  const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
+
+  if (!is_hbd) {
+    for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) {
+      for (int col = 0; col < ctxt->overlap; ++col) {
+        const uint8_t m0 = mask1d[col];
+        const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+        wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+                    (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+        mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+      }
+      wsrc += bw;
+      mask += bw;
+      tmp += ctxt->tmp_stride;
+    }
+#if CONFIG_HIGHBITDEPTH
+  } else {
+    const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
+
+    for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) {
+      for (int col = 0; col < ctxt->overlap; ++col) {
+        const uint8_t m0 = mask1d[col];
+        const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+        wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+                    (tmp16[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+        mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+      }
+      wsrc += bw;
+      mask += bw;
+      tmp16 += ctxt->tmp_stride;
+    }
+#endif  // CONFIG_HIGHBITDEPTH
+  }
+}
+
 // This function has a structure similar to av1_build_obmc_inter_prediction
 //
 // The OBMC predictor is computed as:
@@ -12181,13 +12462,11 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
                                       int above_stride, const uint8_t *left,
                                       int left_stride) {
   const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  int row, col, i;
   const int bw = xd->n8_w << MI_SIZE_LOG2;
   const int bh = xd->n8_h << MI_SIZE_LOG2;
   int32_t *mask_buf = x->mask_buf;
   int32_t *wsrc_buf = x->wsrc_buf;
-  const int wsrc_stride = bw;
-  const int mask_stride = bw;
+
   const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
 #if CONFIG_HIGHBITDEPTH
   const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
@@ -12200,86 +12479,20 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
   assert(xd->plane[0].subsampling_y == 0);
 
   av1_zero_array(wsrc_buf, bw * bh);
-  for (i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA;
+  for (int i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA;
 
   // handle above row
   if (xd->up_available) {
     const int overlap =
-        AOMMIN(block_size_high[bsize] >> 1, block_size_high[BLOCK_64X64] >> 1);
-    const int miw = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
-    const int mi_row_offset = -1;
-    const uint8_t *const mask1d = av1_get_obmc_mask(overlap);
-    const int neighbor_limit = max_neighbor_obmc[b_width_log2_lookup[bsize]];
-    int neighbor_count = 0;
-
-    assert(miw > 0);
-
-    i = 0;
-    do {  // for each mi in the above row
-      const int mi_col_offset = i;
-      const MB_MODE_INFO *above_mbmi =
-          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
-#if CONFIG_CHROMA_SUB8X8
-      if (above_mbmi->sb_type < BLOCK_8X8)
-        above_mbmi =
-            &xd->mi[mi_col_offset + 1 + mi_row_offset * xd->mi_stride]->mbmi;
-#endif
-      const BLOCK_SIZE a_bsize = AOMMAX(above_mbmi->sb_type, BLOCK_8X8);
-      const int above_step =
-          AOMMIN(mi_size_wide[a_bsize], mi_size_wide[BLOCK_64X64]);
-      const int mi_step = AOMMIN(xd->n8_w, above_step);
-      const int neighbor_bw = mi_step * MI_SIZE;
-
-      if (is_neighbor_overlappable(above_mbmi)) {
-        if (!CONFIG_CB4X4 && (a_bsize == BLOCK_4X4 || a_bsize == BLOCK_4X8))
-          neighbor_count += 2;
-        else
-          neighbor_count++;
-        if (neighbor_count > neighbor_limit) break;
-
-        const int tmp_stride = above_stride;
-        int32_t *wsrc = wsrc_buf + (i * MI_SIZE);
-        int32_t *mask = mask_buf + (i * MI_SIZE);
-
-        if (!is_hbd) {
-          const uint8_t *tmp = above;
-
-          for (row = 0; row < overlap; ++row) {
-            const uint8_t m0 = mask1d[row];
-            const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
-            for (col = 0; col < neighbor_bw; ++col) {
-              wsrc[col] = m1 * tmp[col];
-              mask[col] = m0;
-            }
-            wsrc += wsrc_stride;
-            mask += mask_stride;
-            tmp += tmp_stride;
-          }
-#if CONFIG_HIGHBITDEPTH
-        } else {
-          const uint16_t *tmp = CONVERT_TO_SHORTPTR(above);
-
-          for (row = 0; row < overlap; ++row) {
-            const uint8_t m0 = mask1d[row];
-            const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
-            for (col = 0; col < neighbor_bw; ++col) {
-              wsrc[col] = m1 * tmp[col];
-              mask[col] = m0;
-            }
-            wsrc += wsrc_stride;
-            mask += mask_stride;
-            tmp += tmp_stride;
-          }
-#endif  // CONFIG_HIGHBITDEPTH
-        }
-      }
-
-      above += neighbor_bw;
-      i += mi_step;
-    } while (i < miw);
+        AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
+    struct calc_target_weighted_pred_ctxt ctxt = { x, above, above_stride,
+                                                   overlap };
+    foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd, mi_col,
+                                  max_neighbor_obmc[b_width_log2_lookup[bsize]],
+                                  calc_target_weighted_pred_above, &ctxt);
   }
 
-  for (i = 0; i < bw * bh; ++i) {
+  for (int i = 0; i < bw * bh; ++i) {
     wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
     mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
   }
@@ -12287,102 +12500,33 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
   // handle left column
   if (xd->left_available) {
     const int overlap =
-        AOMMIN(block_size_wide[bsize] >> 1, block_size_wide[BLOCK_64X64] >> 1);
-    const int mih = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
-    const int mi_col_offset = -1;
-    const uint8_t *const mask1d = av1_get_obmc_mask(overlap);
-    const int neighbor_limit = max_neighbor_obmc[b_height_log2_lookup[bsize]];
-    int neighbor_count = 0;
-
-    assert(mih > 0);
-
-    i = 0;
-    do {  // for each mi in the left column
-      const int mi_row_offset = i;
-      MB_MODE_INFO *left_mbmi =
-          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
-
-#if CONFIG_CHROMA_SUB8X8
-      if (left_mbmi->sb_type < BLOCK_8X8)
-        left_mbmi =
-            &xd->mi[mi_col_offset + (mi_row_offset + 1) * xd->mi_stride]->mbmi;
-#endif
-      const BLOCK_SIZE l_bsize = AOMMAX(left_mbmi->sb_type, BLOCK_8X8);
-      const int left_step =
-          AOMMIN(mi_size_high[l_bsize], mi_size_high[BLOCK_64X64]);
-      const int mi_step = AOMMIN(xd->n8_h, left_step);
-      const int neighbor_bh = mi_step * MI_SIZE;
-
-      if (is_neighbor_overlappable(left_mbmi)) {
-        if (!CONFIG_CB4X4 && (l_bsize == BLOCK_4X4 || l_bsize == BLOCK_8X4))
-          neighbor_count += 2;
-        else
-          neighbor_count++;
-        if (neighbor_count > neighbor_limit) break;
-
-        const int tmp_stride = left_stride;
-        int32_t *wsrc = wsrc_buf + (i * MI_SIZE * wsrc_stride);
-        int32_t *mask = mask_buf + (i * MI_SIZE * mask_stride);
-
-        if (!is_hbd) {
-          const uint8_t *tmp = left;
-
-          for (row = 0; row < neighbor_bh; ++row) {
-            for (col = 0; col < overlap; ++col) {
-              const uint8_t m0 = mask1d[col];
-              const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
-              wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
-                          (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
-              mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
-            }
-            wsrc += wsrc_stride;
-            mask += mask_stride;
-            tmp += tmp_stride;
-          }
-#if CONFIG_HIGHBITDEPTH
-        } else {
-          const uint16_t *tmp = CONVERT_TO_SHORTPTR(left);
-
-          for (row = 0; row < neighbor_bh; ++row) {
-            for (col = 0; col < overlap; ++col) {
-              const uint8_t m0 = mask1d[col];
-              const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
-              wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
-                          (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
-              mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
-            }
-            wsrc += wsrc_stride;
-            mask += mask_stride;
-            tmp += tmp_stride;
-          }
-#endif  // CONFIG_HIGHBITDEPTH
-        }
-      }
-
-      left += neighbor_bh * left_stride;
-      i += mi_step;
-    } while (i < mih);
+        AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
+    struct calc_target_weighted_pred_ctxt ctxt = { x, left, left_stride,
+                                                   overlap };
+    foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd, mi_row,
+                                 max_neighbor_obmc[b_height_log2_lookup[bsize]],
+                                 calc_target_weighted_pred_left, &ctxt);
   }
 
   if (!is_hbd) {
     const uint8_t *src = x->plane[0].src.buf;
 
-    for (row = 0; row < bh; ++row) {
-      for (col = 0; col < bw; ++col) {
+    for (int row = 0; row < bh; ++row) {
+      for (int col = 0; col < bw; ++col) {
         wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
       }
-      wsrc_buf += wsrc_stride;
+      wsrc_buf += bw;
       src += x->plane[0].src.stride;
     }
 #if CONFIG_HIGHBITDEPTH
   } else {
     const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
 
-    for (row = 0; row < bh; ++row) {
-      for (col = 0; col < bw; ++col) {
+    for (int row = 0; row < bh; ++row) {
+      for (int col = 0; col < bw; ++col) {
         wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
       }
-      wsrc_buf += wsrc_stride;
+      wsrc_buf += bw;
       src += x->plane[0].src.stride;
     }
 #endif  // CONFIG_HIGHBITDEPTH
@@ -12508,8 +12652,9 @@ void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x,
   }
 
   if (rd_causal >
-      RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate +
-                            av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1),
+      RDCOST(x->rdmult,
+             rd_stats_y.rate + rd_stats_uv.rate +
+                 av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1),
              (rd_stats_y.dist + rd_stats_uv.dist))) {
     x->skip = skip_blk;
   } else {
@@ -12518,4 +12663,328 @@ void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x,
   }
 }
 #endif  // CONFIG_NCOBMC
+
+int64_t get_prediction_rd_cost(const struct AV1_COMP *cpi, struct macroblock *x,
+                               int mi_row, int mi_col, int *skip_blk,
+                               MB_MODE_INFO *backup_mbmi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
+  const MOTION_MODE motion_allowed = motion_mode_allowed(
+#if CONFIG_GLOBAL_MOTION
+      0, xd->global_motion,
+#endif  // CONFIG_GLOBAL_MOTION
+#if CONFIG_WARPED_MOTION
+      xd,
+#endif
+      xd->mi[0]);
+#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
+  RD_STATS rd_stats_y, rd_stats_uv;
+  int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+  int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+  int64_t this_rd;
+  int ref;
+
+#if CONFIG_CB4X4
+  x->skip_chroma_rd =
+      !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+                           xd->plane[1].subsampling_y);
+#endif
+
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+    YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
+    assert(cfg != NULL);
+    av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+                         &xd->block_refs[ref]->sf);
+  }
+  av1_setup_dst_planes(x->e_mbd.plane, bsize,
+                       get_frame_new_buffer(&cpi->common), mi_row, mi_col);
+
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+  if (mbmi->motion_mode != NCOBMC_ADAPT_WEIGHT)
+#endif
+    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+
+#if CONFIG_MOTION_VAR
+  if (mbmi->motion_mode == OBMC_CAUSAL) {
+#if CONFIG_NCOBMC
+    av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+#else
+    av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+#endif
+  }
+#endif  // CONFIG_MOTION_VAR
+
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+  if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT)
+    for (int plane = 0; plane < MAX_MB_PLANE; ++plane)
+      get_pred_from_intrpl_buf(xd, mi_row, mi_col, bsize, plane);
+#endif
+  av1_subtract_plane(x, bsize, 0);
+
+#if CONFIG_VAR_TX
+  if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+    select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+  } else {
+    int idx, idy;
+    super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+    for (idy = 0; idy < xd->n8_h; ++idy)
+      for (idx = 0; idx < xd->n8_w; ++idx)
+        mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
+    memset(x->blk_skip[0], rd_stats_y.skip,
+           sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+  }
+  inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+#else
+  super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+  super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+#endif
+  assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX);
+
+  if (rd_stats_y.skip && rd_stats_uv.skip) {
+    rd_stats_y.rate = rate_skip1;
+    rd_stats_uv.rate = 0;
+    rd_stats_y.dist = rd_stats_y.sse;
+    rd_stats_uv.dist = rd_stats_uv.sse;
+    *skip_blk = 1;
+  } else if (RDCOST(x->rdmult,
+                    (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0),
+                    (rd_stats_y.dist + rd_stats_uv.dist)) >
+             RDCOST(x->rdmult, rate_skip1,
+                    (rd_stats_y.sse + rd_stats_uv.sse))) {
+    rd_stats_y.rate = rate_skip1;
+    rd_stats_uv.rate = 0;
+    rd_stats_y.dist = rd_stats_y.sse;
+    rd_stats_uv.dist = rd_stats_uv.sse;
+    *skip_blk = 1;
+  } else {
+    rd_stats_y.rate += rate_skip0;
+    *skip_blk = 0;
+  }
+
+  if (backup_mbmi) *backup_mbmi = *mbmi;
+
+  this_rd = RDCOST(x->rdmult, (rd_stats_y.rate + rd_stats_uv.rate),
+                   (rd_stats_y.dist + rd_stats_uv.dist));
+#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
+  if (motion_allowed == NCOBMC_ADAPT_WEIGHT) {
+    assert(mbmi->motion_mode <= NCOBMC_ADAPT_WEIGHT);
+    this_rd +=
+        RDCOST(x->rdmult, x->motion_mode_cost2[bsize][mbmi->motion_mode], 0);
+  } else if (motion_allowed == OBMC_CAUSAL) {
+    assert(mbmi->motion_mode <= OBMC_CAUSAL);
+    this_rd +=
+        RDCOST(x->rdmult, x->motion_mode_cost1[bsize][mbmi->motion_mode], 0);
+  } else {
+#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
+    this_rd +=
+        RDCOST(x->rdmult, x->motion_mode_cost[bsize][mbmi->motion_mode], 0);
+#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
+  }
+#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION
+  return this_rd;
+}
+
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+void av1_check_ncobmc_adapt_weight_rd(const struct AV1_COMP *cpi,
+                                      struct macroblock *x, int mi_row,
+                                      int mi_col) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_VAR_TX
+  const int n4 = bsize_to_num_blk(bsize);
+  uint8_t st_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
+  uint8_t obmc_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
+  uint8_t ncobmc_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
+#endif
+  MB_MODE_INFO st_mbmi, obmc_mbmi, ncobmc_mbmi;
+  int st_skip, obmc_skip, ncobmc_skip;
+  int64_t st_rd, obmc_rd, ncobmc_rd;
+#if CONFIG_WARPED_MOTION
+  const AV1_COMMON *const cm = &cpi->common;
+  const int is_warp_motion = mbmi->motion_mode == WARPED_CAUSAL;
+  const int rs = RDCOST(x->rdmult, av1_get_switchable_rate(cm, x, xd), 0);
+  MB_MODE_INFO warp_mbmi;
+  int64_t warp_rd;
+  int warp_skip;
+#endif
+
+  // Recompute the rd for the motion mode decided in rd loop
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  st_rd = get_prediction_rd_cost(cpi, x, mi_row, mi_col, &st_skip, &st_mbmi);
+#if CONFIG_WARPED_MOTION
+  st_rd += rs;
+#endif
+#if CONFIG_VAR_TX
+  memcpy(st_blk_skip, x->blk_skip[0], sizeof(st_blk_skip[0]) * n4);
+#endif
+
+  mbmi->motion_mode = OBMC_CAUSAL;
+  obmc_rd =
+      get_prediction_rd_cost(cpi, x, mi_row, mi_col, &obmc_skip, &obmc_mbmi);
+#if CONFIG_WARPED_MOTION
+  obmc_rd += rs;
+#endif
+#if CONFIG_VAR_TX
+  memcpy(obmc_blk_skip, x->blk_skip[0], sizeof(obmc_blk_skip[0]) * n4);
+#endif
+
+  // Compute the rd cost for ncobmc adaptive weight
+  mbmi->motion_mode = NCOBMC_ADAPT_WEIGHT;
+  ncobmc_rd = get_prediction_rd_cost(cpi, x, mi_row, mi_col, &ncobmc_skip,
+                                     &ncobmc_mbmi);
+#if CONFIG_WARPED_MOTION
+  ncobmc_rd += rs;
+#endif
+  // Calculate the ncobmc mode costs
+  {
+    ADAPT_OVERLAP_BLOCK aob = adapt_overlap_block_lookup[bsize];
+    ncobmc_rd +=
+        RDCOST(x->rdmult, x->ncobmc_mode_cost[aob][mbmi->ncobmc_mode[0]], 0);
+    if (mi_size_wide[bsize] != mi_size_high[bsize])
+      ncobmc_rd +=
+          RDCOST(x->rdmult, x->ncobmc_mode_cost[aob][mbmi->ncobmc_mode[1]], 0);
+  }
+#if CONFIG_VAR_TX
+  memcpy(ncobmc_blk_skip, x->blk_skip[0], sizeof(ncobmc_blk_skip[0]) * n4);
+#endif
+
+#if CONFIG_WARPED_MOTION
+  if (is_warp_motion) {
+    mbmi->motion_mode = WARPED_CAUSAL;
+    warp_rd =
+        get_prediction_rd_cost(cpi, x, mi_row, mi_col, &warp_skip, &warp_mbmi);
+  } else {
+    warp_rd = INT64_MAX;
+  }
+#endif
+
+#if CONFIG_WARPED_MOTION
+  if (AOMMIN(ncobmc_rd, warp_rd) < AOMMIN(st_rd, obmc_rd)) {
+    if (ncobmc_rd < warp_rd) {
+      x->skip = ncobmc_skip;
+      *mbmi = ncobmc_mbmi;
+#if CONFIG_VAR_TX
+      memcpy(x->blk_skip[0], ncobmc_blk_skip, sizeof(ncobmc_blk_skip[0]) * n4);
+#endif
+    } else {
+      x->skip = warp_skip;
+      *mbmi = warp_mbmi;
+    }
+#else
+  if (ncobmc_rd < AOMMIN(st_rd, obmc_rd)) {
+    x->skip = ncobmc_skip;
+    *mbmi = ncobmc_mbmi;
+#if CONFIG_VAR_TX
+    memcpy(x->blk_skip[0], ncobmc_blk_skip, sizeof(ncobmc_blk_skip[0]) * n4);
+#endif
+#endif  // CONFIG_WARPED_MOTION
+  } else {
+    if (obmc_rd < st_rd) {
+      *mbmi = obmc_mbmi;
+      x->skip = obmc_skip;
+#if CONFIG_VAR_TX
+      memcpy(x->blk_skip[0], obmc_blk_skip, sizeof(obmc_blk_skip[0]) * n4);
+#endif
+    } else {
+      *mbmi = st_mbmi;
+      x->skip = st_skip;
+#if CONFIG_VAR_TX
+      memcpy(x->blk_skip[0], st_blk_skip, sizeof(st_blk_skip[0]) * n4);
+#endif
+    }
+  }
+}
+
+int64_t get_ncobmc_error(MACROBLOCKD *xd, int pxl_row, int pxl_col,
+                         BLOCK_SIZE bsize, int plane, struct buf_2d *src) {
+  const int wide = AOMMIN(mi_size_wide[bsize] * MI_SIZE,
+                          (xd->sb_mi_bd.mi_col_end + 1) * MI_SIZE - pxl_col);
+  const int high = AOMMIN(mi_size_high[bsize] * MI_SIZE,
+                          (xd->sb_mi_bd.mi_row_end + 1) * MI_SIZE - pxl_row);
+  const int ss_x = xd->plane[plane].subsampling_x;
+  const int ss_y = xd->plane[plane].subsampling_y;
+  int row_offset = (pxl_row - xd->sb_mi_bd.mi_row_begin * MI_SIZE) >> ss_y;
+  int col_offset = (pxl_col - xd->sb_mi_bd.mi_col_begin * MI_SIZE) >> ss_x;
+  int dst_stride = xd->ncobmc_pred_buf_stride[plane];
+  int dst_offset = row_offset * dst_stride + col_offset;
+  int src_stride = src->stride;
+
+  int r, c;
+  int64_t tmp, error = 0;
+
+  for (r = 0; r < (high >> ss_y); ++r) {
+    for (c = 0; c < (wide >> ss_x); ++c) {
+      tmp = xd->ncobmc_pred_buf[plane][r * dst_stride + c + dst_offset] -
+            src->buf[r * src_stride + c];
+      error += tmp * tmp;
+    }
+  }
+  return error;
+}
+
+int get_ncobmc_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                    MACROBLOCKD *xd, int mi_row, int mi_col, int bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  uint8_t *pred_buf[4][MAX_MB_PLANE];
+
+  // TODO(weitinglin): stride size needs to be fixed for high-bit depth
+  int pred_stride[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+  // target block in pxl
+  int pxl_row = mi_row << MI_SIZE_LOG2;
+  int pxl_col = mi_col << MI_SIZE_LOG2;
+  int64_t error, best_error = INT64_MAX;
+  int plane, tmp_mode, best_mode = 0;
+#if CONFIG_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    int len = sizeof(uint16_t);
+    ASSIGN_ALIGNED_PTRS_HBD(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE,
+                            len);
+    ASSIGN_ALIGNED_PTRS_HBD(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE,
+                            len);
+    ASSIGN_ALIGNED_PTRS_HBD(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE,
+                            len);
+    ASSIGN_ALIGNED_PTRS_HBD(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE,
+                            len);
+  } else {
+#endif  // CONFIG_HIGHBITDEPTH
+    ASSIGN_ALIGNED_PTRS(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE);
+    ASSIGN_ALIGNED_PTRS(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE);
+    ASSIGN_ALIGNED_PTRS(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE);
+    ASSIGN_ALIGNED_PTRS(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE);
+#if CONFIG_HIGHBITDEPTH
+  }
+#endif
+
+  av1_get_ext_blk_preds(cm, xd, bsize, mi_row, mi_col, pred_buf, pred_stride);
+  av1_get_ori_blk_pred(cm, xd, bsize, mi_row, mi_col, pred_buf[3], pred_stride);
+
+  for (tmp_mode = 0; tmp_mode < MAX_NCOBMC_MODES; ++tmp_mode) {
+    error = 0;
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      build_ncobmc_intrpl_pred(cm, xd, plane, pxl_row, pxl_col, bsize, pred_buf,
+                               pred_stride, tmp_mode);
+      error += get_ncobmc_error(xd, pxl_row, pxl_col, bsize, plane,
+                                &x->plane[plane].src);
+    }
+    if (error < best_error) {
+      best_mode = tmp_mode;
+      best_error = error;
+    }
+  }
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    build_ncobmc_intrpl_pred(cm, xd, plane, pxl_row, pxl_col, bsize, pred_buf,
+                             pred_stride, best_mode);
+  }
+
+  return best_mode;
+}
+
+#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
 #endif  // CONFIG_MOTION_VAR
diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h
index 43a6a3794..dbc7527fb 100644
--- a/third_party/aom/av1/encoder/rdopt.h
+++ b/third_party/aom/av1/encoder/rdopt.h
@@ -57,7 +57,6 @@ typedef enum OUTPUT_STATUS {
   OUTPUT_HAS_DECODED_PIXELS
 } OUTPUT_STATUS;
 
-#if CONFIG_PALETTE || CONFIG_INTRABC
 // Returns the number of colors in 'src'.
 int av1_count_colors(const uint8_t *src, int stride, int rows, int cols);
 #if CONFIG_HIGHBITDEPTH
@@ -65,7 +64,6 @@ int av1_count_colors(const uint8_t *src, int stride, int rows, int cols);
 int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
                             int bit_depth);
 #endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_PALETTE || CONFIG_INTRABC
 
 void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
                     BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col,
@@ -73,7 +71,7 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
                     OUTPUT_STATUS output_status);
 
 #if CONFIG_DIST_8X8
-int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCKD *xd,
+int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
                      const uint8_t *src, int src_stride, const uint8_t *dst,
                      int dst_stride, const BLOCK_SIZE tx_bsize, int bsw,
                      int bsh, int visible_w, int visible_h, int qindex);
@@ -142,8 +140,21 @@ void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate,
 }  // extern "C"
 #endif
 
-int av1_tx_type_cost(const AV1_COMP *cpi, const MACROBLOCKD *xd,
-                     BLOCK_SIZE bsize, int plane, TX_SIZE tx_size,
-                     TX_TYPE tx_type);
+int av1_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
+                     const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
+                     TX_SIZE tx_size, TX_TYPE tx_type);
+
+int64_t get_prediction_rd_cost(const struct AV1_COMP *cpi, struct macroblock *x,
+                               int mi_row, int mi_col, int *skip_blk,
+                               MB_MODE_INFO *backup_mbmi);
+
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+void av1_check_ncobmc_adapt_weight_rd(const struct AV1_COMP *cpi,
+                                      struct macroblock *x, int mi_row,
+                                      int mi_col);
+int get_ncobmc_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                    MACROBLOCKD *xd, int mi_row, int mi_col, int bsize);
+
+#endif
 
 #endif  // AV1_ENCODER_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/segmentation.c b/third_party/aom/av1/encoder/segmentation.c
index b61df43fa..4f01fbba4 100644
--- a/third_party/aom/av1/encoder/segmentation.c
+++ b/third_party/aom/av1/encoder/segmentation.c
@@ -32,7 +32,7 @@ void av1_disable_segmentation(struct segmentation *seg) {
   seg->update_data = 0;
 }
 
-void av1_set_segment_data(struct segmentation *seg, signed char *feature_data,
+void av1_set_segment_data(struct segmentation *seg, int8_t *feature_data,
                           unsigned char abs_delta) {
   seg->abs_delta = abs_delta;
 
@@ -167,76 +167,78 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
   const int bs = mi_size_wide[bsize], hbs = bs / 2;
 #if CONFIG_EXT_PARTITION_TYPES
   PARTITION_TYPE partition;
+#if CONFIG_EXT_PARTITION_TYPES_AB
+  const int qbs = bs / 4;
+#endif  // CONFIG_EXT_PARTITION_TYPES_AB
 #else
   int bw, bh;
 #endif  // CONFIG_EXT_PARTITION_TYPES
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
+#define CSEGS(cs_bw, cs_bh, cs_rowoff, cs_coloff)                              \
+  count_segs(cm, xd, tile, mi + mis * (cs_rowoff) + (cs_coloff),               \
+             no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, \
+             (cs_bw), (cs_bh), mi_row + (cs_rowoff), mi_col + (cs_coloff))
+
 #if CONFIG_EXT_PARTITION_TYPES
   if (bsize == BLOCK_8X8)
     partition = PARTITION_NONE;
   else
     partition = get_partition(cm, mi_row, mi_col, bsize);
   switch (partition) {
-    case PARTITION_NONE:
-      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
-                 t_unpred_seg_counts, bs, bs, mi_row, mi_col);
-      break;
+    case PARTITION_NONE: CSEGS(bs, bs, 0, 0); break;
     case PARTITION_HORZ:
-      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
-                 t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
-      count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
-                 temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
-                 mi_row + hbs, mi_col);
+      CSEGS(bs, hbs, 0, 0);
+      CSEGS(bs, hbs, hbs, 0);
       break;
     case PARTITION_VERT:
-      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
-                 t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
-      count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
-                 temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row,
-                 mi_col + hbs);
+      CSEGS(hbs, bs, 0, 0);
+      CSEGS(hbs, bs, 0, hbs);
       break;
+#if CONFIG_EXT_PARTITION_TYPES_AB
     case PARTITION_HORZ_A:
-      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
-                 t_unpred_seg_counts, hbs, hbs, mi_row, mi_col);
-      count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
-                 temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
-                 mi_row, mi_col + hbs);
-      count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
-                 temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
-                 mi_row + hbs, mi_col);
+      CSEGS(bs, qbs, 0, 0);
+      CSEGS(bs, qbs, qbs, 0);
+      CSEGS(bs, hbs, hbs, 0);
       break;
     case PARTITION_HORZ_B:
-      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
-                 t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
-      count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
-                 temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
-                 mi_row + hbs, mi_col);
-      count_segs(cm, xd, tile, mi + hbs + hbs * mis, no_pred_segcounts,
-                 temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
-                 mi_row + hbs, mi_col + hbs);
+      CSEGS(bs, hbs, 0, 0);
+      CSEGS(bs, qbs, hbs, 0);
+      if (mi_row + 3 * qbs < cm->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0);
       break;
     case PARTITION_VERT_A:
-      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
-                 t_unpred_seg_counts, hbs, hbs, mi_row, mi_col);
-      count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
-                 temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
-                 mi_row + hbs, mi_col);
-      count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
-                 temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row,
-                 mi_col + hbs);
+      CSEGS(qbs, bs, 0, 0);
+      CSEGS(qbs, bs, 0, qbs);
+      CSEGS(hbs, bs, 0, hbs);
       break;
     case PARTITION_VERT_B:
-      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
-                 t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
-      count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
-                 temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
-                 mi_row, mi_col + hbs);
-      count_segs(cm, xd, tile, mi + hbs + hbs * mis, no_pred_segcounts,
-                 temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
-                 mi_row + hbs, mi_col + hbs);
+      CSEGS(hbs, bs, 0, 0);
+      CSEGS(qbs, bs, 0, hbs);
+      if (mi_col + 3 * qbs < cm->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs);
+      break;
+#else
+    case PARTITION_HORZ_A:
+      CSEGS(hbs, hbs, 0, 0);
+      CSEGS(hbs, hbs, 0, hbs);
+      CSEGS(bs, hbs, hbs, 0);
+      break;
+    case PARTITION_HORZ_B:
+      CSEGS(bs, hbs, 0, 0);
+      CSEGS(hbs, hbs, hbs, 0);
+      CSEGS(hbs, hbs, hbs, hbs);
       break;
+    case PARTITION_VERT_A:
+      CSEGS(hbs, hbs, 0, 0);
+      CSEGS(hbs, hbs, hbs, 0);
+      CSEGS(hbs, bs, 0, hbs);
+      break;
+    case PARTITION_VERT_B:
+      CSEGS(hbs, bs, 0, 0);
+      CSEGS(hbs, hbs, 0, hbs);
+      CSEGS(hbs, hbs, hbs, hbs);
+      break;
+#endif
     case PARTITION_SPLIT: {
       const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
       int n;
@@ -260,20 +262,13 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
   bh = mi_size_high[mi[0]->mbmi.sb_type];
 
   if (bw == bs && bh == bs) {
-    count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
-               t_unpred_seg_counts, bs, bs, mi_row, mi_col);
+    CSEGS(bs, bs, 0, 0);
   } else if (bw == bs && bh < bs) {
-    count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
-               t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
-    count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
-               temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
-               mi_row + hbs, mi_col);
+    CSEGS(bs, hbs, 0, 0);
+    CSEGS(bs, hbs, hbs, 0);
   } else if (bw < bs && bh == bs) {
-    count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
-               t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
-    count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
-               temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row,
-               mi_col + hbs);
+    CSEGS(hbs, bs, 0, 0);
+    CSEGS(hbs, bs, 0, hbs);
   } else {
     const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
     int n;
@@ -290,6 +285,8 @@ static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
     }
   }
 #endif  // CONFIG_EXT_PARTITION_TYPES
+
+#undef CSEGS
 }
 
 void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) {
diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h
index c1491ca2a..1d24ed1d1 100644
--- a/third_party/aom/av1/encoder/segmentation.h
+++ b/third_party/aom/av1/encoder/segmentation.h
@@ -37,7 +37,7 @@ void av1_clear_segdata(struct segmentation *seg, int segment_id,
 //
 // abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use
 // the absolute values given).
-void av1_set_segment_data(struct segmentation *seg, signed char *feature_data,
+void av1_set_segment_data(struct segmentation *seg, int8_t *feature_data,
                           unsigned char abs_delta);
 
 void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd);
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
index eeab33a95..5608d031e 100644
--- a/third_party/aom/av1/encoder/speed_features.c
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -172,20 +172,20 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
 #if CONFIG_TX64X64
     sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
 #if CONFIG_CFL
-    sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
 #else
     sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC_H_V;
 #endif  // CONFIG_CFL
 #endif  // CONFIG_TX64X64
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
 #if CONFIG_CFL
-    sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
 #else
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
 #endif
     sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
 #if CONFIG_CFL
-    sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V;
+    sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
 #else
     sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
 #endif
@@ -196,10 +196,8 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     // Use transform domain distortion.
     // Note var-tx expt always uses pixel domain distortion.
     sf->use_transform_domain_distortion = 1;
-#if CONFIG_EXT_INTER
     sf->disable_wedge_search_var_thresh = 100;
     sf->fast_wedge_sign_estimate = 1;
-#endif  // CONFIG_EXT_INTER
   }
 
   if (speed >= 3) {
@@ -240,14 +238,14 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
 #if CONFIG_TX64X64
     sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
 #if CONFIG_CFL
-    sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC;
+    sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_CFL;
 #else
     sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC;
 #endif  // CONFIG_CFL
 #endif  // CONFIG_TX64X64
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
 #if CONFIG_CFL
-    sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC;
+    sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_CFL;
 #else
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
 #endif  // CONFIG_CFL
@@ -276,7 +274,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
     for (i = 0; i < TX_SIZES; ++i) {
       sf->intra_y_mode_mask[i] = INTRA_DC;
 #if CONFIG_CFL
-      sf->intra_uv_mode_mask[i] = UV_INTRA_DC;
+      sf->intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
 #else
       sf->intra_uv_mode_mask[i] = INTRA_DC;
 #endif  // CONFIG_CFL
@@ -404,6 +402,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   sf->alt_ref_search_fp = 0;
   sf->partition_search_type = SEARCH_PARTITION;
   sf->tx_type_search.prune_mode = NO_PRUNE;
+  sf->tx_type_search.use_skip_flag_prediction = 1;
   sf->tx_type_search.fast_intra_tx_type_search = 0;
   sf->tx_type_search.fast_inter_tx_type_search = 0;
   sf->less_rectangular_check = 0;
@@ -422,10 +421,8 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
   sf->adaptive_interp_filter_search = 0;
   sf->allow_partition_search_skip = 0;
   sf->use_upsampled_references = 1;
-#if CONFIG_EXT_INTER
   sf->disable_wedge_search_var_thresh = 0;
   sf->fast_wedge_sign_estimate = 0;
-#endif  // CONFIG_EXT_INTER
 
   for (i = 0; i < TX_SIZES; i++) {
     sf->intra_y_mode_mask[i] = INTRA_ALL;
diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h
index 2c89f4e5c..edd79cd16 100644
--- a/third_party/aom/av1/encoder/speed_features.h
+++ b/third_party/aom/av1/encoder/speed_features.h
@@ -21,31 +21,34 @@ extern "C" {
 enum {
   INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) |
               (1 << D135_PRED) | (1 << D117_PRED) | (1 << D153_PRED) |
-              (1 << D207_PRED) | (1 << D63_PRED) |
-#if CONFIG_ALT_INTRA
-              (1 << SMOOTH_PRED) |
+              (1 << D207_PRED) | (1 << D63_PRED) | (1 << SMOOTH_PRED) |
 #if CONFIG_SMOOTH_HV
               (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) |
 #endif  // CONFIG_SMOOTH_HV
-#endif  // CONFIG_ALT_INTRA
               (1 << TM_PRED),
 #if CONFIG_CFL
   UV_INTRA_ALL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) |
                  (1 << UV_D45_PRED) | (1 << UV_D135_PRED) |
                  (1 << UV_D117_PRED) | (1 << UV_D153_PRED) |
                  (1 << UV_D207_PRED) | (1 << UV_D63_PRED) |
-#if CONFIG_ALT_INTRA
                  (1 << UV_SMOOTH_PRED) |
 #if CONFIG_SMOOTH_HV
                  (1 << UV_SMOOTH_V_PRED) | (1 << UV_SMOOTH_H_PRED) |
 #endif  // CONFIG_SMOOTH_HV
-#endif  // CONFIG_ALT_INTRA
-                 (1 << UV_TM_PRED),
+                 (1 << UV_TM_PRED) | (1 << UV_CFL_PRED),
   UV_INTRA_DC = (1 << UV_DC_PRED),
+  UV_INTRA_DC_CFL = (1 << UV_DC_PRED) | (1 << UV_CFL_PRED),
   UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_TM_PRED),
+  UV_INTRA_DC_TM_CFL =
+      (1 << UV_DC_PRED) | (1 << UV_TM_PRED) | (1 << UV_CFL_PRED),
   UV_INTRA_DC_H_V = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED),
+  UV_INTRA_DC_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) |
+                        (1 << UV_H_PRED) | (1 << UV_CFL_PRED),
   UV_INTRA_DC_TM_H_V = (1 << UV_DC_PRED) | (1 << UV_TM_PRED) |
                        (1 << UV_V_PRED) | (1 << UV_H_PRED),
+  UV_INTRA_DC_TM_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_TM_PRED) |
+                           (1 << UV_V_PRED) | (1 << UV_H_PRED) |
+                           (1 << UV_CFL_PRED),
 #endif  // CONFIG_CFL
   INTRA_DC = (1 << DC_PRED),
   INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED),
@@ -54,7 +57,6 @@ enum {
       (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) | (1 << H_PRED)
 };
 
-#if CONFIG_EXT_INTER
 enum {
 #if CONFIG_COMPOUND_SINGLEREF
 // TODO(zoeliu): To further consider following single ref comp modes:
@@ -90,17 +92,6 @@ enum {
                             (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) |
                             (1 << NEAR_NEARMV),
 };
-#else   // !CONFIG_EXT_INTER
-enum {
-  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
-  INTER_NEAREST = (1 << NEARESTMV),
-  INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV),
-  INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV),
-  INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV),
-  INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
-  INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV),
-};
-#endif  // CONFIG_EXT_INTER
 
 enum {
   DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
@@ -209,6 +200,10 @@ typedef struct {
   TX_TYPE_PRUNE_MODE prune_mode;
   int fast_intra_tx_type_search;
   int fast_inter_tx_type_search;
+
+  // Use a skip flag prediction model to detect blocks with skip = 1 early
+  // and avoid doing full TX type search for such blocks.
+  int use_skip_flag_prediction;
 } TX_TYPE_SEARCH;
 
 typedef enum {
@@ -409,13 +404,11 @@ typedef struct SPEED_FEATURES {
   // Choose a very large value (UINT_MAX) to use 8-tap always
   unsigned int disable_filter_search_var_thresh;
 
-#if CONFIG_EXT_INTER
   // A source variance threshold below which wedge search is disabled
   unsigned int disable_wedge_search_var_thresh;
 
   // Whether fast wedge sign estimate is used
   int fast_wedge_sign_estimate;
-#endif  // CONFIG_EXT_INTER
 
   // These bit masks allow you to enable or disable intra modes for each
   // transform size separately.
diff --git a/third_party/aom/av1/encoder/subexp.c b/third_party/aom/av1/encoder/subexp.c
index 6a8ba12d8..dc96d712a 100644
--- a/third_party/aom/av1/encoder/subexp.c
+++ b/third_party/aom/av1/encoder/subexp.c
@@ -138,47 +138,6 @@ int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp,
   return bestsavings;
 }
 
-int av1_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const aom_prob oldp,
-                                              aom_prob *bestp, aom_prob upd,
-                                              int stepsize, int probwt) {
-  int i, old_b, new_b, update_b, savings, bestsavings;
-  int newp;
-  const int step_sign = *bestp > oldp ? -1 : 1;
-  const int step = stepsize * step_sign;
-  const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd);
-  const aom_prob *newplist, *oldplist;
-  aom_prob bestnewp;
-  oldplist = av1_pareto8_full[oldp - 1];
-  old_b = cost_branch256(ct + 2 * PIVOT_NODE, oldp);
-  for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
-    old_b += cost_branch256(ct + 2 * i, oldplist[i - UNCONSTRAINED_NODES]);
-
-  bestsavings = 0;
-  bestnewp = oldp;
-
-  assert(stepsize > 0);
-
-  if (old_b > upd_cost + (MIN_DELP_BITS << AV1_PROB_COST_SHIFT)) {
-    for (newp = *bestp; (newp - oldp) * step_sign < 0; newp += step) {
-      if (newp < 1 || newp > 255) continue;
-      newplist = av1_pareto8_full[newp - 1];
-      new_b = cost_branch256(ct + 2 * PIVOT_NODE, newp);
-      for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
-        new_b += cost_branch256(ct + 2 * i, newplist[i - UNCONSTRAINED_NODES]);
-      update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
-      savings = old_b - new_b - update_b * probwt;
-      if (savings > bestsavings) {
-        bestsavings = savings;
-        bestnewp = newp;
-      }
-    }
-  }
-
-  *bestp = bestnewp;
-  return bestsavings;
-}
-
 void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp,
                                const unsigned int ct[2], int probwt) {
   const aom_prob upd = DIFF_UPDATE_PROB;
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c
index 604647922..daa647689 100644
--- a/third_party/aom/av1/encoder/temporal_filter.c
+++ b/third_party/aom/av1/encoder/temporal_filter.c
@@ -44,18 +44,13 @@ static void temporal_filter_predictors_mb_c(
   ConvolveParams conv_params = get_conv_params(which_mv, which_mv, 0);
 
 #if USE_TEMPORALFILTER_12TAP
-#if CONFIG_DUAL_FILTER
-  const InterpFilter interp_filter[4] = { TEMPORALFILTER_12TAP,
-                                          TEMPORALFILTER_12TAP,
-                                          TEMPORALFILTER_12TAP,
-                                          TEMPORALFILTER_12TAP };
-#else
-  const InterpFilter interp_filter = TEMPORALFILTER_12TAP;
-#endif
+  const InterpFilters interp_filters =
+      av1_broadcast_interp_filter(TEMPORALFILTER_12TAP);
   (void)xd;
 #else
-  const InterpFilter interp_filter = xd->mi[0]->mbmi.interp_filter;
+  const InterpFilters interp_filters = xd->mi[0]->mbmi.interp_filters;
 #endif  // USE_TEMPORALFILTER_12TAP
+
 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
   WarpTypesAllowed warp_types;
   memset(&warp_types, 0, sizeof(WarpTypesAllowed));
@@ -72,7 +67,7 @@ static void temporal_filter_predictors_mb_c(
 #if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     av1_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale,
-                                     16, 16, which_mv, interp_filter,
+                                     16, 16, which_mv, interp_filters,
 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
                                      &warp_types, x, y,
 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
@@ -80,7 +75,7 @@ static void temporal_filter_predictors_mb_c(
 
     av1_highbd_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256],
                                      uv_block_width, &mv, scale, uv_block_width,
-                                     uv_block_height, which_mv, interp_filter,
+                                     uv_block_height, which_mv, interp_filters,
 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
                                      &warp_types, x, y,
 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
@@ -88,7 +83,7 @@ static void temporal_filter_predictors_mb_c(
 
     av1_highbd_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512],
                                      uv_block_width, &mv, scale, uv_block_width,
-                                     uv_block_height, which_mv, interp_filter,
+                                     uv_block_height, which_mv, interp_filters,
 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
                                      &warp_types, x, y,
 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
@@ -97,7 +92,7 @@ static void temporal_filter_predictors_mb_c(
   }
 #endif  // CONFIG_HIGHBITDEPTH
   av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16,
-                            &conv_params, interp_filter,
+                            &conv_params, interp_filters,
 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
                             &warp_types, x, y, 0, 0,
 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
@@ -105,7 +100,7 @@ static void temporal_filter_predictors_mb_c(
 
   av1_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width,
                             &mv, scale, uv_block_width, uv_block_height,
-                            &conv_params, interp_filter,
+                            &conv_params, interp_filters,
 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
                             &warp_types, x, y, 1, 0,
 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
@@ -113,7 +108,7 @@ static void temporal_filter_predictors_mb_c(
 
   av1_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width,
                             &mv, scale, uv_block_width, uv_block_height,
-                            &conv_params, interp_filter,
+                            &conv_params, interp_filters,
 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
                             &warp_types, x, y, 2, 0,
 #endif  // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
@@ -291,15 +286,30 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
 
   x->mv_limits = tmp_mv_limits;
 
-  // Ignore mv costing by sending NULL pointer instead of cost array
-  bestsme = cpi->find_fractional_mv_step(
-      x, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
-      &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step,
-      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL,
-#if CONFIG_EXT_INTER
-      NULL, 0, 0,
+// Ignore mv costing by sending NULL pointer instead of cost array
+#if CONFIG_AMVR
+  if (cpi->common.cur_frame_mv_precision_level == 1) {
+    const uint8_t *const src_address = x->plane[0].src.buf;
+    const int src_stride = x->plane[0].src.stride;
+    const uint8_t *const y = xd->plane[0].pre[0].buf;
+    const int y_stride = xd->plane[0].pre[0].stride;
+    const int offset = x->best_mv.as_mv.row * y_stride + x->best_mv.as_mv.col;
+
+    x->best_mv.as_mv.row *= 8;
+    x->best_mv.as_mv.col *= 8;
+
+    bestsme = cpi->fn_ptr[BLOCK_16X16].vf(y + offset, y_stride, src_address,
+                                          src_stride, &sse);
+  } else {
+#endif
+    bestsme = cpi->find_fractional_mv_step(
+        x, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
+        &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step,
+        cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL,
+        NULL, 0, 0, 0, 0, 0);
+#if CONFIG_AMVR
+  }
 #endif
-      0, 0, 0);
 
   x->e_mbd.mi[0]->bmi[0].as_mv[0] = x->best_mv;
 
@@ -311,6 +321,9 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
 }
 
 static void temporal_filter_iterate_c(AV1_COMP *cpi,
+#if CONFIG_BGSPRITE
+                                      YV12_BUFFER_CONFIG *target,
+#endif  // CONFIG_BGSPRITE
                                       YV12_BUFFER_CONFIG **frames,
                                       int frame_count, int alt_ref_index,
                                       int strength,
@@ -452,9 +465,17 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
       if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
         uint16_t *dst1_16;
         uint16_t *dst2_16;
+#if CONFIG_BGSPRITE
+        dst1 = target->y_buffer;
+#else
         dst1 = cpi->alt_ref_buffer.y_buffer;
+#endif  // CONFIG_BGSPRITE
         dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+#if CONFIG_BGSPRITE
+        stride = target->y_stride;
+#else
         stride = cpi->alt_ref_buffer.y_stride;
+#endif  // CONFIG_BGSPRITE
         byte = mb_y_offset;
         for (i = 0, k = 0; i < 16; i++) {
           for (j = 0; j < 16; j++, k++) {
@@ -494,8 +515,13 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
         }
       } else {
 #endif  // CONFIG_HIGHBITDEPTH
-        dst1 = cpi->alt_ref_buffer.y_buffer;
-        stride = cpi->alt_ref_buffer.y_stride;
+#if CONFIG_BGSPRITE
+        dst1 = target->y_buffer;
+        stride = target->y_stride;
+#else
+      dst1 = cpi->alt_ref_buffer.y_buffer;
+      stride = cpi->alt_ref_buffer.y_stride;
+#endif  // CONFIG_BGSPRITE
         byte = mb_y_offset;
         for (i = 0, k = 0; i < 16; i++) {
           for (j = 0; j < 16; j++, k++) {
@@ -507,10 +533,15 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
           }
           byte += stride - 16;
         }
-
-        dst1 = cpi->alt_ref_buffer.u_buffer;
-        dst2 = cpi->alt_ref_buffer.v_buffer;
-        stride = cpi->alt_ref_buffer.uv_stride;
+#if CONFIG_BGSPRITE
+        dst1 = target->u_buffer;
+        dst2 = target->v_buffer;
+        stride = target->uv_stride;
+#else
+      dst1 = cpi->alt_ref_buffer.u_buffer;
+      dst2 = cpi->alt_ref_buffer.v_buffer;
+      stride = cpi->alt_ref_buffer.uv_stride;
+#endif  // CONFIG_BGSPRITE
         byte = mb_uv_offset;
         for (i = 0, k = 256; i < mb_uv_height; i++) {
           for (j = 0; j < mb_uv_width; j++, k++) {
@@ -604,7 +635,7 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
 
 void av1_temporal_filter(AV1_COMP *cpi,
 #if CONFIG_BGSPRITE
-                         YV12_BUFFER_CONFIG *bg,
+                         YV12_BUFFER_CONFIG *bg, YV12_BUFFER_CONFIG *target,
 #endif  // CONFIG_BGSPRITE
                          int distance) {
   RATE_CONTROL *const rc = &cpi->rc;
@@ -618,7 +649,7 @@ void av1_temporal_filter(AV1_COMP *cpi,
   YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
 #if CONFIG_EXT_REFS
   const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-#endif
+#endif  // CONFIG_EXT_REFS
 
   // Apply context specific adjustments to the arnr filter parameters.
   adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
@@ -627,19 +658,34 @@ void av1_temporal_filter(AV1_COMP *cpi,
 //                   case it is more beneficial to use non-zero strength
 //                   filtering.
 #if CONFIG_EXT_REFS
-  if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) {
+  if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
     strength = 0;
     frames_to_blur = 1;
   }
-#endif
 
-#if CONFIG_EXT_REFS
-  if (strength == 0 && frames_to_blur == 1) {
-    cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 1;
-  } else {
-    cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 0;
+  int which_arf = gf_group->arf_update_idx[gf_group->index];
+
+#if USE_GF16_MULTI_LAYER
+  if (cpi->rc.baseline_gf_interval == 16) {
+    // Identify the index to the current ARF.
+    const int num_arfs_in_gf = cpi->num_extra_arfs + 1;
+    int arf_idx;
+    for (arf_idx = 0; arf_idx < num_arfs_in_gf; arf_idx++) {
+      if (gf_group->index == cpi->arf_pos_in_gf[arf_idx]) {
+        which_arf = arf_idx;
+        break;
+      }
+    }
+    assert(arf_idx < num_arfs_in_gf);
   }
-#endif
+#endif  // USE_GF16_MULTI_LAYER
+
+  // Set the temporal filtering status for the corresponding OVERLAY frame
+  if (strength == 0 && frames_to_blur == 1)
+    cpi->is_arf_filter_off[which_arf] = 1;
+  else
+    cpi->is_arf_filter_off[which_arf] = 0;
+#endif  // CONFIG_EXT_REFS
 
   frames_to_blur_backward = (frames_to_blur / 2);
   frames_to_blur_forward = ((frames_to_blur - 1) / 2);
@@ -678,6 +724,10 @@ void av1_temporal_filter(AV1_COMP *cpi,
 #endif  // CONFIG_HIGHBITDEPTH
   }
 
-  temporal_filter_iterate_c(cpi, frames, frames_to_blur,
-                            frames_to_blur_backward, strength, &sf);
+  temporal_filter_iterate_c(cpi,
+#if CONFIG_BGSPRITE
+                            target,
+#endif  // CONFIG_BGSPRITE
+                            frames, frames_to_blur, frames_to_blur_backward,
+                            strength, &sf);
 }
diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h
index ebb24703f..7dd9fad58 100644
--- a/third_party/aom/av1/encoder/temporal_filter.h
+++ b/third_party/aom/av1/encoder/temporal_filter.h
@@ -18,7 +18,7 @@ extern "C" {
 
 void av1_temporal_filter(AV1_COMP *cpi,
 #if CONFIG_BGSPRITE
-                         YV12_BUFFER_CONFIG *bg,
+                         YV12_BUFFER_CONFIG *bg, YV12_BUFFER_CONFIG *target,
 #endif  // CONFIG_BGSPRITE
                          int distance);
 
diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c
index b9db891b3..a2e24d66b 100644
--- a/third_party/aom/av1/encoder/tokenize.c
+++ b/third_party/aom/av1/encoder/tokenize.c
@@ -315,36 +315,30 @@ static INLINE void add_token(TOKENEXTRA **t,
   (*t)->eob_val = eob_val;
   (*t)->first_val = first_val;
   (*t)++;
+
+  if (token == BLOCK_Z_TOKEN) {
+    update_cdf(*head_cdf, 0, HEAD_TOKENS + 1);
+  } else {
+    if (eob_val != LAST_EOB) {
+      const int symb = 2 * AOMMIN(token, TWO_TOKEN) - eob_val + first_val;
+      update_cdf(*head_cdf, symb, HEAD_TOKENS + first_val);
+    }
+    if (token > ONE_TOKEN)
+      update_cdf(*tail_cdf, token - TWO_TOKEN, TAIL_TOKENS);
+  }
 }
 #endif  // !CONFIG_PVQ || CONFIG_VAR_TX
 
-#if CONFIG_PALETTE
-void av1_tokenize_palette_sb(const AV1_COMP *cpi,
-                             const struct ThreadData *const td, int plane,
-                             TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                             int *rate) {
-  assert(plane == 0 || plane == 1);
-  const MACROBLOCK *const x = &td->mb;
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const uint8_t *const color_map = xd->plane[plane].color_index_map;
-  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  aom_cdf_prob(
-      *palette_cdf)[PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] =
-      plane ? xd->tile_ctx->palette_uv_color_index_cdf
-            : xd->tile_ctx->palette_y_color_index_cdf;
-  int plane_block_width, rows, cols;
-  av1_get_block_dimensions(bsize, plane, xd, &plane_block_width, NULL, &rows,
-                           &cols);
+static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t,
+                                 int calc_rate) {
+  const uint8_t *const color_map = param->color_map;
+  MapCdf map_cdf = param->map_cdf;
+  ColorCost color_cost = param->color_cost;
+  const int plane_block_width = param->plane_width;
+  const int rows = param->rows;
+  const int cols = param->cols;
+  const int n = param->n_colors;
 
-  // The first color index does not use context or entropy.
-  (*t)->token = color_map[0];
-  (*t)->palette_cdf = NULL;
-  (*t)->skip_eob_node = 0;
-  ++(*t);
-
-  const int n = pmi->palette_size[plane];
-  const int calc_rate = rate && dry_run == DRY_RUN_COSTCOEFFS;
   int this_rate = 0;
   uint8_t color_order[PALETTE_MAX_SIZE];
 #if CONFIG_PALETTE_THROUGHPUT
@@ -360,18 +354,99 @@ void av1_tokenize_palette_sb(const AV1_COMP *cpi,
           color_map, plane_block_width, i, j, n, color_order, &color_new_idx);
       assert(color_new_idx >= 0 && color_new_idx < n);
       if (calc_rate) {
-        this_rate += cpi->palette_y_color_cost[n - PALETTE_MIN_SIZE][color_ctx]
-                                              [color_new_idx];
+        this_rate +=
+            (*color_cost)[n - PALETTE_MIN_SIZE][color_ctx][color_new_idx];
+      } else {
+        (*t)->token = color_new_idx;
+        (*t)->color_map_cdf = map_cdf[n - PALETTE_MIN_SIZE][color_ctx];
+        ++(*t);
       }
-      (*t)->token = color_new_idx;
-      (*t)->palette_cdf = palette_cdf[n - PALETTE_MIN_SIZE][color_ctx];
-      (*t)->skip_eob_node = 0;
-      ++(*t);
     }
   }
-  if (rate) *rate += this_rate;
+  if (calc_rate) return this_rate;
+  return 0;
+}
+
+static void get_palette_params(const MACROBLOCK *const x, int plane,
+                               BLOCK_SIZE bsize, Av1ColorMapParam *params) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  params->color_map = xd->plane[plane].color_index_map;
+  params->map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf
+                          : xd->tile_ctx->palette_y_color_index_cdf;
+  params->color_cost =
+      plane ? &x->palette_uv_color_cost : &x->palette_y_color_cost;
+  params->n_colors = pmi->palette_size[plane];
+  av1_get_block_dimensions(bsize, plane, xd, &params->plane_width, NULL,
+                           &params->rows, &params->cols);
+}
+
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+static void get_mrc_params(const MACROBLOCK *const x, int block,
+                           TX_SIZE tx_size, Av1ColorMapParam *params) {
+  memset(params, 0, sizeof(*params));
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const int is_inter = is_inter_block(mbmi);
+  params->color_map = BLOCK_OFFSET(xd->mrc_mask, block);
+  params->map_cdf = is_inter ? xd->tile_ctx->mrc_mask_inter_cdf
+                             : xd->tile_ctx->mrc_mask_intra_cdf;
+  params->color_cost =
+      is_inter ? &x->mrc_mask_inter_cost : &x->mrc_mask_intra_cost;
+  params->n_colors = 2;
+  params->plane_width = tx_size_wide[tx_size];
+  params->rows = tx_size_high[tx_size];
+  params->cols = tx_size_wide[tx_size];
+}
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+
+static void get_color_map_params(const MACROBLOCK *const x, int plane,
+                                 int block, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                 COLOR_MAP_TYPE type,
+                                 Av1ColorMapParam *params) {
+  (void)block;
+  (void)tx_size;
+  memset(params, 0, sizeof(*params));
+  switch (type) {
+    case PALETTE_MAP: get_palette_params(x, plane, bsize, params); break;
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+    case MRC_MAP: get_mrc_params(x, block, tx_size, params); break;
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+    default: assert(0 && "Invalid color map type"); return;
+  }
+}
+
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, int block,
+                       BLOCK_SIZE bsize, TX_SIZE tx_size, COLOR_MAP_TYPE type) {
+  assert(plane == 0 || plane == 1);
+  Av1ColorMapParam color_map_params;
+  get_color_map_params(x, plane, block, bsize, tx_size, type,
+                       &color_map_params);
+  return cost_and_tokenize_map(&color_map_params, NULL, 1);
+}
+
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, int block,
+                            TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            COLOR_MAP_TYPE type) {
+  assert(plane == 0 || plane == 1);
+#if CONFIG_MRC_TX
+  if (type == MRC_MAP) {
+    const int is_inter = is_inter_block(&x->e_mbd.mi[0]->mbmi);
+    if ((is_inter && !SIGNAL_MRC_MASK_INTER) ||
+        (!is_inter && !SIGNAL_MRC_MASK_INTRA))
+      return;
+  }
+#endif  // CONFIG_MRC_TX
+  Av1ColorMapParam color_map_params;
+  get_color_map_params(x, plane, block, bsize, tx_size, type,
+                       &color_map_params);
+  // The first color index does not use context or entropy.
+  (*t)->token = color_map_params.color_map[0];
+  (*t)->color_map_cdf = NULL;
+  ++(*t);
+  cost_and_tokenize_map(&color_map_params, t, 0);
 }
-#endif  // CONFIG_PALETTE
 
 #if CONFIG_PVQ
 static void add_pvq_block(AV1_COMMON *const cm, MACROBLOCK *const x,
@@ -410,7 +485,7 @@ static void tokenize_pvq(int plane, int block, int blk_row, int blk_col,
 
   assert(block < MAX_PVQ_BLOCKS_IN_SB);
   pvq_info = &x->pvq[block][plane];
-  add_pvq_block((AV1_COMMON * const)cm, x, pvq_info);
+  add_pvq_block((AV1_COMMON * const) cm, x, pvq_info);
 }
 #endif  // CONFIG_PVQ
 
@@ -444,8 +519,6 @@ static void tokenize_b(int plane, int block, int blk_row, int blk_col,
       av1_get_tx_type(type, xd, blk_row, blk_col, block, tx_size);
   const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi);
   const int ref = is_inter_block(mbmi);
-  unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
-      td->rd_counts.coef_counts[txsize_sqr_map[tx_size]][type][ref];
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   aom_cdf_prob(
       *const coef_head_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] =
@@ -453,13 +526,9 @@ static void tokenize_b(int plane, int block, int blk_row, int blk_col,
   aom_cdf_prob(
       *const coef_tail_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] =
       ec_ctx->coef_tail_cdfs[txsize_sqr_map[tx_size]][type][ref];
-  unsigned int(*const blockz_count)[2] =
-      td->counts->blockz_count[txsize_sqr_map[tx_size]][type][ref];
   int eob_val;
   int first_val = 1;
-  const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
-  unsigned int(*const eob_branch)[COEFF_CONTEXTS] =
-      td->counts->eob_branch[txsize_sqr_map[tx_size]][type][ref];
+  const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
   const uint8_t *const band = get_band_translate(tx_size);
   int16_t token;
   EXTRABIT extra;
@@ -470,12 +539,15 @@ static void tokenize_b(int plane, int block, int blk_row, int blk_col,
   nb = scan_order->neighbors;
   c = 0;
 
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+  if (tx_type == MRC_DCT)
+    av1_tokenize_color_map(x, plane, block, &t, plane_bsize, tx_size, MRC_MAP);
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+
   if (eob == 0)
     add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt], 1,
               1, 0, BLOCK_Z_TOKEN);
 
-  ++blockz_count[pt][eob != 0];
-
   while (c < eob) {
     int v = qcoeff[scan[c]];
     first_val = (c == 0);
@@ -483,23 +555,13 @@ static void tokenize_b(int plane, int block, int blk_row, int blk_col,
     if (!v) {
       add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt],
                 0, first_val, 0, ZERO_TOKEN);
-      ++counts[band[c]][pt][ZERO_TOKEN];
       token_cache[scan[c]] = 0;
     } else {
       eob_val =
           (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
-
       av1_get_token_extra(v, &token, &extra);
-
       add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt],
                 eob_val, first_val, extra, (uint8_t)token);
-
-      if (eob_val != LAST_EOB) {
-        ++counts[band[c]][pt][token];
-        ++eob_branch[band[c]][pt];
-        counts[band[c]][pt][EOB_TOKEN] += eob_val != NO_EOB;
-      }
-
       token_cache[scan[c]] = av1_pt_energy_class[token];
     }
     ++c;
@@ -673,7 +735,7 @@ void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
     if (!is_chroma_reference(mi_row, mi_col, bsize,
                              xd->plane[plane].subsampling_x,
                              xd->plane[plane].subsampling_y)) {
-#if !CONFIG_PVQ || !CONFIG_LV_MAP
+#if !CONFIG_PVQ && !CONFIG_LV_MAP
       if (!dry_run) {
         (*t)->token = EOSB_TOKEN;
         (*t)++;
@@ -691,7 +753,8 @@ void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
 #endif
     const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
     const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
-    const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize);
+    const TX_SIZE max_tx_size = get_vartx_max_txsize(
+        mbmi, plane_bsize, pd->subsampling_x || pd->subsampling_y);
     const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
     int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
     int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];
diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h
index 73f0305fa..20000e502 100644
--- a/third_party/aom/av1/encoder/tokenize.h
+++ b/third_party/aom/av1/encoder/tokenize.h
@@ -37,15 +37,12 @@ typedef struct {
 typedef struct {
   aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)];
   aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)];
-#if CONFIG_PALETTE
-  aom_cdf_prob *palette_cdf;
-#endif  // CONFIG_PALETTE
+  aom_cdf_prob *color_map_cdf;
   int eob_val;
   int first_val;
   const aom_prob *context_tree;
   EXTRABIT extra;
   uint8_t token;
-  uint8_t skip_eob_node;
 } TOKENEXTRA;
 
 extern const aom_tree_index av1_coef_tree[];
@@ -77,12 +74,14 @@ void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td,
                            TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
                            int mi_col, BLOCK_SIZE bsize, int *rate);
 #endif
-#if CONFIG_PALETTE
-void av1_tokenize_palette_sb(const struct AV1_COMP *cpi,
-                             const struct ThreadData *const td, int plane,
-                             TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
-                             int *rate);
-#endif  // CONFIG_PALETTE
+
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, int block,
+                       BLOCK_SIZE bsize, TX_SIZE tx_size, COLOR_MAP_TYPE type);
+
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, int block,
+                            TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            COLOR_MAP_TYPE type);
+
 void av1_tokenize_sb(const struct AV1_COMP *cpi, struct ThreadData *td,
                      TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
                      int *rate, const int mi_row, const int mi_col);
@@ -139,13 +138,11 @@ static INLINE int av1_get_token_cost(int v, int16_t *token, int cat6_bits) {
   return av1_dct_cat_lt_10_value_cost[v];
 }
 
-#if !CONFIG_PVQ || CONFIG_VAR_TX
-static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
-                             TX_SIZE tx_size) {
+static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id,
+                                 TX_SIZE tx_size) {
   const int eob_max = tx_size_2d[tx_size];
   return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
-#endif
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
index 1c0a120ca..078a67510 100644
--- a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
@@ -16,24 +16,24 @@
 #include "aom_dsp/aom_dsp_common.h"
 
 static INLINE void read_coeff(const tran_low_t *coeff, __m256i *c) {
-#if CONFIG_HIGHBITDEPTH
-  const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff);
-  const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1);
-  *c = _mm256_packs_epi32(x0, x1);
-  *c = _mm256_permute4x64_epi64(*c, 0xD8);
-#else
-  *c = _mm256_loadu_si256((const __m256i *)coeff);
-#endif
+  if (sizeof(tran_low_t) == 4) {
+    const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff);
+    const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1);
+    *c = _mm256_packs_epi32(x0, x1);
+    *c = _mm256_permute4x64_epi64(*c, 0xD8);
+  } else {
+    *c = _mm256_loadu_si256((const __m256i *)coeff);
+  }
 }
 
 static INLINE void write_zero(tran_low_t *qcoeff) {
   const __m256i zero = _mm256_setzero_si256();
-#if CONFIG_HIGHBITDEPTH
-  _mm256_storeu_si256((__m256i *)qcoeff, zero);
-  _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
-#else
-  _mm256_storeu_si256((__m256i *)qcoeff, zero);
-#endif
+  if (sizeof(tran_low_t) == 4) {
+    _mm256_storeu_si256((__m256i *)qcoeff, zero);
+    _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
+  } else {
+    _mm256_storeu_si256((__m256i *)qcoeff, zero);
+  }
 }
 
 static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
@@ -83,19 +83,16 @@ static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) {
     _mm256_storeu_si256((__m256i *)addr + 1, x1);         \
   } while (0)
 
-#if CONFIG_HIGHBITDEPTH
-#define store_two_quan(q, addr1, dq, addr2) \
-  do {                                      \
-    store_quan(q, addr1);                   \
-    store_quan(dq, addr2);                  \
-  } while (0)
-#else
-#define store_two_quan(q, addr1, dq, addr2)    \
-  do {                                         \
-    _mm256_storeu_si256((__m256i *)addr1, q);  \
-    _mm256_storeu_si256((__m256i *)addr2, dq); \
+#define store_two_quan(q, addr1, dq, addr2)      \
+  do {                                           \
+    if (sizeof(tran_low_t) == 4) {               \
+      store_quan(q, addr1);                      \
+      store_quan(dq, addr2);                     \
+    } else {                                     \
+      _mm256_storeu_si256((__m256i *)addr1, q);  \
+      _mm256_storeu_si256((__m256i *)addr2, dq); \
+    }                                            \
   } while (0)
-#endif
 
 static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
                             const int16_t *iscan_ptr, tran_low_t *qcoeff,
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
index 190317389..4f7c09546 100644
--- a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
@@ -18,53 +18,53 @@
 static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
                               __m128i *c0, __m128i *c1) {
   const tran_low_t *addr = coeff + offset;
-#if CONFIG_HIGHBITDEPTH
-  const __m128i x0 = _mm_load_si128((const __m128i *)addr);
-  const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1);
-  const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2);
-  const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3);
-  *c0 = _mm_packs_epi32(x0, x1);
-  *c1 = _mm_packs_epi32(x2, x3);
-#else
-  *c0 = _mm_load_si128((const __m128i *)addr);
-  *c1 = _mm_load_si128((const __m128i *)addr + 1);
-#endif
+  if (sizeof(tran_low_t) == 4) {
+    const __m128i x0 = _mm_load_si128((const __m128i *)addr);
+    const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1);
+    const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2);
+    const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3);
+    *c0 = _mm_packs_epi32(x0, x1);
+    *c1 = _mm_packs_epi32(x2, x3);
+  } else {
+    *c0 = _mm_load_si128((const __m128i *)addr);
+    *c1 = _mm_load_si128((const __m128i *)addr + 1);
+  }
 }
 
 static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1,
                                 tran_low_t *qcoeff, intptr_t offset) {
   tran_low_t *addr = qcoeff + offset;
-#if CONFIG_HIGHBITDEPTH
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero);
-  __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits);
-  __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits);
-  _mm_store_si128((__m128i *)addr, y0);
-  _mm_store_si128((__m128i *)addr + 1, y1);
-
-  sign_bits = _mm_cmplt_epi16(*qc1, zero);
-  y0 = _mm_unpacklo_epi16(*qc1, sign_bits);
-  y1 = _mm_unpackhi_epi16(*qc1, sign_bits);
-  _mm_store_si128((__m128i *)addr + 2, y0);
-  _mm_store_si128((__m128i *)addr + 3, y1);
-#else
-  _mm_store_si128((__m128i *)addr, *qc0);
-  _mm_store_si128((__m128i *)addr + 1, *qc1);
-#endif
+  if (sizeof(tran_low_t) == 4) {
+    const __m128i zero = _mm_setzero_si128();
+    __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero);
+    __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits);
+    __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits);
+    _mm_store_si128((__m128i *)addr, y0);
+    _mm_store_si128((__m128i *)addr + 1, y1);
+
+    sign_bits = _mm_cmplt_epi16(*qc1, zero);
+    y0 = _mm_unpacklo_epi16(*qc1, sign_bits);
+    y1 = _mm_unpackhi_epi16(*qc1, sign_bits);
+    _mm_store_si128((__m128i *)addr + 2, y0);
+    _mm_store_si128((__m128i *)addr + 3, y1);
+  } else {
+    _mm_store_si128((__m128i *)addr, *qc0);
+    _mm_store_si128((__m128i *)addr + 1, *qc1);
+  }
 }
 
 static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) {
   const __m128i zero = _mm_setzero_si128();
   tran_low_t *addr = qcoeff + offset;
-#if CONFIG_HIGHBITDEPTH
-  _mm_store_si128((__m128i *)addr, zero);
-  _mm_store_si128((__m128i *)addr + 1, zero);
-  _mm_store_si128((__m128i *)addr + 2, zero);
-  _mm_store_si128((__m128i *)addr + 3, zero);
-#else
-  _mm_store_si128((__m128i *)addr, zero);
-  _mm_store_si128((__m128i *)addr + 1, zero);
-#endif
+  if (sizeof(tran_low_t) == 4) {
+    _mm_store_si128((__m128i *)addr, zero);
+    _mm_store_si128((__m128i *)addr + 1, zero);
+    _mm_store_si128((__m128i *)addr + 2, zero);
+    _mm_store_si128((__m128i *)addr + 3, zero);
+  } else {
+    _mm_store_si128((__m128i *)addr, zero);
+    _mm_store_si128((__m128i *)addr + 1, zero);
+  }
 }
 
 void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
diff --git a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
index 496c33395..e5b19a44c 100644
--- a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
+++ b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c
@@ -205,7 +205,7 @@ static void fidtx4_sse2(__m128i *in) {
 void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
                      TxfmParam *txfm_param) {
   __m128i in[4];
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif
@@ -308,447 +308,6 @@ void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
   }
 }
 
-void av1_fdct8x8_quant_sse2(const int16_t *input, int stride,
-                            int16_t *coeff_ptr, intptr_t n_coeffs,
-                            int skip_block, const int16_t *zbin_ptr,
-                            const int16_t *round_ptr, const int16_t *quant_ptr,
-                            const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
-                            int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                            uint16_t *eob_ptr, const int16_t *scan_ptr,
-                            const int16_t *iscan_ptr) {
-  __m128i zero;
-  int pass;
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  // Load input
-  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-  __m128i *in[8];
-  int index = 0;
-
-  (void)scan_ptr;
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
-  (void)coeff_ptr;
-
-  // Pre-condition input (shift by two)
-  in0 = _mm_slli_epi16(in0, 2);
-  in1 = _mm_slli_epi16(in1, 2);
-  in2 = _mm_slli_epi16(in2, 2);
-  in3 = _mm_slli_epi16(in3, 2);
-  in4 = _mm_slli_epi16(in4, 2);
-  in5 = _mm_slli_epi16(in5, 2);
-  in6 = _mm_slli_epi16(in6, 2);
-  in7 = _mm_slli_epi16(in7, 2);
-
-  in[0] = &in0;
-  in[1] = &in1;
-  in[2] = &in2;
-  in[3] = &in3;
-  in[4] = &in4;
-  in[5] = &in5;
-  in[6] = &in6;
-  in[7] = &in7;
-
-  // We do two passes, first the columns, then the rows. The results of the
-  // first pass are transposed so that the same column code can be reused. The
-  // results of the second pass are also transposed so that the rows (processed
-  // as columns) are put back in row positions.
-  for (pass = 0; pass < 2; pass++) {
-    // To store results of each pass before the transpose.
-    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-    // Add/subtract
-    const __m128i q0 = _mm_add_epi16(in0, in7);
-    const __m128i q1 = _mm_add_epi16(in1, in6);
-    const __m128i q2 = _mm_add_epi16(in2, in5);
-    const __m128i q3 = _mm_add_epi16(in3, in4);
-    const __m128i q4 = _mm_sub_epi16(in3, in4);
-    const __m128i q5 = _mm_sub_epi16(in2, in5);
-    const __m128i q6 = _mm_sub_epi16(in1, in6);
-    const __m128i q7 = _mm_sub_epi16(in0, in7);
-    // Work on first four results
-    {
-      // Add/subtract
-      const __m128i r0 = _mm_add_epi16(q0, q3);
-      const __m128i r1 = _mm_add_epi16(q1, q2);
-      const __m128i r2 = _mm_sub_epi16(q1, q2);
-      const __m128i r3 = _mm_sub_epi16(q0, q3);
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
-      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
-      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
-      // dct_const_round_shift
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-      // Combine
-      res0 = _mm_packs_epi32(w0, w1);
-      res4 = _mm_packs_epi32(w2, w3);
-      res2 = _mm_packs_epi32(w4, w5);
-      res6 = _mm_packs_epi32(w6, w7);
-    }
-    // Work on next four results
-    {
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
-      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
-      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
-      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
-      // dct_const_round_shift
-      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
-      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
-      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
-      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
-      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
-      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
-      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
-      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
-      // Combine
-      const __m128i r0 = _mm_packs_epi32(s0, s1);
-      const __m128i r1 = _mm_packs_epi32(s2, s3);
-      // Add/subtract
-      const __m128i x0 = _mm_add_epi16(q4, r0);
-      const __m128i x1 = _mm_sub_epi16(q4, r0);
-      const __m128i x2 = _mm_sub_epi16(q7, r1);
-      const __m128i x3 = _mm_add_epi16(q7, r1);
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
-      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
-      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
-      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
-      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
-      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
-      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
-      // dct_const_round_shift
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-      // Combine
-      res1 = _mm_packs_epi32(w0, w1);
-      res7 = _mm_packs_epi32(w2, w3);
-      res5 = _mm_packs_epi32(w4, w5);
-      res3 = _mm_packs_epi32(w6, w7);
-    }
-    // Transpose the 8x8.
-    {
-      // 00 01 02 03 04 05 06 07
-      // 10 11 12 13 14 15 16 17
-      // 20 21 22 23 24 25 26 27
-      // 30 31 32 33 34 35 36 37
-      // 40 41 42 43 44 45 46 47
-      // 50 51 52 53 54 55 56 57
-      // 60 61 62 63 64 65 66 67
-      // 70 71 72 73 74 75 76 77
-      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
-      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
-      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
-      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
-      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
-      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
-      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
-      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
-      // 00 10 01 11 02 12 03 13
-      // 20 30 21 31 22 32 23 33
-      // 04 14 05 15 06 16 07 17
-      // 24 34 25 35 26 36 27 37
-      // 40 50 41 51 42 52 43 53
-      // 60 70 61 71 62 72 63 73
-      // 54 54 55 55 56 56 57 57
-      // 64 74 65 75 66 76 67 77
-      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-      // 00 10 20 30 01 11 21 31
-      // 40 50 60 70 41 51 61 71
-      // 02 12 22 32 03 13 23 33
-      // 42 52 62 72 43 53 63 73
-      // 04 14 24 34 05 15 21 36
-      // 44 54 64 74 45 55 61 76
-      // 06 16 26 36 07 17 27 37
-      // 46 56 66 76 47 57 67 77
-      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
-  }
-  // Post-condition output and store it
-  {
-    // Post-condition (division by two)
-    //    division of two 16 bits signed numbers using shifts
-    //    n / 2 = (n - (n >> 15)) >> 1
-    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
-    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
-    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
-    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
-    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
-    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
-    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
-    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
-    in0 = _mm_sub_epi16(in0, sign_in0);
-    in1 = _mm_sub_epi16(in1, sign_in1);
-    in2 = _mm_sub_epi16(in2, sign_in2);
-    in3 = _mm_sub_epi16(in3, sign_in3);
-    in4 = _mm_sub_epi16(in4, sign_in4);
-    in5 = _mm_sub_epi16(in5, sign_in5);
-    in6 = _mm_sub_epi16(in6, sign_in6);
-    in7 = _mm_sub_epi16(in7, sign_in7);
-    in0 = _mm_srai_epi16(in0, 1);
-    in1 = _mm_srai_epi16(in1, 1);
-    in2 = _mm_srai_epi16(in2, 1);
-    in3 = _mm_srai_epi16(in3, 1);
-    in4 = _mm_srai_epi16(in4, 1);
-    in5 = _mm_srai_epi16(in5, 1);
-    in6 = _mm_srai_epi16(in6, 1);
-    in7 = _mm_srai_epi16(in7, 1);
-  }
-
-  iscan_ptr += n_coeffs;
-  qcoeff_ptr += n_coeffs;
-  dqcoeff_ptr += n_coeffs;
-  n_coeffs = -n_coeffs;
-  zero = _mm_setzero_si128();
-
-  if (!skip_block) {
-    __m128i eob;
-    __m128i round, quant, dequant;
-    {
-      __m128i coeff0, coeff1;
-
-      // Setup global values
-      {
-        round = _mm_load_si128((const __m128i *)round_ptr);
-        quant = _mm_load_si128((const __m128i *)quant_ptr);
-        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-      }
-
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        // Do DC and first 15 AC
-        coeff0 = *in[0];
-        coeff1 = *in[1];
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        round = _mm_unpackhi_epi64(round, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        quant = _mm_unpackhi_epi64(quant, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        dequant = _mm_unpackhi_epi64(dequant, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob = _mm_max_epi16(eob, eob1);
-      }
-      n_coeffs += 8 * 2;
-    }
-
-    // AC only loop
-    index = 2;
-    while (n_coeffs < 0) {
-      __m128i coeff0, coeff1;
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-
-        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
-        coeff0 = *in[index];
-        coeff1 = *in[index + 1];
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob0, eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob0 = _mm_max_epi16(eob0, eob1);
-        eob = _mm_max_epi16(eob, eob0);
-      }
-      n_coeffs += 8 * 2;
-      index += 2;
-    }
-
-    // Accumulate EOB
-    {
-      __m128i eob_shuffled;
-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      *eob_ptr = _mm_extract_epi16(eob, 1);
-    }
-  } else {
-    do {
-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
-      n_coeffs += 8 * 2;
-    } while (n_coeffs < 0);
-    *eob_ptr = 0;
-  }
-}
-
 // load 8x8 array
 static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
                                    int stride, int flipud, int fliplr) {
@@ -1307,7 +866,7 @@ static void fidtx8_sse2(__m128i *in) {
 void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
                      TxfmParam *txfm_param) {
   __m128i in[8];
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif
@@ -2344,7 +1903,7 @@ static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
 void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                        TxfmParam *txfm_param) {
   __m128i in0[16], in1[16];
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif
@@ -2564,7 +2123,7 @@ static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) {
 void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
                      TxfmParam *txfm_param) {
   __m128i in[8];
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif
@@ -2742,7 +2301,7 @@ static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) {
 void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride,
                      TxfmParam *txfm_param) {
   __m128i in[8];
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif
@@ -2886,7 +2445,7 @@ static void row_8x16_rounding(__m128i *in, int bits) {
 void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                       TxfmParam *txfm_param) {
   __m128i in[16];
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif
@@ -3071,7 +2630,7 @@ static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in,
 void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
                       TxfmParam *txfm_param) {
   __m128i in[16];
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif
@@ -3385,7 +2944,7 @@ static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
 void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
                        TxfmParam *txfm_param) {
   __m128i intl[16], intr[16], inbl[16], inbr[16];
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif
@@ -3578,7 +3137,7 @@ static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0,
 void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                        TxfmParam *txfm_param) {
   __m128i in0[16], in1[16], in2[16], in3[16];
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif
@@ -3822,7 +3381,7 @@ static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2,
 void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride,
                        TxfmParam *txfm_param) {
   __m128i in0[32], in1[32], in2[32], in3[32];
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "No 32x32 sse2 MRC_DCT implementation");
 #endif
diff --git a/third_party/aom/av1/encoder/x86/dct_ssse3.c b/third_party/aom/av1/encoder/x86/dct_ssse3.c
deleted file mode 100644
index 717a99af8..000000000
--- a/third_party/aom/av1/encoder/x86/dct_ssse3.c
+++ /dev/null
@@ -1,469 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#if defined(_MSC_VER) && _MSC_VER <= 1500
-// Need to include math.h before calling tmmintrin.h/intrin.h
-// in certain versions of MSVS.
-#include <math.h>
-#endif
-#include <tmmintrin.h>  // SSSE3
-
-#include "./av1_rtcd.h"
-#include "aom_dsp/x86/inv_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-void av1_fdct8x8_quant_ssse3(
-    const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs,
-    int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr,
-    const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
-    int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-    uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) {
-  __m128i zero;
-  int pass;
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  // Load input
-  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-  __m128i *in[8];
-  int index = 0;
-
-  (void)scan_ptr;
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
-  (void)coeff_ptr;
-
-  // Pre-condition input (shift by two)
-  in0 = _mm_slli_epi16(in0, 2);
-  in1 = _mm_slli_epi16(in1, 2);
-  in2 = _mm_slli_epi16(in2, 2);
-  in3 = _mm_slli_epi16(in3, 2);
-  in4 = _mm_slli_epi16(in4, 2);
-  in5 = _mm_slli_epi16(in5, 2);
-  in6 = _mm_slli_epi16(in6, 2);
-  in7 = _mm_slli_epi16(in7, 2);
-
-  in[0] = &in0;
-  in[1] = &in1;
-  in[2] = &in2;
-  in[3] = &in3;
-  in[4] = &in4;
-  in[5] = &in5;
-  in[6] = &in6;
-  in[7] = &in7;
-
-  // We do two passes, first the columns, then the rows. The results of the
-  // first pass are transposed so that the same column code can be reused. The
-  // results of the second pass are also transposed so that the rows (processed
-  // as columns) are put back in row positions.
-  for (pass = 0; pass < 2; pass++) {
-    // To store results of each pass before the transpose.
-    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-    // Add/subtract
-    const __m128i q0 = _mm_add_epi16(in0, in7);
-    const __m128i q1 = _mm_add_epi16(in1, in6);
-    const __m128i q2 = _mm_add_epi16(in2, in5);
-    const __m128i q3 = _mm_add_epi16(in3, in4);
-    const __m128i q4 = _mm_sub_epi16(in3, in4);
-    const __m128i q5 = _mm_sub_epi16(in2, in5);
-    const __m128i q6 = _mm_sub_epi16(in1, in6);
-    const __m128i q7 = _mm_sub_epi16(in0, in7);
-    // Work on first four results
-    {
-      // Add/subtract
-      const __m128i r0 = _mm_add_epi16(q0, q3);
-      const __m128i r1 = _mm_add_epi16(q1, q2);
-      const __m128i r2 = _mm_sub_epi16(q1, q2);
-      const __m128i r3 = _mm_sub_epi16(q0, q3);
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-
-      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
-      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
-      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
-      // dct_const_round_shift
-
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-
-      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-
-      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-      // Combine
-
-      res0 = _mm_packs_epi32(w0, w1);
-      res4 = _mm_packs_epi32(w2, w3);
-      res2 = _mm_packs_epi32(w4, w5);
-      res6 = _mm_packs_epi32(w6, w7);
-    }
-    // Work on next four results
-    {
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i d0 = _mm_sub_epi16(q6, q5);
-      const __m128i d1 = _mm_add_epi16(q6, q5);
-      const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
-      const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);
-
-      // Add/subtract
-      const __m128i x0 = _mm_add_epi16(q4, r0);
-      const __m128i x1 = _mm_sub_epi16(q4, r0);
-      const __m128i x2 = _mm_sub_epi16(q7, r1);
-      const __m128i x3 = _mm_add_epi16(q7, r1);
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
-      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
-      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
-      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
-      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
-      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
-      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
-      // dct_const_round_shift
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-      // Combine
-      res1 = _mm_packs_epi32(w0, w1);
-      res7 = _mm_packs_epi32(w2, w3);
-      res5 = _mm_packs_epi32(w4, w5);
-      res3 = _mm_packs_epi32(w6, w7);
-    }
-    // Transpose the 8x8.
-    {
-      // 00 01 02 03 04 05 06 07
-      // 10 11 12 13 14 15 16 17
-      // 20 21 22 23 24 25 26 27
-      // 30 31 32 33 34 35 36 37
-      // 40 41 42 43 44 45 46 47
-      // 50 51 52 53 54 55 56 57
-      // 60 61 62 63 64 65 66 67
-      // 70 71 72 73 74 75 76 77
-      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
-      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
-      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
-      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
-      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
-      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
-      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
-      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
-      // 00 10 01 11 02 12 03 13
-      // 20 30 21 31 22 32 23 33
-      // 04 14 05 15 06 16 07 17
-      // 24 34 25 35 26 36 27 37
-      // 40 50 41 51 42 52 43 53
-      // 60 70 61 71 62 72 63 73
-      // 54 54 55 55 56 56 57 57
-      // 64 74 65 75 66 76 67 77
-      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-      // 00 10 20 30 01 11 21 31
-      // 40 50 60 70 41 51 61 71
-      // 02 12 22 32 03 13 23 33
-      // 42 52 62 72 43 53 63 73
-      // 04 14 24 34 05 15 21 36
-      // 44 54 64 74 45 55 61 76
-      // 06 16 26 36 07 17 27 37
-      // 46 56 66 76 47 57 67 77
-      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
-  }
-  // Post-condition output and store it
-  {
-    // Post-condition (division by two)
-    //    division of two 16 bits signed numbers using shifts
-    //    n / 2 = (n - (n >> 15)) >> 1
-    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
-    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
-    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
-    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
-    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
-    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
-    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
-    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
-    in0 = _mm_sub_epi16(in0, sign_in0);
-    in1 = _mm_sub_epi16(in1, sign_in1);
-    in2 = _mm_sub_epi16(in2, sign_in2);
-    in3 = _mm_sub_epi16(in3, sign_in3);
-    in4 = _mm_sub_epi16(in4, sign_in4);
-    in5 = _mm_sub_epi16(in5, sign_in5);
-    in6 = _mm_sub_epi16(in6, sign_in6);
-    in7 = _mm_sub_epi16(in7, sign_in7);
-    in0 = _mm_srai_epi16(in0, 1);
-    in1 = _mm_srai_epi16(in1, 1);
-    in2 = _mm_srai_epi16(in2, 1);
-    in3 = _mm_srai_epi16(in3, 1);
-    in4 = _mm_srai_epi16(in4, 1);
-    in5 = _mm_srai_epi16(in5, 1);
-    in6 = _mm_srai_epi16(in6, 1);
-    in7 = _mm_srai_epi16(in7, 1);
-  }
-
-  iscan_ptr += n_coeffs;
-  qcoeff_ptr += n_coeffs;
-  dqcoeff_ptr += n_coeffs;
-  n_coeffs = -n_coeffs;
-  zero = _mm_setzero_si128();
-
-  if (!skip_block) {
-    __m128i eob;
-    __m128i round, quant, dequant, thr;
-    int16_t nzflag;
-    {
-      __m128i coeff0, coeff1;
-
-      // Setup global values
-      {
-        round = _mm_load_si128((const __m128i *)round_ptr);
-        quant = _mm_load_si128((const __m128i *)quant_ptr);
-        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-      }
-
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        // Do DC and first 15 AC
-        coeff0 = *in[0];
-        coeff1 = *in[1];
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        round = _mm_unpackhi_epi64(round, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        quant = _mm_unpackhi_epi64(quant, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        dequant = _mm_unpackhi_epi64(dequant, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob = _mm_max_epi16(eob, eob1);
-      }
-      n_coeffs += 8 * 2;
-    }
-
-    // AC only loop
-    index = 2;
-    thr = _mm_srai_epi16(dequant, 1);
-    while (n_coeffs < 0) {
-      __m128i coeff0, coeff1;
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-
-        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
-        coeff0 = *in[index];
-        coeff1 = *in[index + 1];
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
-                 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
-
-        if (nzflag) {
-          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
-          // Reinsert signs
-          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
-          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
-        } else {
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
-
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
-        }
-      }
-
-      if (nzflag) {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob0, eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob0 = _mm_max_epi16(eob0, eob1);
-        eob = _mm_max_epi16(eob, eob0);
-      }
-      n_coeffs += 8 * 2;
-      index += 2;
-    }
-
-    // Accumulate EOB
-    {
-      __m128i eob_shuffled;
-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      *eob_ptr = _mm_extract_epi16(eob, 1);
-    }
-  } else {
-    do {
-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
-      n_coeffs += 8 * 2;
-    } while (n_coeffs < 0);
-    *eob_ptr = 0;
-  }
-}
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
index 20ba4149c..6599630d0 100644
--- a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
+++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
@@ -17,14 +17,15 @@
 static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
                               __m256i *c) {
   const tran_low_t *addr = coeff + offset;
-#if CONFIG_HIGHBITDEPTH
-  const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
-  const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
-  const __m256i y = _mm256_packs_epi32(x0, x1);
-  *c = _mm256_permute4x64_epi64(y, 0xD8);
-#else
-  *c = _mm256_loadu_si256((const __m256i *)addr);
-#endif
+
+  if (sizeof(tran_low_t) == 4) {
+    const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
+    const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
+    const __m256i y = _mm256_packs_epi32(x0, x1);
+    *c = _mm256_permute4x64_epi64(y, 0xD8);
+  } else {
+    *c = _mm256_loadu_si256((const __m256i *)addr);
+  }
 }
 
 int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index cab36f2bd..b684f7a3a 100644
--- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -195,7 +195,7 @@ static void fadst4x4_sse4_1(__m128i *in, int bit) {
 }
 
 void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
-                               int input_stride, int tx_type, int bd) {
+                               int input_stride, TX_TYPE tx_type, int bd) {
   __m128i in[4];
   const TXFM_1D_CFG *row_cfg = NULL;
   const TXFM_1D_CFG *col_cfg = NULL;
@@ -926,7 +926,7 @@ static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
 }
 
 void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
-                               int tx_type, int bd) {
+                               TX_TYPE tx_type, int bd) {
   __m128i in[16], out[16];
   const TXFM_1D_CFG *row_cfg = NULL;
   const TXFM_1D_CFG *col_cfg = NULL;
@@ -1800,7 +1800,7 @@ static void write_buffer_16x16(const __m128i *in, int32_t *output) {
 }
 
 void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
-                                 int stride, int tx_type, int bd) {
+                                 int stride, TX_TYPE tx_type, int bd) {
   __m128i in[64], out[64];
   const TXFM_1D_CFG *row_cfg = NULL;
   const TXFM_1D_CFG *col_cfg = NULL;
diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index af8e9a5f4..88621c82b 100644
--- a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -916,7 +916,7 @@ static void fidtx16_avx2(__m256i *in) {
 void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
                        TxfmParam *txfm_param) {
   __m256i in[16];
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
 #endif
@@ -1516,7 +1516,7 @@ void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
                        TxfmParam *txfm_param) {
   __m256i in0[32];  // left 32 columns
   __m256i in1[32];  // right 32 columns
-  int tx_type = txfm_param->tx_type;
+  const TX_TYPE tx_type = txfm_param->tx_type;
 #if CONFIG_MRC_TX
   assert(tx_type != MRC_DCT && "No avx2 32x32 implementation of MRC_DCT");
 #endif