Diffstat (limited to 'third_party/aom/av1/encoder')
53 files changed, 6143 insertions, 7582 deletions
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c index e41c608b6..b2b410617 100644 --- a/third_party/aom/av1/encoder/aq_cyclicrefresh.c +++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c @@ -353,8 +353,8 @@ void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) { // frame because of the camera movement, set this frame as the golden frame. // Use 70% and 5% as the thresholds for golden frame refreshing. // Also, force this frame as a golden update frame if this frame will change - // the resolution (resize_pending != 0). - if (cpi->resize_pending != 0 || + // the resolution (av1_resize_pending != 0). + if (av1_resize_pending(cpi) || (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1)) { av1_cyclic_refresh_set_golden_update(cpi); rc->frames_till_gf_update_due = rc->baseline_gf_interval; diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c index 6cffac264..63727df1f 100644 --- a/third_party/aom/av1/encoder/av1_quantize.c +++ b/third_party/aom/av1/encoder/av1_quantize.c @@ -1594,50 +1594,48 @@ static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) { #endif } -void av1_init_quantizer(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; - QUANTS *const quants = &cpi->quants; +void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, + int uv_dc_delta_q, int uv_ac_delta_q, + QUANTS *const quants, Dequants *const deq) { int i, q, quant; -#if CONFIG_NEW_QUANT - int dq; -#endif for (q = 0; q < QINDEX_RANGE; q++) { - const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth); + const int qzbin_factor = get_qzbin_factor(q, bit_depth); const int qrounding_factor = q == 0 ? 64 : 48; for (i = 0; i < 2; ++i) { int qrounding_factor_fp = 64; // y - quant = i == 0 ? av1_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth) - : av1_ac_quant(q, 0, cm->bit_depth); + quant = i == 0 ? av1_dc_quant(q, y_dc_delta_q, bit_depth) + : av1_ac_quant(q, 0, bit_depth); invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant); quants->y_quant_fp[q][i] = (1 << 16) / quant; quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); quants->y_round[q][i] = (qrounding_factor * quant) >> 7; - cpi->y_dequant[q][i] = quant; + deq->y_dequant[q][i] = quant; // uv - quant = i == 0 ? av1_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth) - : av1_ac_quant(q, cm->uv_ac_delta_q, cm->bit_depth); + quant = i == 0 ? 
av1_dc_quant(q, uv_dc_delta_q, bit_depth) + : av1_ac_quant(q, uv_ac_delta_q, bit_depth); invert_quant(&quants->uv_quant[q][i], &quants->uv_quant_shift[q][i], quant); quants->uv_quant_fp[q][i] = (1 << 16) / quant; quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); quants->uv_round[q][i] = (qrounding_factor * quant) >> 7; - cpi->uv_dequant[q][i] = quant; + deq->uv_dequant[q][i] = quant; } #if CONFIG_NEW_QUANT + int dq; for (dq = 0; dq < QUANT_PROFILES; dq++) { for (i = 0; i < COEF_BANDS; i++) { - const int y_quant = cpi->y_dequant[q][i != 0]; - const int uvquant = cpi->uv_dequant[q][i != 0]; - av1_get_dequant_val_nuq(y_quant, i, cpi->y_dequant_val_nuq[dq][q][i], + const int y_quant = deq->y_dequant[q][i != 0]; + const int uvquant = deq->uv_dequant[q][i != 0]; + av1_get_dequant_val_nuq(y_quant, i, deq->y_dequant_val_nuq[dq][q][i], quants->y_cuml_bins_nuq[dq][q][i], dq); - av1_get_dequant_val_nuq(uvquant, i, cpi->uv_dequant_val_nuq[dq][q][i], + av1_get_dequant_val_nuq(uvquant, i, deq->uv_dequant_val_nuq[dq][q][i], quants->uv_cuml_bins_nuq[dq][q][i], dq); } } @@ -1650,7 +1648,7 @@ void av1_init_quantizer(AV1_COMP *cpi) { quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1]; quants->y_zbin[q][i] = quants->y_zbin[q][1]; quants->y_round[q][i] = quants->y_round[q][1]; - cpi->y_dequant[q][i] = cpi->y_dequant[q][1]; + deq->y_dequant[q][i] = deq->y_dequant[q][1]; quants->uv_quant[q][i] = quants->uv_quant[q][1]; quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1]; @@ -1658,11 +1656,19 @@ void av1_init_quantizer(AV1_COMP *cpi) { quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1]; quants->uv_zbin[q][i] = quants->uv_zbin[q][1]; quants->uv_round[q][i] = quants->uv_round[q][1]; - cpi->uv_dequant[q][i] = cpi->uv_dequant[q][1]; + deq->uv_dequant[q][i] = deq->uv_dequant[q][1]; } } } +void av1_init_quantizer(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + QUANTS *const quants = &cpi->quants; + Dequants *const dequants = &cpi->dequants; + av1_build_quantizer(cm->bit_depth, cm->y_dc_delta_q, cm->uv_dc_delta_q, + cm->uv_ac_delta_q, quants, dequants); +} + void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x, int segment_id) { const AV1_COMMON *const cm = &cpi->common; @@ -1712,11 +1718,12 @@ void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x, memcpy(&xd->plane[0].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][0], sizeof(cm->giqmatrix[qmlevel][0])); #endif - xd->plane[0].dequant = cpi->y_dequant[qindex]; + xd->plane[0].dequant = cpi->dequants.y_dequant[qindex]; #if CONFIG_NEW_QUANT for (dq = 0; dq < QUANT_PROFILES; dq++) { x->plane[0].cuml_bins_nuq[dq] = quants->y_cuml_bins_nuq[dq][qindex]; - xd->plane[0].dequant_val_nuq[dq] = cpi->y_dequant_val_nuq[dq][qindex]; + xd->plane[0].dequant_val_nuq[dq] = + cpi->dequants.y_dequant_val_nuq[dq][qindex]; } #endif // CONFIG_NEW_QUANT @@ -1734,11 +1741,12 @@ void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x, memcpy(&xd->plane[i].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][1], sizeof(cm->giqmatrix[qmlevel][1])); #endif - xd->plane[i].dequant = cpi->uv_dequant[qindex]; + xd->plane[i].dequant = cpi->dequants.uv_dequant[qindex]; #if CONFIG_NEW_QUANT for (dq = 0; dq < QUANT_PROFILES; dq++) { x->plane[i].cuml_bins_nuq[dq] = quants->uv_cuml_bins_nuq[dq][qindex]; - xd->plane[i].dequant_val_nuq[dq] = cpi->uv_dequant_val_nuq[dq][qindex]; + xd->plane[i].dequant_val_nuq[dq] = + cpi->dequants.uv_dequant_val_nuq[dq][qindex]; } #endif 
// CONFIG_NEW_QUANT } diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h index c87b6b7dc..4bc9cccc2 100644 --- a/third_party/aom/av1/encoder/av1_quantize.h +++ b/third_party/aom/av1/encoder/av1_quantize.h @@ -69,6 +69,17 @@ typedef struct { DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); } QUANTS; +typedef struct { + DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); // 8: SIMD width +#if CONFIG_NEW_QUANT + DECLARE_ALIGNED(16, dequant_val_type_nuq, + y_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]); + DECLARE_ALIGNED(16, dequant_val_type_nuq, + uv_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]); +#endif // CONFIG_NEW_QUANT +} Dequants; + struct AV1_COMP; struct AV1Common; @@ -77,6 +88,10 @@ void av1_frame_init_quantizer(struct AV1_COMP *cpi); void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x, int segment_id); +void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, + int uv_dc_delta_q, int uv_ac_delta_q, + QUANTS *const quants, Dequants *const deq); + void av1_init_quantizer(struct AV1_COMP *cpi); void av1_set_quantizer(struct AV1Common *cm, int q); diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c index 7cc6179ea..f8378b14d 100644 --- a/third_party/aom/av1/encoder/bitstream.c +++ b/third_party/aom/av1/encoder/bitstream.c @@ -63,21 +63,12 @@ static struct av1_token intra_mode_encodings[INTRA_MODES]; static struct av1_token switchable_interp_encodings[SWITCHABLE_FILTERS]; -#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EC_MULTISYMBOL -static const struct av1_token ext_partition_encodings[EXT_PARTITION_TYPES] = { - { 0, 1 }, { 4, 3 }, { 12, 4 }, { 7, 3 }, - { 10, 4 }, { 11, 4 }, { 26, 5 }, { 27, 5 } -}; -#endif static struct av1_token partition_encodings[PARTITION_TYPES]; -#if !CONFIG_REF_MV -static struct av1_token inter_mode_encodings[INTER_MODES]; -#endif #if CONFIG_EXT_INTER static const struct av1_token inter_compound_mode_encodings[INTER_COMPOUND_MODES] = { - { 2, 2 }, { 50, 6 }, { 51, 6 }, { 24, 5 }, { 52, 6 }, - { 53, 6 }, { 54, 6 }, { 55, 6 }, { 0, 1 }, { 7, 3 } + { 2, 2 }, { 12, 4 }, { 52, 6 }, { 53, 6 }, + { 54, 6 }, { 55, 6 }, { 0, 1 }, { 7, 3 } }; #endif // CONFIG_EXT_INTER #if CONFIG_PALETTE @@ -85,16 +76,6 @@ static struct av1_token palette_size_encodings[PALETTE_SIZES]; static struct av1_token palette_color_index_encodings[PALETTE_SIZES] [PALETTE_COLORS]; #endif // CONFIG_PALETTE -#if !CONFIG_EC_MULTISYMBOL -static const struct av1_token tx_size_encodings[MAX_TX_DEPTH][TX_SIZES] = { - { { 0, 1 }, { 1, 1 } }, // Max tx_size is 8X8 - { { 0, 1 }, { 2, 2 }, { 3, 2 } }, // Max tx_size is 16X16 - { { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 } }, // Max tx_size is 32X32 -#if CONFIG_TX64X64 - { { 0, 1 }, { 2, 2 }, { 6, 3 }, { 14, 4 }, { 15, 4 } }, // Max tx_size 64X64 -#endif // CONFIG_TX64X64 -}; -#endif #if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE static INLINE void write_uniform(aom_writer *w, int n, int v) { @@ -125,7 +106,9 @@ static struct av1_token intra_filter_encodings[INTRA_FILTERS]; #endif // CONFIG_INTRA_INTERP #endif // CONFIG_EXT_INTRA #if CONFIG_EXT_INTER +#if CONFIG_INTERINTRA static struct av1_token interintra_mode_encodings[INTERINTRA_MODES]; +#endif #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE static struct av1_token compound_type_encodings[COMPOUND_TYPES]; #endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE @@ -162,9 
+145,6 @@ void av1_encode_token_init(void) { av1_tokens_from_tree(intra_mode_encodings, av1_intra_mode_tree); av1_tokens_from_tree(switchable_interp_encodings, av1_switchable_interp_tree); av1_tokens_from_tree(partition_encodings, av1_partition_tree); -#if !CONFIG_REF_MV - av1_tokens_from_tree(inter_mode_encodings, av1_inter_mode_tree); -#endif #if CONFIG_PALETTE av1_tokens_from_tree(palette_size_encodings, av1_palette_size_tree); @@ -178,7 +158,9 @@ void av1_encode_token_init(void) { av1_tokens_from_tree(intra_filter_encodings, av1_intra_filter_tree); #endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP #if CONFIG_EXT_INTER +#if CONFIG_INTERINTRA av1_tokens_from_tree(interintra_mode_encodings, av1_interintra_mode_tree); +#endif // CONFIG_INTERINTRA #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE av1_tokens_from_tree(compound_type_encodings, av1_compound_type_tree); #endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE @@ -195,7 +177,6 @@ void av1_encode_token_init(void) { av1_switchable_restore_tree); #endif // CONFIG_LOOP_RESTORATION -#if CONFIG_EC_MULTISYMBOL /* This hack is necessary when CONFIG_DUAL_FILTER is enabled because the five SWITCHABLE_FILTERS are not consecutive, e.g., 0, 1, 2, 3, 4, when doing an in-order traversal of the av1_switchable_interp_tree structure. */ @@ -218,7 +199,6 @@ void av1_encode_token_init(void) { av1_intra_mode_tree); av1_indices_from_tree(av1_inter_mode_ind, av1_inter_mode_inv, av1_inter_mode_tree); -#endif } static void write_intra_mode_kf(const AV1_COMMON *cm, FRAME_CONTEXT *frame_ctx, @@ -228,37 +208,28 @@ static void write_intra_mode_kf(const AV1_COMMON *cm, FRAME_CONTEXT *frame_ctx, #if CONFIG_INTRABC assert(!is_intrabc_block(&mi->mbmi)); #endif // CONFIG_INTRABC -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, av1_intra_mode_ind[mode], get_y_mode_cdf(frame_ctx, mi, above_mi, left_mi, block), INTRA_MODES); (void)cm; -#else - av1_write_token(w, av1_intra_mode_tree, - get_y_mode_probs(cm, mi, above_mi, left_mi, block), - &intra_mode_encodings[mode]); - (void)frame_ctx; -#endif } -#if CONFIG_EXT_INTER +#if CONFIG_EXT_INTER && CONFIG_INTERINTRA static void write_interintra_mode(aom_writer *w, INTERINTRA_MODE mode, const aom_prob *probs) { av1_write_token(w, av1_interintra_mode_tree, probs, &interintra_mode_encodings[mode]); } -#endif // CONFIG_EXT_INTER +#endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA static void write_inter_mode(aom_writer *w, PREDICTION_MODE mode, FRAME_CONTEXT *ec_ctx, const int16_t mode_ctx) { -#if CONFIG_REF_MV const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; const aom_prob newmv_prob = ec_ctx->newmv_prob[newmv_ctx]; -#define IS_NEWMV_MODE(mode) ((mode) == NEWMV) - aom_write(w, !IS_NEWMV_MODE(mode), newmv_prob); + aom_write(w, mode != NEWMV, newmv_prob); - if (!IS_NEWMV_MODE(mode)) { + if (mode != NEWMV) { const int16_t zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; const aom_prob zeromv_prob = ec_ctx->zeromv_prob[zeromv_ctx]; @@ -281,25 +252,8 @@ static void write_inter_mode(aom_writer *w, PREDICTION_MODE mode, aom_write(w, mode != NEARESTMV, refmv_prob); } } - -#undef IS_NEWMV_MODE - -#else // !CONFIG_REF_MV - assert(is_inter_mode(mode)); -#if CONFIG_EC_MULTISYMBOL - aom_write_symbol(w, av1_inter_mode_ind[INTER_OFFSET(mode)], - ec_ctx->inter_mode_cdf[mode_ctx], INTER_MODES); -#else - { - const aom_prob *const inter_probs = ec_ctx->inter_mode_probs[mode_ctx]; - av1_write_token(w, av1_inter_mode_tree, inter_probs, - &inter_mode_encodings[INTER_OFFSET(mode)]); - } -#endif -#endif } -#if CONFIG_REF_MV static void 
write_drl_idx(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext, aom_writer *w) { uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); @@ -341,7 +295,6 @@ static void write_drl_idx(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, return; } } -#endif #if CONFIG_EXT_INTER static void write_inter_compound_mode(AV1_COMMON *cm, aom_writer *w, @@ -409,22 +362,22 @@ static void write_tx_size_vartx(const AV1_COMMON *cm, const MACROBLOCKD *xd, const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0); - int ctx = txfm_partition_context(xd->above_txfm_context + tx_col, - xd->left_txfm_context + tx_row, + int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, mbmi->sb_type, tx_size); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; if (depth == MAX_VARTX_DEPTH) { - txfm_partition_update(xd->above_txfm_context + tx_col, - xd->left_txfm_context + tx_row, tx_size, tx_size); + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); return; } if (tx_size == mbmi->inter_tx_size[tx_row][tx_col]) { aom_write(w, 0, cm->fc->txfm_partition_prob[ctx]); - txfm_partition_update(xd->above_txfm_context + tx_col, - xd->left_txfm_context + tx_row, tx_size, tx_size); + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsl = tx_size_wide_unit[sub_txs]; @@ -433,8 +386,8 @@ static void write_tx_size_vartx(const AV1_COMMON *cm, const MACROBLOCKD *xd, aom_write(w, 1, cm->fc->txfm_partition_prob[ctx]); if (tx_size == TX_8X8) { - txfm_partition_update(xd->above_txfm_context + tx_col, - xd->left_txfm_context + tx_row, sub_txs, tx_size); + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, sub_txs, tx_size); return; } @@ -482,22 +435,18 @@ static void write_selected_tx_size(const AV1_COMMON *cm, const MACROBLOCKD *xd, const int depth = tx_size_to_depth(coded_tx_size); #if CONFIG_EXT_TX && CONFIG_RECT_TX assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi))); - assert( - IMPLIES(is_rect_tx(tx_size), tx_size == max_txsize_rect_lookup[bsize])); #endif // CONFIG_EXT_TX && CONFIG_RECT_TX -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], tx_size_cat + 2); -#else - av1_write_token(w, av1_tx_size_tree[tx_size_cat], - ec_ctx->tx_size_probs[tx_size_cat][tx_size_ctx], - &tx_size_encodings[tx_size_cat][depth]); -#endif +#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT + if (is_quarter_tx_allowed(xd, mbmi, is_inter) && tx_size != coded_tx_size) + aom_write(w, tx_size == quarter_txsize_lookup[bsize], + cm->fc->quarter_tx_size_prob); +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT } } -#if CONFIG_REF_MV static void update_inter_mode_probs(AV1_COMMON *cm, aom_writer *w, FRAME_COUNTS *counts) { int i; @@ -519,7 +468,6 @@ static void update_inter_mode_probs(AV1_COMMON *cm, aom_writer *w, av1_cond_prob_diff_update(w, &cm->fc->drl_prob[i], counts->drl_mode[i], probwt); } -#endif #if CONFIG_EXT_INTER static void update_inter_compound_mode_probs(AV1_COMMON *cm, int probwt, @@ -598,17 +546,8 @@ static void write_delta_qindex(const AV1_COMMON *cm, const MACROBLOCKD *xd, (void)xd; #endif -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), 
ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1); -#else - int i = 0; - while (i < DELTA_Q_SMALL && i <= abs) { - int bit = (i < abs); - aom_write(w, bit, ec_ctx->delta_q_prob[i]); - i++; - } -#endif if (!smallval) { rem_bits = OD_ILOG_NZ(abs - 1) - 1; @@ -655,17 +594,8 @@ static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd, (void)xd; #endif -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf, DELTA_LF_PROBS + 1); -#else - int i = 0; - while (i < DELTA_LF_SMALL && i <= abs) { - int bit = (i < abs); - aom_write(w, bit, ec_ctx->delta_lf_prob[i]); - i++; - } -#endif // CONFIG_EC_MULTISYMBOL if (!smallval) { rem_bits = OD_ILOG_NZ(abs - 1) - 1; @@ -908,7 +838,7 @@ static INLINE void write_coeff_extra(const aom_prob *pb, int value, } #endif -#if CONFIG_NEW_TOKENSET && !CONFIG_LV_MAP +#if !CONFIG_LV_MAP static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp, const TOKENEXTRA *const stop, aom_bit_depth_t bit_depth, const TX_SIZE tx_size, @@ -921,18 +851,22 @@ static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp, while (p < stop && p->token != EOSB_TOKEN) { const int token = p->token; + const int eob_val = p->eob_val; if (token == BLOCK_Z_TOKEN) { aom_write_symbol(w, 0, *p->head_cdf, HEAD_TOKENS + 1); p++; +#if CONFIG_VAR_TX + break; +#endif continue; } const av1_extra_bit *const extra_bits = &av1_extra_bits[token]; - if (p->eob_val == LAST_EOB) { + if (eob_val == LAST_EOB) { // Just code a flag indicating whether the value is >1 or 1. aom_write_bit(w, token != ONE_TOKEN); } else { - int comb_symb = 2 * AOMMIN(token, TWO_TOKEN) - p->eob_val + p->first_val; + int comb_symb = 2 * AOMMIN(token, TWO_TOKEN) - eob_val + p->first_val; aom_write_symbol(w, comb_symb, *p->head_cdf, HEAD_TOKENS + p->first_val); } if (token > ONE_TOKEN) { @@ -966,104 +900,13 @@ static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp, #if CONFIG_VAR_TX ++count; - if (token == EOB_TOKEN || count == seg_eob) break; -#endif - } - - *tp = p; -} -#else // CONFIG_NEW_TOKENSET -#if !CONFIG_LV_MAP -static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp, - const TOKENEXTRA *const stop, - aom_bit_depth_t bit_depth, const TX_SIZE tx_size, - TOKEN_STATS *token_stats) { - const TOKENEXTRA *p = *tp; -#if CONFIG_VAR_TX - int count = 0; - const int seg_eob = tx_size_2d[tx_size]; -#endif - - while (p < stop && p->token != EOSB_TOKEN) { - const int token = p->token; -#if !CONFIG_EC_MULTISYMBOL - const struct av1_token *const coef_encoding = &av1_coef_encodings[token]; - int coef_value = coef_encoding->value; - int coef_length = coef_encoding->len; -#endif // !CONFIG_EC_MULTISYMBOL - const av1_extra_bit *const extra_bits = &av1_extra_bits[token]; - -#if CONFIG_EC_MULTISYMBOL - /* skip one or two nodes */ - if (!p->skip_eob_node) - aom_write_record(w, token != EOB_TOKEN, p->context_tree[0], token_stats); - if (token != EOB_TOKEN) { - aom_write_record(w, token != ZERO_TOKEN, p->context_tree[1], token_stats); - if (token != ZERO_TOKEN) { - aom_write_symbol(w, token - ONE_TOKEN, *p->token_cdf, - CATEGORY6_TOKEN - ONE_TOKEN + 1); - } - } -#else - /* skip one or two nodes */ - if (p->skip_eob_node) - coef_length -= p->skip_eob_node; - else - aom_write_record(w, token != EOB_TOKEN, p->context_tree[0], token_stats); - - if (token != EOB_TOKEN) { - aom_write_record(w, token != ZERO_TOKEN, p->context_tree[1], token_stats); - - if (token != ZERO_TOKEN) { - aom_write_record(w, token != ONE_TOKEN, p->context_tree[2], - token_stats); - - if (token != ONE_TOKEN) { - 
const int unconstrained_len = UNCONSTRAINED_NODES - p->skip_eob_node; - aom_write_tree_record( - w, av1_coef_con_tree, - av1_pareto8_full[p->context_tree[PIVOT_NODE] - 1], coef_value, - coef_length - unconstrained_len, 0, token_stats); - } - } - } -#endif // CONFIG_EC_MULTISYMBOL - - if (extra_bits->base_val) { - const int bit_string = p->extra; - const int bit_string_length = extra_bits->len; // Length of extra bits to - // be written excluding - // the sign bit. - int skip_bits = (extra_bits->base_val == CAT6_MIN_VAL) - ? (int)sizeof(av1_cat6_prob) - - av1_get_cat6_extrabits_size(tx_size, bit_depth) - : 0; - - assert(!(bit_string >> (bit_string_length - skip_bits + 1))); - if (bit_string_length > 0) { -#if CONFIG_NEW_MULTISYMBOL - skip_bits &= ~3; - write_coeff_extra(extra_bits->cdf, bit_string >> 1, - bit_string_length - skip_bits, w); -#else - write_coeff_extra(extra_bits->prob, bit_string >> 1, bit_string_length, - skip_bits, w, token_stats); -#endif - } - aom_write_bit_record(w, bit_string & 1, token_stats); - } - ++p; - -#if CONFIG_VAR_TX - ++count; - if (token == EOB_TOKEN || count == seg_eob) break; + if (eob_val == EARLY_EOB || count == seg_eob) break; #endif } *tp = p; } #endif // !CONFIG_LV_MAP -#endif // CONFIG_NEW_TOKENSET #else // !CONFIG_PVQ static PVQ_INFO *get_pvq_block(PVQ_QUEUE *pvq_q) { PVQ_INFO *pvq; @@ -1150,6 +993,80 @@ static void pack_pvq_tokens(aom_writer *w, MACROBLOCK *const x, #endif // !CONFIG_PVG #if CONFIG_VAR_TX && !CONFIG_COEF_INTERLEAVE +#if CONFIG_LV_MAP +static void pack_txb_tokens(aom_writer *w, +#if CONFIG_LV_MAP + AV1_COMMON *cm, +#endif // CONFIG_LV_MAP + const TOKENEXTRA **tp, + const TOKENEXTRA *const tok_end, +#if CONFIG_PVQ || CONFIG_LV_MAP + MACROBLOCK *const x, +#endif + MACROBLOCKD *xd, MB_MODE_INFO *mbmi, int plane, + BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, + int block, int blk_row, int blk_col, + TX_SIZE tx_size, TOKEN_STATS *token_stats) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + const int tx_row = blk_row >> (1 - pd->subsampling_y); + const int tx_col = blk_col >> (1 - pd->subsampling_x); + TX_SIZE plane_tx_size; + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + plane_tx_size = + plane ? 
uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0] + : mbmi->inter_tx_size[tx_row][tx_col]; + + if (tx_size == plane_tx_size) { + TOKEN_STATS tmp_token_stats; + init_token_stats(&tmp_token_stats); + +#if !CONFIG_PVQ + tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); + uint16_t eob = x->mbmi_ext->eobs[plane][block]; + TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block], + x->mbmi_ext->dc_sign_ctx[plane][block] }; + av1_write_coeffs_txb(cm, xd, w, block, plane, tcoeff, eob, &txb_ctx); +#else + pack_pvq_tokens(w, x, xd, plane, bsize, tx_size); +#endif +#if CONFIG_RD_DEBUG + token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost; + token_stats->cost += tmp_token_stats.cost; +#endif + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsl = tx_size_wide_unit[sub_txs]; + int i; + + assert(bsl > 0); + + for (i = 0; i < 4; ++i) { + const int offsetr = blk_row + (i >> 1) * bsl; + const int offsetc = blk_col + (i & 0x01) * bsl; + const int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; + + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + + pack_txb_tokens(w, +#if CONFIG_LV_MAP + cm, +#endif + tp, tok_end, +#if CONFIG_PVQ || CONFIG_LV_MAP + x, +#endif + xd, mbmi, plane, plane_bsize, bit_depth, block, offsetr, + offsetc, sub_txs, token_stats); + block += step; + } + } +} +#else // CONFIG_LV_MAP static void pack_txb_tokens(aom_writer *w, const TOKENEXTRA **tp, const TOKENEXTRA *const tok_end, #if CONFIG_PVQ @@ -1209,16 +1126,13 @@ static void pack_txb_tokens(aom_writer *w, const TOKENEXTRA **tp, } } } -#endif +#endif // CONFIG_LV_MAP +#endif // CONFIG_VAR_TX static void write_segment_id(aom_writer *w, const struct segmentation *seg, struct segmentation_probs *segp, int segment_id) { if (seg->enabled && seg->update_map) { -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, segment_id, segp->tree_cdf, MAX_SEGMENTS); -#else - aom_write_tree(w, av1_segment_tree, segp->tree_probs, segment_id, 3, 0); -#endif } } @@ -1242,7 +1156,7 @@ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd, #if SUB8X8_COMP_REF aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd)); #else - if (mbmi->sb_type >= BLOCK_8X8) + if (mbmi->sb_type != BLOCK_4X4) aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd)); #endif } else { @@ -1307,7 +1221,9 @@ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd, #if CONFIG_FILTER_INTRA static void write_filter_intra_mode_info(const AV1_COMMON *const cm, + const MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, + int mi_row, int mi_col, aom_writer *w) { if (mbmi->mode == DC_PRED #if CONFIG_PALETTE @@ -1323,6 +1239,17 @@ static void write_filter_intra_mode_info(const AV1_COMMON *const cm, } } +#if CONFIG_CB4X4 + if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type, + xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y)) + return; +#else + (void)xd; + (void)mi_row; + (void)mi_col; +#endif // CONFIG_CB4X4 + if (mbmi->uv_mode == DC_PRED #if CONFIG_PALETTE && mbmi->palette_mode_info.palette_size[1] == 0 @@ -1358,15 +1285,9 @@ static void write_intra_angle_info(const MACROBLOCKD *xd, #if CONFIG_INTRA_INTERP p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; if (av1_is_intra_filter_switchable(p_angle)) { -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, mbmi->intra_filter, ec_ctx->intra_filter_cdf[intra_filter_ctx], INTRA_FILTERS); -#else - av1_write_token(w, av1_intra_filter_tree, - 
ec_ctx->intra_filter_probs[intra_filter_ctx], - &intra_filter_encodings[mbmi->intra_filter]); -#endif // CONFIG_EC_MULTISYMBOL } #endif // CONFIG_INTRA_INTERP } @@ -1409,15 +1330,9 @@ static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd, (mbmi->ref_frame[1] > INTRA_FRAME && has_subpel_mv_component(xd->mi[0], xd, dir + 2))) { const int ctx = av1_get_pred_context_switchable_interp(xd, dir); -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, av1_switchable_interp_ind[mbmi->interp_filter[dir]], ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS); -#else - av1_write_token(w, av1_switchable_interp_tree, - ec_ctx->switchable_interp_prob[ctx], - &switchable_interp_encodings[mbmi->interp_filter[dir]]); -#endif ++cpi->interp_filter_selected[0][mbmi->interp_filter[dir]]; } else { assert(mbmi->interp_filter[dir] == EIGHTTAP_REGULAR); @@ -1426,14 +1341,8 @@ static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd, #else { const int ctx = av1_get_pred_context_switchable_interp(xd); -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, av1_switchable_interp_ind[mbmi->interp_filter], ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS); -#else - av1_write_token(w, av1_switchable_interp_tree, - ec_ctx->switchable_interp_prob[ctx], - &switchable_interp_encodings[mbmi->interp_filter]); -#endif ++cpi->interp_filter_selected[0][mbmi->interp_filter]; } #endif // CONFIG_DUAL_FILTER @@ -1442,48 +1351,91 @@ static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd, #if CONFIG_PALETTE #if CONFIG_PALETTE_DELTA_ENCODING -// Write luma palette color values with delta encoding. Write the first value as -// literal, and the deltas between each value and the previous one. The luma -// palette is sorted so each delta is larger than 0. -static void write_palette_colors_y(const PALETTE_MODE_INFO *const pmi, - int bit_depth, aom_writer *w) { - const int n = pmi->palette_size[0]; - int min_bits, i; - int bits = av1_get_palette_delta_bits_y(pmi, bit_depth, &min_bits); +// Transmit color values with delta encoding. Write the first value as +// literal, and the deltas between each value and the previous one. "min_val" is +// the smallest possible value of the deltas. +static void delta_encode_palette_colors(const int *colors, int num, + int bit_depth, int min_val, + aom_writer *w) { + if (num <= 0) return; + assert(colors[0] < (1 << bit_depth)); + aom_write_literal(w, colors[0], bit_depth); + if (num == 1) return; + int max_delta = 0; + int deltas[PALETTE_MAX_SIZE]; + memset(deltas, 0, sizeof(deltas)); + for (int i = 1; i < num; ++i) { + assert(colors[i] < (1 << bit_depth)); + const int delta = colors[i] - colors[i - 1]; + deltas[i - 1] = delta; + assert(delta >= min_val); + if (delta > max_delta) max_delta = delta; + } + const int min_bits = bit_depth - 3; + int bits = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits); + assert(bits <= bit_depth); + int range = (1 << bit_depth) - colors[0] - min_val; aom_write_literal(w, bits - min_bits, 2); - aom_write_literal(w, pmi->palette_colors[0], bit_depth); - for (i = 1; i < n; ++i) { - aom_write_literal( - w, pmi->palette_colors[i] - pmi->palette_colors[i - 1] - 1, bits); - bits = - AOMMIN(bits, av1_ceil_log2((1 << bit_depth) - pmi->palette_colors[i])); + for (int i = 0; i < num - 1; ++i) { + aom_write_literal(w, deltas[i] - min_val, bits); + range -= deltas[i]; + bits = AOMMIN(bits, av1_ceil_log2(range)); } } -// Write chroma palette color values. Use delta encoding for u channel as its -// palette is sorted. 
For v channel, either use delta encoding or transmit -// raw values directly, whichever costs less. -static void write_palette_colors_uv(const PALETTE_MODE_INFO *const pmi, +// Transmit luma palette color values. First signal if each color in the color +// cache is used. Those colors that are not in the cache are transmitted with +// delta encoding. +static void write_palette_colors_y(const MACROBLOCKD *const xd, + const PALETTE_MODE_INFO *const pmi, + int bit_depth, aom_writer *w) { + const int n = pmi->palette_size[0]; + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(above_mi, left_mi, 0, color_cache); + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = + av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n, + cache_color_found, out_cache_colors); + int n_in_cache = 0; + for (int i = 0; i < n_cache && n_in_cache < n; ++i) { + const int found = cache_color_found[i]; + aom_write_bit(w, found); + n_in_cache += found; + } + assert(n_in_cache + n_out_cache == n); + delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 1, w); +} + +// Write chroma palette color values. U channel is handled similarly to the luma +// channel. For v channel, either use delta encoding or transmit raw values +// directly, whichever costs less. +static void write_palette_colors_uv(const MACROBLOCKD *const xd, + const PALETTE_MODE_INFO *const pmi, int bit_depth, aom_writer *w) { - int i; const int n = pmi->palette_size[1]; -#if CONFIG_HIGHBITDEPTH const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE; const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE; -#else - const uint8_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE; - const uint8_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE; -#endif // CONFIG_HIGHBITDEPTH // U channel colors. - int min_bits_u = 0; - int bits_u = av1_get_palette_delta_bits_u(pmi, bit_depth, &min_bits_u); - aom_write_literal(w, bits_u - min_bits_u, 2); - aom_write_literal(w, colors_u[0], bit_depth); - for (i = 1; i < n; ++i) { - aom_write_literal(w, colors_u[i] - colors_u[i - 1], bits_u); - bits_u = AOMMIN(bits_u, av1_ceil_log2(1 + (1 << bit_depth) - colors_u[i])); - } - // V channel colors. + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(above_mi, left_mi, 1, color_cache); + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = av1_index_color_cache( + color_cache, n_cache, colors_u, n, cache_color_found, out_cache_colors); + int n_in_cache = 0; + for (int i = 0; i < n_cache && n_in_cache < n; ++i) { + const int found = cache_color_found[i]; + aom_write_bit(w, found); + n_in_cache += found; + } + delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 0, w); + + // V channel colors. Don't use color cache as the colors are not sorted. 
const int max_val = 1 << bit_depth; int zero_count = 0, min_bits_v = 0; int bits_v = @@ -1492,10 +1444,12 @@ static void write_palette_colors_uv(const PALETTE_MODE_INFO *const pmi, 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count; const int rate_using_raw = bit_depth * n; if (rate_using_delta < rate_using_raw) { // delta encoding + assert(colors_v[0] < (1 << bit_depth)); aom_write_bit(w, 1); aom_write_literal(w, bits_v - min_bits_v, 2); aom_write_literal(w, colors_v[0], bit_depth); - for (i = 1; i < n; ++i) { + for (int i = 1; i < n; ++i) { + assert(colors_v[i] < (1 << bit_depth)); if (colors_v[i] == colors_v[i - 1]) { // No need to signal sign bit. aom_write_literal(w, 0, bits_v); continue; @@ -1512,7 +1466,10 @@ static void write_palette_colors_uv(const PALETTE_MODE_INFO *const pmi, } } else { // Transmit raw values. aom_write_bit(w, 0); - for (i = 0; i < n; ++i) aom_write_literal(w, colors_v[i], bit_depth); + for (int i = 0; i < n; ++i) { + assert(colors_v[i] < (1 << bit_depth)); + aom_write_literal(w, colors_v[i], bit_depth); + } } } #endif // CONFIG_PALETTE_DELTA_ENCODING @@ -1542,11 +1499,12 @@ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd, av1_default_palette_y_size_prob[bsize - BLOCK_8X8], &palette_size_encodings[n - PALETTE_MIN_SIZE]); #if CONFIG_PALETTE_DELTA_ENCODING - write_palette_colors_y(pmi, cm->bit_depth, w); + write_palette_colors_y(xd, pmi, cm->bit_depth, w); #else - int i; - for (i = 0; i < n; ++i) + for (int i = 0; i < n; ++i) { + assert(pmi->palette_colors[i] < (1 << cm->bit_depth)); aom_write_literal(w, pmi->palette_colors[i], cm->bit_depth); + } #endif // CONFIG_PALETTE_DELTA_ENCODING write_uniform(w, n, pmi->palette_first_color_idx[0]); } @@ -1561,10 +1519,13 @@ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd, av1_default_palette_uv_size_prob[bsize - BLOCK_8X8], &palette_size_encodings[n - PALETTE_MIN_SIZE]); #if CONFIG_PALETTE_DELTA_ENCODING - write_palette_colors_uv(pmi, cm->bit_depth, w); + write_palette_colors_uv(xd, pmi, cm->bit_depth, w); #else - int i; - for (i = 0; i < n; ++i) { + for (int i = 0; i < n; ++i) { + assert(pmi->palette_colors[PALETTE_MAX_SIZE + i] < + (1 << cm->bit_depth)); + assert(pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] < + (1 << cm->bit_depth)); aom_write_literal(w, pmi->palette_colors[PALETTE_MAX_SIZE + i], cm->bit_depth); aom_write_literal(w, pmi->palette_colors[2 * PALETTE_MAX_SIZE + i], @@ -1625,30 +1586,17 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, if (is_inter) { assert(ext_tx_used_inter[eset][tx_type]); if (eset > 0) { -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, av1_ext_tx_inter_ind[eset][tx_type], ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], ext_tx_cnt_inter[eset]); -#else - av1_write_token(w, av1_ext_tx_inter_tree[eset], - ec_ctx->inter_ext_tx_prob[eset][square_tx_size], - &ext_tx_inter_encodings[eset][tx_type]); -#endif } } else if (ALLOW_INTRA_EXT_TX) { assert(ext_tx_used_intra[eset][tx_type]); if (eset > 0) { -#if CONFIG_EC_MULTISYMBOL aom_write_symbol( w, av1_ext_tx_intra_ind[eset][tx_type], ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode], ext_tx_cnt_intra[eset]); -#else - av1_write_token( - w, av1_ext_tx_intra_tree[eset], - ec_ctx->intra_ext_tx_prob[eset][square_tx_size][mbmi->mode], - &ext_tx_intra_encodings[eset][tx_type]); -#endif } } } @@ -1662,28 +1610,14 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, #endif // CONFIG_SUPERTX !segfeature_active(&cm->seg, 
mbmi->segment_id, SEG_LVL_SKIP)) { if (is_inter) { -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, av1_ext_tx_ind[tx_type], ec_ctx->inter_ext_tx_cdf[tx_size], TX_TYPES); -#else - av1_write_token(w, av1_ext_tx_tree, ec_ctx->inter_ext_tx_prob[tx_size], - &ext_tx_encodings[tx_type]); -#endif } else { -#if CONFIG_EC_MULTISYMBOL aom_write_symbol( w, av1_ext_tx_ind[tx_type], ec_ctx->intra_ext_tx_cdf[tx_size] [intra_mode_to_tx_type_context[mbmi->mode]], TX_TYPES); -#else - av1_write_token( - w, av1_ext_tx_tree, - ec_ctx - ->intra_ext_tx_prob[tx_size] - [intra_mode_to_tx_type_context[mbmi->mode]], - &ext_tx_encodings[tx_type]); -#endif } } #endif // CONFIG_EXT_TX @@ -1692,29 +1626,45 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, static void write_intra_mode(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize, PREDICTION_MODE mode, aom_writer *w) { -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, av1_intra_mode_ind[mode], frame_ctx->y_mode_cdf[size_group_lookup[bsize]], INTRA_MODES); -#else - av1_write_token(w, av1_intra_mode_tree, - frame_ctx->y_mode_prob[size_group_lookup[bsize]], - &intra_mode_encodings[mode]); -#endif } static void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx, PREDICTION_MODE uv_mode, PREDICTION_MODE y_mode, aom_writer *w) { -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, av1_intra_mode_ind[uv_mode], frame_ctx->uv_mode_cdf[y_mode], INTRA_MODES); -#else - av1_write_token(w, av1_intra_mode_tree, frame_ctx->uv_mode_prob[y_mode], - &intra_mode_encodings[uv_mode]); -#endif } +#if CONFIG_CFL +static void write_cfl_alphas(FRAME_CONTEXT *const frame_ctx, int skip, int ind, + const CFL_SIGN_TYPE signs[CFL_SIGNS], + aom_writer *w) { + if (skip) { + assert(ind == 0); + assert(signs[CFL_PRED_U] == CFL_SIGN_POS); + assert(signs[CFL_PRED_V] == CFL_SIGN_POS); + } else { + // Check for uninitialized signs + if (cfl_alpha_codes[ind][CFL_PRED_U] == 0) + assert(signs[CFL_PRED_U] == CFL_SIGN_POS); + if (cfl_alpha_codes[ind][CFL_PRED_V] == 0) + assert(signs[CFL_PRED_V] == CFL_SIGN_POS); + + // Write a symbol representing a combination of alpha Cb and alpha Cr. + aom_write_symbol(w, ind, frame_ctx->cfl_alpha_cdf, CFL_ALPHABET_SIZE); + + // Signs are only signaled for nonzero codes. 
+ if (cfl_alpha_codes[ind][CFL_PRED_U] != 0) + aom_write_bit(w, signs[CFL_PRED_U]); + if (cfl_alpha_codes[ind][CFL_PRED_V] != 0) + aom_write_bit(w, signs[CFL_PRED_V]); + } +} +#endif + static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, const int mi_col, #if CONFIG_SUPERTX @@ -1734,9 +1684,6 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, #else FRAME_CONTEXT *ec_ctx = cm->fc; #endif -#if !CONFIG_REF_MV - nmv_context *nmvc = &ec_ctx->nmvc; -#endif const MODE_INFO *mi = xd->mi[0]; const struct segmentation *const seg = &cm->seg; @@ -1859,12 +1806,23 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, } #if CONFIG_CB4X4 if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y)) + xd->plane[1].subsampling_y)) { write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mode, w); #else // !CONFIG_CB4X4 write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mode, w); #endif // CONFIG_CB4X4 +#if CONFIG_CFL + if (mbmi->uv_mode == DC_PRED) { + write_cfl_alphas(ec_ctx, mbmi->skip, mbmi->cfl_alpha_idx, + mbmi->cfl_alpha_signs, w); + } +#endif + +#if CONFIG_CB4X4 + } +#endif + #if CONFIG_EXT_INTRA write_intra_angle_info(xd, ec_ctx, w); #endif // CONFIG_EXT_INTRA @@ -1874,13 +1832,12 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, #endif // CONFIG_PALETTE #if CONFIG_FILTER_INTRA if (bsize >= BLOCK_8X8 || unify_bsize) - write_filter_intra_mode_info(cm, mbmi, w); + write_filter_intra_mode_info(cm, xd, mbmi, mi_row, mi_col, w); #endif // CONFIG_FILTER_INTRA } else { int16_t mode_ctx; write_ref_frames(cm, xd, w); -#if CONFIG_REF_MV #if CONFIG_EXT_INTER if (is_compound) mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; @@ -1888,9 +1845,6 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, #endif // CONFIG_EXT_INTER mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame, bsize, -1); -#else // CONFIG_REF_MV - mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]]; -#endif // CONFIG_REF_MV // If segment skip is not enabled code the mode. 
if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { @@ -1902,7 +1856,6 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, #endif // CONFIG_EXT_INTER write_inter_mode(w, mode, ec_ctx, mode_ctx); -#if CONFIG_REF_MV #if CONFIG_EXT_INTER if (mode == NEWMV || mode == NEW_NEWMV || have_nearmv_in_inter_mode(mode)) @@ -1912,7 +1865,6 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, write_drl_idx(cm, mbmi, mbmi_ext, w); else assert(mbmi->ref_mv_idx == 0); -#endif } } @@ -1928,13 +1880,11 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, for (idx = 0; idx < 2; idx += num_4x4_w) { const int j = idy * 2 + idx; const PREDICTION_MODE b_mode = mi->bmi[j].as_mode; -#if CONFIG_REF_MV #if CONFIG_EXT_INTER if (!is_compound) #endif // CONFIG_EXT_INTER mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame, bsize, j); -#endif #if CONFIG_EXT_INTER if (is_inter_compound_mode(b_mode)) write_inter_compound_mode(cm, w, b_mode, mode_ctx); @@ -1948,45 +1898,35 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, if (b_mode == NEWMV) { #endif // CONFIG_EXT_INTER for (ref = 0; ref < 1 + is_compound; ++ref) { -#if CONFIG_REF_MV int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], mbmi_ext->ref_mv_stack[rf_type], ref, mbmi->ref_mv_idx); nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; -#endif av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv, #if CONFIG_EXT_INTER &mi->bmi[j].ref_mv[ref].as_mv, #else -#if CONFIG_REF_MV &mi->bmi[j].pred_mv[ref].as_mv, -#else - &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0].as_mv, -#endif // CONFIG_REF_MV #endif // CONFIG_EXT_INTER nmvc, allow_hp); } } #if CONFIG_EXT_INTER else if (b_mode == NEAREST_NEWMV || b_mode == NEAR_NEWMV) { -#if CONFIG_REF_MV int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx); nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; -#endif av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[1].as_mv, &mi->bmi[j].ref_mv[1].as_mv, nmvc, allow_hp); } else if (b_mode == NEW_NEARESTMV || b_mode == NEW_NEARMV) { -#if CONFIG_REF_MV int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; -#endif av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[0].as_mv, &mi->bmi[j].ref_mv[0].as_mv, nmvc, allow_hp); } @@ -2001,37 +1941,31 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, #endif // CONFIG_EXT_INTER int_mv ref_mv; for (ref = 0; ref < 1 + is_compound; ++ref) { -#if CONFIG_REF_MV int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], mbmi_ext->ref_mv_stack[rf_type], ref, mbmi->ref_mv_idx); nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; -#endif ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0]; av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc, allow_hp); } #if CONFIG_EXT_INTER } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { -#if CONFIG_REF_MV int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx); nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; -#endif av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv, nmvc, allow_hp); } else if (mode == NEW_NEARESTMV || 
mode == NEW_NEARMV) { -#if CONFIG_REF_MV int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; -#endif av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv, nmvc, allow_hp); @@ -2039,12 +1973,12 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, } } -#if CONFIG_EXT_INTER +#if CONFIG_EXT_INTER && CONFIG_INTERINTRA if (cpi->common.reference_mode != COMPOUND_REFERENCE && #if CONFIG_SUPERTX !supertx_enabled && #endif // CONFIG_SUPERTX - is_interintra_allowed(mbmi)) { + cpi->common.allow_interintra_compound && is_interintra_allowed(mbmi)) { const int interintra = mbmi->ref_frame[1] == INTRA_FRAME; const int bsize_group = size_group_lookup[bsize]; aom_write(w, interintra, cm->fc->interintra_prob[bsize_group]); @@ -2062,7 +1996,7 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, } } } -#endif // CONFIG_EXT_INTER +#endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION #if CONFIG_SUPERTX @@ -2082,21 +2016,23 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, #endif // CONFIG_MOTION_VAR && is_any_masked_compound_used(bsize)) { #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE - av1_write_token(w, av1_compound_type_tree, - cm->fc->compound_type_prob[bsize], - &compound_type_encodings[mbmi->interinter_compound_type]); -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE + if (cm->allow_masked_compound) { + av1_write_token( + w, av1_compound_type_tree, cm->fc->compound_type_prob[bsize], + &compound_type_encodings[mbmi->interinter_compound_type]); #if CONFIG_WEDGE - if (mbmi->interinter_compound_type == COMPOUND_WEDGE) { - aom_write_literal(w, mbmi->wedge_index, get_wedge_bits_lookup(bsize)); - aom_write_bit(w, mbmi->wedge_sign); - } + if (mbmi->interinter_compound_type == COMPOUND_WEDGE) { + aom_write_literal(w, mbmi->wedge_index, get_wedge_bits_lookup(bsize)); + aom_write_bit(w, mbmi->wedge_sign); + } #endif // CONFIG_WEDGE #if CONFIG_COMPOUND_SEGMENT - if (mbmi->interinter_compound_type == COMPOUND_SEG) { - aom_write_literal(w, mbmi->mask_type, MAX_SEG_MASK_BITS); - } + if (mbmi->interinter_compound_type == COMPOUND_SEG) { + aom_write_literal(w, mbmi->mask_type, MAX_SEG_MASK_BITS); + } #endif // CONFIG_COMPOUND_SEGMENT + } +#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE } #endif // CONFIG_EXT_INTER @@ -2114,15 +2050,17 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, #endif // !CONFIG_TXK_SEL } +static void write_mb_modes_kf(AV1_COMMON *cm, #if CONFIG_DELTA_Q -static void write_mb_modes_kf(AV1_COMMON *cm, MACROBLOCKD *xd, const int mi_row, - const int mi_col, aom_writer *w) { - int skip; + MACROBLOCKD *xd, #else -static void write_mb_modes_kf(AV1_COMMON *cm, const MACROBLOCKD *xd, + const MACROBLOCKD *xd, +#endif // CONFIG_DELTA_Q +#if CONFIG_INTRABC + const MB_MODE_INFO_EXT *mbmi_ext, +#endif // CONFIG_INTRABC const int mi_row, const int mi_col, aom_writer *w) { -#endif const struct segmentation *const seg = &cm->seg; struct segmentation_probs *const segp = &cm->fc->seg; const MODE_INFO *const mi = xd->mi[0]; @@ -2147,7 +2085,7 @@ static void write_mb_modes_kf(AV1_COMMON *cm, const MACROBLOCKD *xd, if (seg->update_map) write_segment_id(w, seg, segp, mbmi->segment_id); #if CONFIG_DELTA_Q - skip = write_skip(cm, xd, mbmi->segment_id, mi, w); + const int skip = write_skip(cm, xd, mbmi->segment_id, mi, w); if 
(cm->delta_q_present_flag) { int super_block_upper_left = ((mi_row & MAX_MIB_MASK) == 0) && ((mi_col & MAX_MIB_MASK) == 0); @@ -2188,13 +2126,19 @@ static void write_mb_modes_kf(AV1_COMMON *cm, const MACROBLOCKD *xd, #if CONFIG_INTRABC if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools) { int use_intrabc = is_intrabc_block(mbmi); - aom_write(w, use_intrabc, INTRABC_PROB); + aom_write(w, use_intrabc, ec_ctx->intrabc_prob); if (use_intrabc) { assert(mbmi->mode == DC_PRED); assert(mbmi->uv_mode == DC_PRED); - int_mv dv_ref; - av1_find_ref_dv(&dv_ref, mi_row, mi_col); + int_mv dv_ref = mbmi_ext->ref_mvs[INTRA_FRAME][0]; av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc); +#if CONFIG_EXT_TX && !CONFIG_TXK_SEL + av1_write_tx_type(cm, xd, +#if CONFIG_SUPERTX + 0, +#endif + w); +#endif // CONFIG_EXT_TX && !CONFIG_TXK_SEL return; } } @@ -2218,12 +2162,22 @@ static void write_mb_modes_kf(AV1_COMMON *cm, const MACROBLOCKD *xd, #if CONFIG_CB4X4 if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, - xd->plane[1].subsampling_y)) + xd->plane[1].subsampling_y)) { write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mbmi->mode, w); #else // !CONFIG_CB4X4 write_intra_uv_mode(ec_ctx, mbmi->uv_mode, mbmi->mode, w); #endif // CONFIG_CB4X4 +#if CONFIG_CFL + if (mbmi->uv_mode == DC_PRED) { + write_cfl_alphas(ec_ctx, mbmi->skip, mbmi->cfl_alpha_idx, + mbmi->cfl_alpha_signs, w); + } +#endif + +#if CONFIG_CB4X4 + } +#endif #if CONFIG_EXT_INTRA write_intra_angle_info(xd, ec_ctx, w); #endif // CONFIG_EXT_INTRA @@ -2233,7 +2187,7 @@ static void write_mb_modes_kf(AV1_COMMON *cm, const MACROBLOCKD *xd, #endif // CONFIG_PALETTE #if CONFIG_FILTER_INTRA if (bsize >= BLOCK_8X8 || unify_bsize) - write_filter_intra_mode_info(cm, mbmi, w); + write_filter_intra_mode_info(cm, xd, mbmi, mi_row, mi_col, w); #endif // CONFIG_FILTER_INTRA #if !CONFIG_TXK_SEL @@ -2325,12 +2279,17 @@ static void write_mbmi_b(AV1_COMP *cpi, const TileInfo *const tile, cm->mi_rows, cm->mi_cols); if (frame_is_intra_only(cm)) { - write_mb_modes_kf(cm, xd, mi_row, mi_col, w); + write_mb_modes_kf(cm, xd, +#if CONFIG_INTRABC + cpi->td.mb.mbmi_ext, +#endif // CONFIG_INTRABC + mi_row, mi_col, w); } else { #if CONFIG_VAR_TX - xd->above_txfm_context = cm->above_txfm_context + mi_col; - xd->left_txfm_context = - xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + xd->above_txfm_context = + cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); + xd->left_txfm_context = xd->left_txfm_context_buffer + + ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); #endif #if CONFIG_DUAL_FILTER // has_subpel_mv_component needs the ref frame buffers set up to look @@ -2539,8 +2498,12 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, const int bkh = tx_size_high_unit[max_tx_size]; for (row = 0; row < num_4x4_h; row += bkh) { for (col = 0; col < num_4x4_w; col += bkw) { - pack_txb_tokens(w, tok, tok_end, -#if CONFIG_PVQ + pack_txb_tokens(w, +#if CONFIG_LV_MAP + cm, +#endif + tok, tok_end, +#if CONFIG_PVQ || CONFIG_LV_MAP x, #endif xd, mbmi, plane, plane_bsize, cm->bit_depth, block, @@ -2556,10 +2519,10 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, } #endif // CONFIG_RD_DEBUG } else { +#if CONFIG_LV_MAP + av1_write_coeffs_mb(cm, x, w, plane); +#else TX_SIZE tx = get_tx_size(plane, xd); -#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 - tx = AOMMAX(TX_4X4, tx); -#endif const int bkw = tx_size_wide_unit[tx]; const int bkh = tx_size_high_unit[tx]; for (row = 0; row < num_4x4_h; row += bkh) { @@ -2571,6 
+2534,7 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, #endif } } +#endif // CONFIG_LV_MAP } #else TX_SIZE tx = get_tx_size(plane, xd); @@ -2727,7 +2691,7 @@ static void write_partition(const AV1_COMMON *const cm, #if CONFIG_EC_ADAPT FRAME_CONTEXT *ec_ctx = xd->tile_ctx; (void)cm; -#elif CONFIG_EC_MULTISYMBOL +#else FRAME_CONTEXT *ec_ctx = cm->fc; #endif @@ -2736,24 +2700,11 @@ static void write_partition(const AV1_COMMON *const cm, if (has_rows && has_cols) { #if CONFIG_EXT_PARTITION_TYPES if (bsize <= BLOCK_8X8) -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], PARTITION_TYPES); -#else - av1_write_token(w, av1_partition_tree, probs, &partition_encodings[p]); -#endif else -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], EXT_PARTITION_TYPES); #else - av1_write_token(w, av1_ext_partition_tree, probs, - &ext_partition_encodings[p]); -#endif // CONFIG_EC_MULTISYMBOL -#else -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], PARTITION_TYPES); -#else - av1_write_token(w, av1_partition_tree, probs, &partition_encodings[p]); -#endif #endif // CONFIG_EXT_PARTITION_TYPES } else if (!has_rows && has_cols) { assert(p == PARTITION_SPLIT || p == PARTITION_HORZ); @@ -2920,7 +2871,6 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, const int eset = get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used); if (eset > 0) { -#if CONFIG_EC_MULTISYMBOL #if CONFIG_EC_ADAPT FRAME_CONTEXT *ec_ctx = xd->tile_ctx; #else @@ -2929,11 +2879,6 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, aom_write_symbol(w, av1_ext_tx_inter_ind[eset][mbmi->tx_type], ec_ctx->inter_ext_tx_cdf[eset][supertx_size], ext_tx_cnt_inter[eset]); -#else - av1_write_token(w, av1_ext_tx_inter_tree[eset], - cm->fc->inter_ext_tx_prob[eset][supertx_size], - &ext_tx_inter_encodings[eset][mbmi->tx_type]); -#endif } } #else @@ -2989,21 +2934,11 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, #endif // CONFIG_EXT_PARTITION_TYPES #if CONFIG_CDEF -#if CONFIG_EXT_PARTITION - if (cm->sb_size == BLOCK_128X128 && bsize == BLOCK_128X128 && - !sb_all_skip(cm, mi_row, mi_col)) { + if (bsize == cm->sb_size && !sb_all_skip(cm, mi_row, mi_col) && + cm->cdef_bits != 0) { aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col] ->mbmi.cdef_strength, cm->cdef_bits); - } else if (cm->sb_size == BLOCK_64X64 && bsize == BLOCK_64X64 && -#else - if (bsize == BLOCK_64X64 && -#endif // CONFIG_EXT_PARTITION - !sb_all_skip(cm, mi_row, mi_col)) { - if (cm->cdef_bits != 0) - aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col] - ->mbmi.cdef_strength, - cm->cdef_bits); } #endif } @@ -3066,7 +3001,7 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile, } #if !CONFIG_LV_MAP -#if !CONFIG_PVQ && !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET) +#if !CONFIG_PVQ && !CONFIG_EC_ADAPT static void build_tree_distribution(AV1_COMP *cpi, TX_SIZE tx_size, av1_coeff_stats *coef_branch_ct, av1_coeff_probs_model *coef_probs) { @@ -3097,7 +3032,7 @@ static void build_tree_distribution(AV1_COMP *cpi, TX_SIZE tx_size, } } -#if !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET) +#if !CONFIG_EC_ADAPT static void update_coef_probs_common(aom_writer *const bc, AV1_COMP *cpi, TX_SIZE tx_size, av1_coeff_stats *frame_branch_ct, @@ -3249,235 +3184,12 @@ static void update_coef_probs_common(aom_writer *const bc, AV1_COMP *cpi, } } #endif -#if 
CONFIG_SUBFRAME_PROB_UPDATE -// Calculate the token counts between subsequent subframe updates. -static void get_coef_counts_diff( - AV1_COMP *cpi, int index, - av1_coeff_count coef_counts[TX_SIZES][PLANE_TYPES], - unsigned int eob_counts[TX_SIZES][PLANE_TYPES][REF_TYPES][COEF_BANDS] - [COEFF_CONTEXTS]) { - int i, j, k, l, m, tx_size, val; - const int max_idx = cpi->common.coef_probs_update_idx; - const TX_MODE tx_mode = cpi->common.tx_mode; - const int max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; - const SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats; - - assert(max_idx < COEF_PROBS_BUFS); - - for (tx_size = 0; tx_size <= max_tx_size; ++tx_size) - for (i = 0; i < PLANE_TYPES; ++i) - for (j = 0; j < REF_TYPES; ++j) - for (k = 0; k < COEF_BANDS; ++k) - for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { - if (index == max_idx) { - val = - cpi->common.counts.eob_branch[tx_size][i][j][k][l] - - subframe_stats->eob_counts_buf[max_idx][tx_size][i][j][k][l]; - } else { - val = subframe_stats - ->eob_counts_buf[index + 1][tx_size][i][j][k][l] - - subframe_stats->eob_counts_buf[index][tx_size][i][j][k][l]; - } - assert(val >= 0); - eob_counts[tx_size][i][j][k][l] = val; - - for (m = 0; m < ENTROPY_TOKENS; ++m) { - if (index == max_idx) { - val = cpi->td.rd_counts.coef_counts[tx_size][i][j][k][l][m] - - subframe_stats - ->coef_counts_buf[max_idx][tx_size][i][j][k][l][m]; - } else { - val = subframe_stats - ->coef_counts_buf[index + 1][tx_size][i][j][k][l][m] - - subframe_stats - ->coef_counts_buf[index][tx_size][i][j][k][l][m]; - } - assert(val >= 0); - coef_counts[tx_size][i][j][k][l][m] = val; - } - } -} - -static void update_coef_probs_subframe( - aom_writer *const bc, AV1_COMP *cpi, TX_SIZE tx_size, - av1_coeff_stats branch_ct[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES], - av1_coeff_probs_model *new_coef_probs) { - av1_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size]; - const aom_prob upd = DIFF_UPDATE_PROB; - const int entropy_nodes_update = UNCONSTRAINED_NODES; - int i, j, k, l, t; - int stepsize = cpi->sf.coeff_prob_appx_step; - const int max_idx = cpi->common.coef_probs_update_idx; - int idx; - unsigned int this_branch_ct[ENTROPY_NODES][COEF_PROBS_BUFS][2]; - - switch (cpi->sf.use_fast_coef_updates) { - case TWO_LOOP: { - /* dry run to see if there is any update at all needed */ - int savings = 0; - int update[2] = { 0, 0 }; - for (i = 0; i < PLANE_TYPES; ++i) { - for (j = 0; j < REF_TYPES; ++j) { - for (k = 0; k < COEF_BANDS; ++k) { - for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { - for (t = 0; t < ENTROPY_NODES; ++t) { - for (idx = 0; idx <= max_idx; ++idx) { - memcpy(this_branch_ct[t][idx], - branch_ct[idx][tx_size][i][j][k][l][t], - 2 * sizeof(this_branch_ct[t][idx][0])); - } - } - for (t = 0; t < entropy_nodes_update; ++t) { - aom_prob newp = new_coef_probs[i][j][k][l][t]; - const aom_prob oldp = old_coef_probs[i][j][k][l][t]; - int s, u = 0; - - if (t == PIVOT_NODE) - s = av1_prob_update_search_model_subframe( - this_branch_ct, old_coef_probs[i][j][k][l], &newp, upd, - stepsize, max_idx); - else - s = av1_prob_update_search_subframe(this_branch_ct[t], oldp, - &newp, upd, max_idx); - if (s > 0 && newp != oldp) u = 1; - if (u) - savings += s - (int)(av1_cost_zero(upd)); - else - savings -= (int)(av1_cost_zero(upd)); - update[u]++; - } - } - } - } - } - - /* Is coef updated at all */ - if (update[1] == 0 || savings < 0) { - aom_write_bit(bc, 0); - return; - } - aom_write_bit(bc, 1); - for (i = 0; i < PLANE_TYPES; ++i) { - for (j = 0; j < REF_TYPES; ++j) { - for 
(k = 0; k < COEF_BANDS; ++k) { - for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { - for (t = 0; t < ENTROPY_NODES; ++t) { - for (idx = 0; idx <= max_idx; ++idx) { - memcpy(this_branch_ct[t][idx], - branch_ct[idx][tx_size][i][j][k][l][t], - 2 * sizeof(this_branch_ct[t][idx][0])); - } - } - for (t = 0; t < entropy_nodes_update; ++t) { - aom_prob newp = new_coef_probs[i][j][k][l][t]; - aom_prob *oldp = old_coef_probs[i][j][k][l] + t; - int s; - int u = 0; - - if (t == PIVOT_NODE) - s = av1_prob_update_search_model_subframe( - this_branch_ct, old_coef_probs[i][j][k][l], &newp, upd, - stepsize, max_idx); - else - s = av1_prob_update_search_subframe(this_branch_ct[t], *oldp, - &newp, upd, max_idx); - if (s > 0 && newp != *oldp) u = 1; - aom_write(bc, u, upd); - if (u) { - /* send/use new probability */ - av1_write_prob_diff_update(bc, newp, *oldp); - *oldp = newp; - } - } - } - } - } - } - return; - } - - case ONE_LOOP_REDUCED: { - int updates = 0; - int noupdates_before_first = 0; - for (i = 0; i < PLANE_TYPES; ++i) { - for (j = 0; j < REF_TYPES; ++j) { - for (k = 0; k < COEF_BANDS; ++k) { - for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { - for (t = 0; t < ENTROPY_NODES; ++t) { - for (idx = 0; idx <= max_idx; ++idx) { - memcpy(this_branch_ct[t][idx], - branch_ct[idx][tx_size][i][j][k][l][t], - 2 * sizeof(this_branch_ct[t][idx][0])); - } - } - for (t = 0; t < entropy_nodes_update; ++t) { - aom_prob newp = new_coef_probs[i][j][k][l][t]; - aom_prob *oldp = old_coef_probs[i][j][k][l] + t; - int s; - int u = 0; - if (t == PIVOT_NODE) - s = av1_prob_update_search_model_subframe( - this_branch_ct, old_coef_probs[i][j][k][l], &newp, upd, - stepsize, max_idx); - else - s = av1_prob_update_search_subframe(this_branch_ct[t], *oldp, - &newp, upd, max_idx); - if (s > 0 && newp != *oldp) u = 1; - updates += u; - if (u == 0 && updates == 0) { - noupdates_before_first++; - continue; - } - if (u == 1 && updates == 1) { - int v; - // first update - aom_write_bit(bc, 1); - for (v = 0; v < noupdates_before_first; ++v) - aom_write(bc, 0, upd); - } - aom_write(bc, u, upd); - if (u) { - /* send/use new probability */ - av1_write_prob_diff_update(bc, newp, *oldp); - *oldp = newp; - } - } - } - } - } - } - if (updates == 0) { - aom_write_bit(bc, 0); // no updates - } - return; - } - default: assert(0); - } -} -#endif // CONFIG_SUBFRAME_PROB_UPDATE - -#if !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET) +#if !CONFIG_EC_ADAPT static void update_coef_probs(AV1_COMP *cpi, aom_writer *w) { const TX_MODE tx_mode = cpi->common.tx_mode; const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; TX_SIZE tx_size; -#if CONFIG_SUBFRAME_PROB_UPDATE - AV1_COMMON *cm = &cpi->common; - SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats; - int i; - av1_coeff_probs_model dummy_frame_coef_probs[PLANE_TYPES]; - - if (cm->do_subframe_update && - cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { - av1_copy(cpi->common.fc->coef_probs, - subframe_stats->enc_starting_coef_probs); - for (i = 0; i <= cpi->common.coef_probs_update_idx; ++i) { - get_coef_counts_diff(cpi, i, cpi->wholeframe_stats.coef_counts_buf[i], - cpi->wholeframe_stats.eob_counts_buf[i]); - } - } -#endif // CONFIG_SUBFRAME_PROB_UPDATE for (tx_size = 0; tx_size <= max_tx_size; ++tx_size) { av1_coeff_stats frame_branch_ct[PLANE_TYPES]; @@ -3486,63 +3198,13 @@ static void update_coef_probs(AV1_COMP *cpi, aom_writer *w) { (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) { aom_write_bit(w, 0); } else { -#if CONFIG_SUBFRAME_PROB_UPDATE - if 
(cm->do_subframe_update && - cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { - unsigned int this_eob_counts_copy[PLANE_TYPES][REF_TYPES][COEF_BANDS] - [COEFF_CONTEXTS]; - av1_coeff_count coef_counts_copy[PLANE_TYPES]; - av1_copy(this_eob_counts_copy, cpi->common.counts.eob_branch[tx_size]); - av1_copy(coef_counts_copy, cpi->td.rd_counts.coef_counts[tx_size]); - build_tree_distribution(cpi, tx_size, frame_branch_ct, - frame_coef_probs); - for (i = 0; i <= cpi->common.coef_probs_update_idx; ++i) { - av1_copy(cpi->common.counts.eob_branch[tx_size], - cpi->wholeframe_stats.eob_counts_buf[i][tx_size]); - av1_copy(cpi->td.rd_counts.coef_counts[tx_size], - cpi->wholeframe_stats.coef_counts_buf[i][tx_size]); - build_tree_distribution(cpi, tx_size, cpi->branch_ct_buf[i][tx_size], - dummy_frame_coef_probs); - } - av1_copy(cpi->common.counts.eob_branch[tx_size], this_eob_counts_copy); - av1_copy(cpi->td.rd_counts.coef_counts[tx_size], coef_counts_copy); - - update_coef_probs_subframe(w, cpi, tx_size, cpi->branch_ct_buf, - frame_coef_probs); - } else { -#endif // CONFIG_SUBFRAME_PROB_UPDATE - build_tree_distribution(cpi, tx_size, frame_branch_ct, - frame_coef_probs); - update_coef_probs_common(w, cpi, tx_size, frame_branch_ct, - frame_coef_probs); -#if CONFIG_SUBFRAME_PROB_UPDATE - } -#endif // CONFIG_SUBFRAME_PROB_UPDATE - } - } - -#if CONFIG_SUBFRAME_PROB_UPDATE - av1_copy(cm->starting_coef_probs, cm->fc->coef_probs); - av1_copy(subframe_stats->coef_probs_buf[0], cm->fc->coef_probs); - if (cm->do_subframe_update && - cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { - unsigned int eob_counts_copy[TX_SIZES][PLANE_TYPES][REF_TYPES][COEF_BANDS] - [COEFF_CONTEXTS]; - av1_copy(eob_counts_copy, cm->counts.eob_branch); - for (i = 1; i <= cpi->common.coef_probs_update_idx; ++i) { - for (tx_size = 0; tx_size <= max_tx_size; ++tx_size) - av1_full_to_model_counts(cm->counts.coef[tx_size], - subframe_stats->coef_counts_buf[i][tx_size]); - av1_copy(cm->counts.eob_branch, subframe_stats->eob_counts_buf[i]); - av1_partial_adapt_probs(cm, 0, 0); - av1_copy(subframe_stats->coef_probs_buf[i], cm->fc->coef_probs); + build_tree_distribution(cpi, tx_size, frame_branch_ct, frame_coef_probs); + update_coef_probs_common(w, cpi, tx_size, frame_branch_ct, + frame_coef_probs); } - av1_copy(cm->fc->coef_probs, subframe_stats->coef_probs_buf[0]); - av1_copy(cm->counts.eob_branch, eob_counts_copy); } -#endif // CONFIG_SUBFRAME_PROB_UPDATE } -#endif // !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET) +#endif // !CONFIG_EC_ADAPT #endif // !CONFIG_EC_ADAPT #endif // !CONFIG_LV_MAP @@ -3574,7 +3236,14 @@ static void encode_restoration_mode(AV1_COMMON *cm, rsi = &cm->rst_info[p]; switch (rsi->frame_restoration_type) { case RESTORE_NONE: aom_wb_write_bit(wb, 0); break; - case RESTORE_WIENER: aom_wb_write_bit(wb, 1); break; + case RESTORE_WIENER: + aom_wb_write_bit(wb, 1); + aom_wb_write_bit(wb, 0); + break; + case RESTORE_SGRPROJ: + aom_wb_write_bit(wb, 1); + aom_wb_write_bit(wb, 1); + break; default: assert(0); } } @@ -3687,6 +3356,7 @@ static void encode_restoration(AV1_COMMON *cm, aom_writer *wb) { } for (p = 1; p < MAX_MB_PLANE; ++p) { set_default_wiener(&ref_wiener_info); + set_default_sgrproj(&ref_sgrproj_info); rsi = &cm->rst_info[p]; if (rsi->frame_restoration_type == RESTORE_WIENER) { for (i = 0; i < ntiles_uv; ++i) { @@ -3697,6 +3367,15 @@ static void encode_restoration(AV1_COMMON *cm, aom_writer *wb) { write_wiener_filter(&rsi->wiener_info[i], &ref_wiener_info, wb); } } + } else if 
(rsi->frame_restoration_type == RESTORE_SGRPROJ) { + for (i = 0; i < ntiles_uv; ++i) { + if (ntiles_uv > 1) + aom_write(wb, rsi->restoration_type[i] != RESTORE_NONE, + RESTORE_NONE_SGRPROJ_PROB); + if (rsi->restoration_type[i] != RESTORE_NONE) { + write_sgrproj_filter(&rsi->sgrproj_info[i], &ref_sgrproj_info, wb); + } + } } else if (rsi->frame_restoration_type != RESTORE_NONE) { assert(0); } @@ -3972,6 +3651,9 @@ static void write_tile_info(const AV1_COMMON *const cm, aom_wb_write_literal(wb, tile_width - 1, 6); aom_wb_write_literal(wb, tile_height - 1, 6); } +#if CONFIG_DEPENDENT_HORZTILES + if (tile_height > 1) aom_wb_write_bit(wb, cm->dependent_horz_tiles); +#endif #else int min_log2_tile_cols, max_log2_tile_cols, ones; av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); @@ -3985,11 +3667,10 @@ static void write_tile_info(const AV1_COMMON *const cm, // rows aom_wb_write_bit(wb, cm->log2_tile_rows != 0); if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->log2_tile_rows != 1); -#endif // CONFIG_EXT_TILE - #if CONFIG_DEPENDENT_HORZTILES if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->dependent_horz_tiles); #endif +#endif // CONFIG_EXT_TILE #if CONFIG_LOOPFILTERING_ACROSS_TILES aom_wb_write_bit(wb, cm->loop_filter_across_tiles_enabled); @@ -4442,9 +4123,6 @@ static void write_render_size(const AV1_COMMON *cm, #if CONFIG_FRAME_SUPERRES static void write_superres_scale(const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { - // This scaling and frame superres are probably incompatible - assert(cm->width == cm->render_width && cm->height == cm->render_height); - // First bit is whether to to scale or not if (cm->superres_scale_numerator == SUPERRES_SCALE_DENOMINATOR) { aom_wb_write_bit(wb, 0); // no scaling @@ -4460,23 +4138,9 @@ static void write_superres_scale(const AV1_COMMON *const cm, static void write_frame_size(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { -#if CONFIG_FRAME_SUPERRES - // If SUPERRES scaling is happening, write the full resolution instead of the - // downscaled resolution. The decoder will reduce this resolution itself. - if (cm->superres_scale_numerator != SUPERRES_SCALE_DENOMINATOR) { - aom_wb_write_literal(wb, cm->superres_width - 1, 16); - aom_wb_write_literal(wb, cm->superres_height - 1, 16); - } else { -#endif // CONFIG_FRAME_SUPERRES - aom_wb_write_literal(wb, cm->width - 1, 16); - aom_wb_write_literal(wb, cm->height - 1, 16); -#if CONFIG_FRAME_SUPERRES - } -#endif // CONFIG_FRAME_SUPERRES + aom_wb_write_literal(wb, cm->width - 1, 16); + aom_wb_write_literal(wb, cm->height - 1, 16); - // TODO(afergs): Also write something different to render_size? - // When superres scales, they'll be almost guaranteed to be - // different on the other side. 
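With the superres special case removed, the coded size, the render size and the superres scale become three independent signals written back to back. The sketch below only restates the write order using calls visible in these hunks; the comment on the scaled branch of write_superres_scale is an assumption, since that branch falls outside the hunk shown.

  // Coded frame size is written directly; no superres-width substitution.
  aom_wb_write_literal(wb, cm->width - 1, 16);
  aom_wb_write_literal(wb, cm->height - 1, 16);
  // Render size may now legitimately differ from the coded size.
  write_render_size(cm, wb);
#if CONFIG_FRAME_SUPERRES
  // One bit signals "no scaling"; the scaled case (assumed, outside this
  // hunk) conveys the scale so the decoder can upsample after decoding.
  write_superres_scale(cm, wb);
#endif
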
write_render_size(cm, wb); #if CONFIG_FRAME_SUPERRES write_superres_scale(cm, wb); @@ -4559,6 +4223,28 @@ void write_sequence_header(SequenceHeader *seq_params) { } #endif +#if CONFIG_EXT_INTER +static void write_compound_tools(const AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + (void)cm; + (void)wb; +#if CONFIG_INTERINTRA + if (!frame_is_intra_only(cm) && cm->reference_mode != COMPOUND_REFERENCE) { + aom_wb_write_bit(wb, cm->allow_interintra_compound); + } else { + assert(cm->allow_interintra_compound == 0); + } +#endif // CONFIG_INTERINTRA +#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT + if (!frame_is_intra_only(cm) && cm->reference_mode != SINGLE_REFERENCE) { + aom_wb_write_bit(wb, cm->allow_masked_compound); + } else { + assert(cm->allow_masked_compound == 0); + } +#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT +} +#endif // CONFIG_EXT_INTER + static void write_uncompressed_header(AV1_COMP *cpi, struct aom_write_bit_buffer *wb) { AV1_COMMON *const cm = &cpi->common; @@ -4637,14 +4323,14 @@ static void write_uncompressed_header(AV1_COMP *cpi, assert(cpi->common.ans_window_size_log2 < 24); aom_wb_write_literal(wb, cpi->common.ans_window_size_log2 - 8, 4); #endif // CONFIG_ANS && ANS_MAX_SYMBOLS -#if CONFIG_PALETTE +#if CONFIG_PALETTE || CONFIG_INTRABC aom_wb_write_bit(wb, cm->allow_screen_content_tools); -#endif // CONFIG_PALETTE +#endif // CONFIG_PALETTE || CONFIG_INTRABC } else { if (!cm->show_frame) aom_wb_write_bit(wb, cm->intra_only); -#if CONFIG_PALETTE +#if CONFIG_PALETTE || CONFIG_INTRABC if (cm->intra_only) aom_wb_write_bit(wb, cm->allow_screen_content_tools); -#endif // CONFIG_PALETTE +#endif // CONFIG_PALETTE || CONFIG_INTRABC if (!cm->error_resilient_mode) { if (cm->intra_only) { aom_wb_write_bit(wb, @@ -4813,6 +4499,9 @@ static void write_uncompressed_header(AV1_COMP *cpi, if (!use_hybrid_pred) aom_wb_write_bit(wb, use_compound_pred); #endif // !CONFIG_REF_ADAPT } +#if CONFIG_EXT_INTER + write_compound_tools(cm, wb); +#endif // CONFIG_EXT_INTER #if CONFIG_EXT_TX aom_wb_write_bit(wb, cm->reduced_tx_set_used); @@ -4896,14 +4585,6 @@ static void write_global_motion(AV1_COMP *cpi, aom_writer *w) { AV1_COMMON *const cm = &cpi->common; int frame; for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { -#if !CONFIG_REF_MV - // With ref-mv, clearing unused global motion models here is - // unsafe, and we need to rely on the recode loop to do it - // instead. See av1_find_mv_refs for details. 
- if (!cpi->td.rd_counts.global_motion_used[frame]) { - set_default_warp_params(&cm->global_motion[frame]); - } -#endif write_global_motion_params( &cm->global_motion[frame], &cm->prev_frame->global_motion[frame], cm->fc->global_motion_types_prob, w, cm->allow_high_precision_mv); @@ -4950,13 +4631,18 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) { #if !CONFIG_EC_ADAPT update_txfm_probs(cm, header_bc, counts); #endif +#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT + if (cm->tx_mode == TX_MODE_SELECT) + av1_cond_prob_diff_update(header_bc, &cm->fc->quarter_tx_size_prob, + cm->counts.quarter_tx_size, probwt); +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT #if CONFIG_LV_MAP av1_write_txb_probs(cpi, header_bc); #else #if !CONFIG_PVQ -#if !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET) +#if !CONFIG_EC_ADAPT update_coef_probs(cpi, header_bc); -#endif // !(CONFIG_EC_ADAPT && CONFIG_NEW_TOKENSET) +#endif // !CONFIG_EC_ADAPT #endif // CONFIG_PVQ #endif // CONFIG_LV_MAP @@ -5023,9 +4709,7 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) { if (frame_is_intra_only(cm)) { av1_copy(cm->kf_y_prob, av1_kf_y_mode_prob); -#if CONFIG_EC_MULTISYMBOL av1_copy(cm->fc->kf_y_cdf, av1_kf_y_mode_cdf); -#endif #if !CONFIG_EC_ADAPT for (i = 0; i < INTRA_MODES; ++i) @@ -5034,21 +4718,19 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) { counts->kf_y_mode[i][j], INTRA_MODES, probwt, header_bc); #endif // CONFIG_EC_ADAPT - } else { -#if CONFIG_REF_MV - update_inter_mode_probs(cm, header_bc, counts); -#else -#if !CONFIG_EC_ADAPT - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { - prob_diff_update(av1_inter_mode_tree, cm->fc->inter_mode_probs[i], - counts->inter_mode[i], INTER_MODES, probwt, header_bc); +#if CONFIG_INTRABC + if (cm->allow_screen_content_tools) { + av1_cond_prob_diff_update(header_bc, &fc->intrabc_prob, + cm->counts.intrabc, probwt); } #endif -#endif + } else { + update_inter_mode_probs(cm, header_bc, counts); #if CONFIG_EXT_INTER update_inter_compound_mode_probs(cm, probwt, header_bc); - - if (cm->reference_mode != COMPOUND_REFERENCE) { +#if CONFIG_INTERINTRA + if (cm->reference_mode != COMPOUND_REFERENCE && + cm->allow_interintra_compound) { for (i = 0; i < BLOCK_SIZE_GROUPS; i++) { if (is_interintra_allowed_bsize_group(i)) { av1_cond_prob_diff_update(header_bc, &fc->interintra_prob[i], @@ -5060,14 +4742,17 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) { av1_interintra_mode_tree, cm->fc->interintra_mode_prob[i], counts->interintra_mode[i], INTERINTRA_MODES, probwt, header_bc); } +#if CONFIG_WEDGE for (i = 0; i < BLOCK_SIZES; i++) { if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i)) av1_cond_prob_diff_update(header_bc, &fc->wedge_interintra_prob[i], cm->counts.wedge_interintra[i], probwt); } +#endif // CONFIG_WEDGE } +#endif // CONFIG_INTERINTRA #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE - if (cm->reference_mode != SINGLE_REFERENCE) { + if (cm->reference_mode != SINGLE_REFERENCE && cm->allow_masked_compound) { for (i = 0; i < BLOCK_SIZES; i++) prob_diff_update(av1_compound_type_tree, fc->compound_type_prob[i], cm->counts.compound_interinter[i], COMPOUND_TYPES, @@ -5133,12 +4818,7 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) { } #endif - av1_write_nmv_probs(cm, cm->allow_high_precision_mv, header_bc, -#if CONFIG_REF_MV - counts->mv); -#else - &counts->mv); -#endif + av1_write_nmv_probs(cm, cm->allow_high_precision_mv, header_bc, counts->mv); 
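Dropping the CONFIG_REF_MV branch makes counts->mv an array of per-context MV statistics, matching the NMV_CONTEXTS loops used further down for the CDF setup. A minimal sketch of the per-context pattern this enables follows; write_one_nmv_context is a hypothetical helper used only for illustration, the real work happens inside av1_write_nmv_probs.

  // counts->mv now holds one set of MV statistics per context.
  for (int i = 0; i < NMV_CONTEXTS; ++i) {
    // Each context carries its own joint/component models (fc->nmvc[i]).
    write_one_nmv_context(header_bc, &cm->fc->nmvc[i], &counts->mv[i],
                          cm->allow_high_precision_mv);  // hypothetical helper
  }
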
#if !CONFIG_EC_ADAPT update_ext_tx_probs(cm, header_bc); #endif @@ -5149,22 +4829,12 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) { write_global_motion(cpi, header_bc); #endif // CONFIG_GLOBAL_MOTION } -#if CONFIG_EC_MULTISYMBOL #if !CONFIG_EC_ADAPT -#if CONFIG_NEW_TOKENSET av1_coef_head_cdfs(fc); -#endif av1_coef_pareto_cdfs(fc); -#if CONFIG_REF_MV for (i = 0; i < NMV_CONTEXTS; ++i) av1_set_mv_cdfs(&fc->nmvc[i]); -#else - av1_set_mv_cdfs(&fc->nmvc); -#endif -#if CONFIG_EC_MULTISYMBOL av1_set_mode_cdfs(cm); -#endif #endif // !CONFIG_EC_ADAPT -#endif #if CONFIG_ANS aom_buf_ans_flush(header_bc); header_size = buf_ans_write_end(header_bc); diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h index 39e08d5b4..e16479e64 100644 --- a/third_party/aom/av1/encoder/block.h +++ b/third_party/aom/av1/encoder/block.h @@ -17,9 +17,7 @@ #if CONFIG_PVQ #include "av1/encoder/encint.h" #endif -#if CONFIG_REF_MV #include "av1/common/mvref_common.h" -#endif #ifdef __cplusplus extern "C" { @@ -79,13 +77,11 @@ typedef struct { int dc_sign_ctx[MAX_MB_PLANE] [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; #endif -#if CONFIG_REF_MV uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; #if CONFIG_EXT_INTER int16_t compound_mode_context[MODE_CTX_REF_FRAMES]; #endif // CONFIG_EXT_INTER -#endif } MB_MODE_INFO_EXT; typedef struct { @@ -141,27 +137,18 @@ struct macroblock { unsigned int pred_sse[TOTAL_REFS_PER_FRAME]; int pred_mv_sad[TOTAL_REFS_PER_FRAME]; -#if CONFIG_REF_MV int *nmvjointcost; int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS]; int *nmvcost[NMV_CONTEXTS][2]; int *nmvcost_hp[NMV_CONTEXTS][2]; int **mv_cost_stack[NMV_CONTEXTS]; - int *nmvjointsadcost; -#else - int nmvjointcost[MV_JOINTS]; - int *nmvcost[2]; - int *nmvcost_hp[2]; - int nmvjointsadcost[MV_JOINTS]; -#endif - int **mvcost; - int *nmvsadcost[2]; - int *nmvsadcost_hp[2]; - int **mvsadcost; + #if CONFIG_MOTION_VAR int32_t *wsrc_buf; int32_t *mask_buf; + uint8_t *above_pred_buf; + uint8_t *left_pred_buf; #endif // CONFIG_MOTION_VAR #if CONFIG_PALETTE @@ -174,10 +161,8 @@ struct macroblock { #if CONFIG_VAR_TX uint8_t blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; -#if CONFIG_REF_MV uint8_t blk_skip_drl[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; #endif -#endif int skip; @@ -226,8 +211,11 @@ struct macroblock { // This is needed when using the 8x8 Daala distortion metric during RDO, // because it evaluates distortion in a different order than the underlying // 4x4 blocks are coded. - int rate_4x4[256]; -#endif + int rate_4x4[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; +#if CONFIG_CB4X4 + DECLARE_ALIGNED(16, uint8_t, decoded_8x8[8 * 8]); +#endif // CONFIG_CB4X4 +#endif // CONFIG_DAALA_DIST #if CONFIG_CFL // Whether luma needs to be stored during RDO. 
int cfl_store_y; diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h index 67954126c..4f9d5e374 100644 --- a/third_party/aom/av1/encoder/context_tree.h +++ b/third_party/aom/av1/encoder/context_tree.h @@ -34,7 +34,6 @@ typedef struct { uint8_t *blk_skip[MAX_MB_PLANE]; #endif - // dual buffer pointers, 0: in use, 1: best in store tran_low_t *coeff[MAX_MB_PLANE]; tran_low_t *qcoeff[MAX_MB_PLANE]; tran_low_t *dqcoeff[MAX_MB_PLANE]; @@ -48,9 +47,8 @@ typedef struct { int num_4x4_blk; int skip; - int pred_pixel_ready; // For current partition, only if all Y, U, and V transform blocks' - // coefficients are quantized to 0, skippable is set to 0. + // coefficients are quantized to 0, skippable is set to 1. int skippable; int best_mode_index; int hybrid_pred_diff; diff --git a/third_party/aom/av1/encoder/corner_match.c b/third_party/aom/av1/encoder/corner_match.c index 64ee0c5ae..3827b65fa 100644 --- a/third_party/aom/av1/encoder/corner_match.c +++ b/third_party/aom/av1/encoder/corner_match.c @@ -9,16 +9,13 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include <stdio.h> #include <stdlib.h> #include <memory.h> #include <math.h> +#include "./av1_rtcd.h" #include "av1/encoder/corner_match.h" -#define MATCH_SZ 13 -#define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2) -#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ) #define SEARCH_SZ 9 #define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2) @@ -28,8 +25,8 @@ centered at (x, y). */ static double compute_variance(unsigned char *im, int stride, int x, int y) { - int sum = 0.0; - int sumsq = 0.0; + int sum = 0; + int sumsq = 0; int var; int i, j; for (i = 0; i < MATCH_SZ; ++i) @@ -46,9 +43,9 @@ static double compute_variance(unsigned char *im, int stride, int x, int y) { correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows of each image, centered at (x1, y1) and (x2, y2) respectively. */ -static double compute_cross_correlation(unsigned char *im1, int stride1, int x1, - int y1, unsigned char *im2, int stride2, - int x2, int y2) { +double compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, + int y1, unsigned char *im2, int stride2, + int x2, int y2) { int v1, v2; int sum1 = 0; int sum2 = 0; diff --git a/third_party/aom/av1/encoder/corner_match.h b/third_party/aom/av1/encoder/corner_match.h index c0458642c..3b16f9efc 100644 --- a/third_party/aom/av1/encoder/corner_match.h +++ b/third_party/aom/av1/encoder/corner_match.h @@ -15,6 +15,10 @@ #include <stdlib.h> #include <memory.h> +#define MATCH_SZ 13 +#define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2) +#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ) + typedef struct { int x, y; int rx, ry; diff --git a/third_party/aom/av1/encoder/daala_compat_enc.c b/third_party/aom/av1/encoder/daala_compat_enc.c index 3df424cac..c60e2d3d7 100644 --- a/third_party/aom/av1/encoder/daala_compat_enc.c +++ b/third_party/aom/av1/encoder/daala_compat_enc.c @@ -12,19 +12,19 @@ #include "encint.h" void od_encode_checkpoint(const daala_enc_ctx *enc, od_rollback_buffer *rbuf) { -#if CONFIG_DAALA_EC +#if !CONFIG_ANS od_ec_enc_checkpoint(&rbuf->ec, &enc->w.ec); #else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#error "CONFIG_PVQ currently requires !CONFIG_ANS." #endif OD_COPY(&rbuf->adapt, enc->state.adapt, 1); } void od_encode_rollback(daala_enc_ctx *enc, const od_rollback_buffer *rbuf) { -#if CONFIG_DAALA_EC +#if !CONFIG_ANS od_ec_enc_rollback(&enc->w.ec, &rbuf->ec); #else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." 
+#error "CONFIG_PVQ currently requires !CONFIG_ANS." #endif OD_COPY(enc->state.adapt, &rbuf->adapt, 1); } diff --git a/third_party/aom/av1/encoder/dct.c b/third_party/aom/av1/encoder/dct.c index 09e1b0563..f6b64f0f7 100644 --- a/third_party/aom/av1/encoder/dct.c +++ b/third_party/aom/av1/encoder/dct.c @@ -19,7 +19,7 @@ #include "aom_ports/mem.h" #include "av1/common/blockd.h" #include "av1/common/av1_fwd_txfm1d.h" -#include "av1/common/av1_fwd_txfm2d_cfg.h" +#include "av1/common/av1_fwd_txfm1d_cfg.h" #include "av1/common/idct.h" static INLINE void range_check(const tran_low_t *input, const int size, @@ -1022,6 +1022,10 @@ static void fhalfright32(const tran_low_t *input, tran_low_t *output) { } #if CONFIG_EXT_TX +// TODO(sarahparker) these functions will be removed once the highbitdepth +// codepath works properly for rectangular transforms. They have almost +// identical versions in av1_fwd_txfm1d.c, but those are currently only +// being used for square transforms. static void fidtx4(const tran_low_t *input, tran_low_t *output) { int i; for (i = 0; i < 4; ++i) @@ -2133,8 +2137,7 @@ static void fdct64_col(const tran_low_t *input, tran_low_t *output) { int32_t in[64], out[64]; int i; for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i]; - av1_fdct64_new(in, out, fwd_cos_bit_col_dct_dct_64, - fwd_stage_range_col_dct_dct_64); + av1_fdct64_new(in, out, fwd_cos_bit_col_dct_64, fwd_stage_range_col_dct_64); for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i]; } @@ -2142,8 +2145,7 @@ static void fdct64_row(const tran_low_t *input, tran_low_t *output) { int32_t in[64], out[64]; int i; for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i]; - av1_fdct64_new(in, out, fwd_cos_bit_row_dct_dct_64, - fwd_stage_range_row_dct_dct_64); + av1_fdct64_new(in, out, fwd_cos_bit_row_dct_64, fwd_stage_range_row_dct_64); for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i]; } @@ -2225,4 +2227,49 @@ void av1_highbd_fht64x64_c(const int16_t *input, tran_low_t *output, int stride, } #endif // CONFIG_TX64X64 #endif // CONFIG_HIGHBITDEPTH + +#if CONFIG_DPCM_INTRA +void av1_dpcm_ft4_c(const int16_t *input, int stride, TX_TYPE_1D tx_type, + tran_low_t *output) { + assert(tx_type < TX_TYPES_1D); + static const transform_1d FHT[] = { fdct4, fadst4, fadst4, fidtx4 }; + const transform_1d ft = FHT[tx_type]; + tran_low_t temp_in[4]; + for (int i = 0; i < 4; ++i) + temp_in[i] = (tran_low_t)fdct_round_shift(input[i * stride] * 4 * Sqrt2); + ft(temp_in, output); +} + +void av1_dpcm_ft8_c(const int16_t *input, int stride, TX_TYPE_1D tx_type, + tran_low_t *output) { + assert(tx_type < TX_TYPES_1D); + static const transform_1d FHT[] = { fdct8, fadst8, fadst8, fidtx8 }; + const transform_1d ft = FHT[tx_type]; + tran_low_t temp_in[8]; + for (int i = 0; i < 8; ++i) temp_in[i] = input[i * stride] * 4; + ft(temp_in, output); +} + +void av1_dpcm_ft16_c(const int16_t *input, int stride, TX_TYPE_1D tx_type, + tran_low_t *output) { + assert(tx_type < TX_TYPES_1D); + static const transform_1d FHT[] = { fdct16, fadst16, fadst16, fidtx16 }; + const transform_1d ft = FHT[tx_type]; + tran_low_t temp_in[16]; + for (int i = 0; i < 16; ++i) + temp_in[i] = (tran_low_t)fdct_round_shift(input[i * stride] * 2 * Sqrt2); + ft(temp_in, output); +} + +void av1_dpcm_ft32_c(const int16_t *input, int stride, TX_TYPE_1D tx_type, + tran_low_t *output) { + assert(tx_type < TX_TYPES_1D); + static const transform_1d FHT[] = { fdct32, fhalfright32, fhalfright32, + fidtx32 }; + const transform_1d ft = FHT[tx_type]; + tran_low_t temp_in[32]; + for (int i = 0; i < 
32; ++i) temp_in[i] = input[i * stride]; + ft(temp_in, output); +} +#endif // CONFIG_DPCM_INTRA #endif // !AV1_DCT_GTEST diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c index d254157e7..36d09c02a 100644 --- a/third_party/aom/av1/encoder/encodeframe.c +++ b/third_party/aom/av1/encoder/encodeframe.c @@ -72,8 +72,7 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, - int mi_col, BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx, int *rate); + int mi_col, BLOCK_SIZE bsize, int *rate); #if CONFIG_SUPERTX static int check_intra_b(PICK_MODE_CONTEXT *ctx); @@ -273,14 +272,13 @@ static void set_offsets_without_segment_id(const AV1_COMP *const cpi, const int mi_width = mi_size_wide[bsize]; const int mi_height = mi_size_high[bsize]; - set_skip_context(xd, mi_row, mi_col); - set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); - + set_skip_context(xd, mi_row, mi_col); #if CONFIG_VAR_TX - xd->above_txfm_context = cm->above_txfm_context + mi_col; - xd->left_txfm_context = - xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + xd->above_txfm_context = + cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); + xd->left_txfm_context = xd->left_txfm_context_buffer + + ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); xd->max_tx_size = max_txsize_lookup[bsize]; #endif @@ -452,563 +450,6 @@ static void set_segment_id_supertx(const AV1_COMP *const cpi, } #endif // CONFIG_SUPERTX -static void set_block_size(AV1_COMP *const cpi, MACROBLOCK *const x, - MACROBLOCKD *const xd, int mi_row, int mi_col, - BLOCK_SIZE bsize) { - if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) { - const int mi_width = AOMMAX(mi_size_wide[bsize], mi_size_wide[BLOCK_8X8]); - const int mi_height = AOMMAX(mi_size_high[bsize], mi_size_high[BLOCK_8X8]); - for (int r = 0; r < mi_height; ++r) { - for (int c = 0; c < mi_width; ++c) { - set_mode_info_offsets(cpi, x, xd, mi_row + r, mi_col + c); - xd->mi[0]->mbmi.sb_type = bsize; - } - } - } -} - -static void set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x, - MACROBLOCKD *const xd, VAR_TREE *vt, int mi_row, - int mi_col, const int64_t *const threshold, - const BLOCK_SIZE *const bsize_min) { - AV1_COMMON *const cm = &cpi->common; - const int hbw = mi_size_wide[vt->bsize] / 2; - const int hbh = mi_size_high[vt->bsize] / 2; - const int has_cols = mi_col + hbw < cm->mi_cols; - const int has_rows = mi_row + hbh < cm->mi_rows; - - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - - assert(vt->bsize >= BLOCK_8X8); - - assert(hbh == hbw); - - if (vt->bsize == BLOCK_8X8 && cm->frame_type != KEY_FRAME) { - set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_8X8); - return; - } - - if (vt->force_split || (!has_cols && !has_rows)) goto split; - - // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if - // variance is below threshold, otherwise split will be selected. - // No check for vert/horiz split as too few samples for variance. 
- if (vt->bsize == bsize_min[0]) { - if (has_cols && has_rows && vt->variances.none.variance < threshold[0]) { - set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize); - return; - } else { - BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_SPLIT); - set_block_size(cpi, x, xd, mi_row, mi_col, subsize); - if (vt->bsize > BLOCK_8X8) { - set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize); - set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize); - set_block_size(cpi, x, xd, mi_row + hbh, mi_col + hbw, subsize); - } - return; - } - } else if (vt->bsize > bsize_min[0]) { - // For key frame: take split for bsize above 32X32 or very high variance. - if (cm->frame_type == KEY_FRAME && - (vt->bsize > BLOCK_32X32 || - vt->variances.none.variance > (threshold[0] << 4))) { - goto split; - } - // If variance is low, take the bsize (no split). - if (has_cols && has_rows && vt->variances.none.variance < threshold[0]) { - set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize); - return; - } - - // Check vertical split. - if (has_rows) { - BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_VERT); - if (vt->variances.vert[0].variance < threshold[0] && - vt->variances.vert[1].variance < threshold[0] && - get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) { - set_block_size(cpi, x, xd, mi_row, mi_col, subsize); - set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize); - return; - } - } - // Check horizontal split. - if (has_cols) { - BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_HORZ); - if (vt->variances.horz[0].variance < threshold[0] && - vt->variances.horz[1].variance < threshold[0] && - get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) { - set_block_size(cpi, x, xd, mi_row, mi_col, subsize); - set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize); - return; - } - } - } - -split : { - set_vt_partitioning(cpi, x, xd, vt->split[0], mi_row, mi_col, threshold + 1, - bsize_min + 1); - set_vt_partitioning(cpi, x, xd, vt->split[1], mi_row, mi_col + hbw, - threshold + 1, bsize_min + 1); - set_vt_partitioning(cpi, x, xd, vt->split[2], mi_row + hbh, mi_col, - threshold + 1, bsize_min + 1); - set_vt_partitioning(cpi, x, xd, vt->split[3], mi_row + hbh, mi_col + hbw, - threshold + 1, bsize_min + 1); - return; -} -} - -// Set the variance split thresholds for following the block sizes: -// 0 - threshold_64x64, 1 - threshold_32x32, 2 - threshold_16x16, -// 3 - vbp_threshold_8x8. vbp_threshold_8x8 (to split to 4x4 partition) is -// currently only used on key frame. -static void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], int q) { - AV1_COMMON *const cm = &cpi->common; - const int is_key_frame = (cm->frame_type == KEY_FRAME); - const int threshold_multiplier = is_key_frame ? 
20 : 1; - const int64_t threshold_base = - (int64_t)(threshold_multiplier * cpi->y_dequant[q][1]); - if (is_key_frame) { - thresholds[1] = threshold_base; - thresholds[2] = threshold_base >> 2; - thresholds[3] = threshold_base >> 2; - thresholds[4] = threshold_base << 2; - } else { - thresholds[2] = threshold_base; - if (cm->width <= 352 && cm->height <= 288) { - thresholds[1] = threshold_base >> 2; - thresholds[3] = threshold_base << 3; - } else { - thresholds[1] = threshold_base; - thresholds[2] = (5 * threshold_base) >> 2; - if (cm->width >= 1920 && cm->height >= 1080) - thresholds[2] = (7 * threshold_base) >> 2; - thresholds[3] = threshold_base << cpi->oxcf.speed; - } - } - thresholds[0] = INT64_MIN; -} - -void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q) { - AV1_COMMON *const cm = &cpi->common; - SPEED_FEATURES *const sf = &cpi->sf; - const int is_key_frame = (cm->frame_type == KEY_FRAME); - if (sf->partition_search_type != VAR_BASED_PARTITION && - sf->partition_search_type != REFERENCE_PARTITION) { - return; - } else { - set_vbp_thresholds(cpi, cpi->vbp_thresholds, q); - // The thresholds below are not changed locally. - if (is_key_frame) { - cpi->vbp_threshold_sad = 0; - cpi->vbp_bsize_min = BLOCK_8X8; - } else { - if (cm->width <= 352 && cm->height <= 288) - cpi->vbp_threshold_sad = 100; - else - cpi->vbp_threshold_sad = (cpi->y_dequant[q][1] << 1) > 1000 - ? (cpi->y_dequant[q][1] << 1) - : 1000; - cpi->vbp_bsize_min = BLOCK_16X16; - } - cpi->vbp_threshold_minmax = 15 + (q >> 3); - } -} - -// Compute the minmax over the 8x8 subblocks. -static int compute_minmax_8x8(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, -#if CONFIG_HIGHBITDEPTH - int highbd, -#endif - int pixels_wide, int pixels_high) { - int k; - int minmax_max = 0; - int minmax_min = 255; - // Loop over the 4 8x8 subblocks. 
- for (k = 0; k < 4; k++) { - const int x8_idx = ((k & 1) << 3); - const int y8_idx = ((k >> 1) << 3); - int min = 0; - int max = 0; - if (x8_idx < pixels_wide && y8_idx < pixels_high) { - const int src_offset = y8_idx * src_stride + x8_idx; - const int ref_offset = y8_idx * ref_stride + x8_idx; -#if CONFIG_HIGHBITDEPTH - if (highbd) { - aom_highbd_minmax_8x8(src + src_offset, src_stride, ref + ref_offset, - ref_stride, &min, &max); - } else { - aom_minmax_8x8(src + src_offset, src_stride, ref + ref_offset, - ref_stride, &min, &max); - } -#else - aom_minmax_8x8(src + src_offset, src_stride, ref + ref_offset, ref_stride, - &min, &max); -#endif - if ((max - min) > minmax_max) minmax_max = (max - min); - if ((max - min) < minmax_min) minmax_min = (max - min); - } - } - return (minmax_max - minmax_min); -} - -#if CONFIG_HIGHBITDEPTH -static INLINE int avg_4x4(const uint8_t *const src, const int stride, - const int highbd) { - if (highbd) { - return aom_highbd_avg_4x4(src, stride); - } else { - return aom_avg_4x4(src, stride); - } -} -#else -static INLINE int avg_4x4(const uint8_t *const src, const int stride) { - return aom_avg_4x4(src, stride); -} -#endif - -#if CONFIG_HIGHBITDEPTH -static INLINE int avg_8x8(const uint8_t *const src, const int stride, - const int highbd) { - if (highbd) { - return aom_highbd_avg_8x8(src, stride); - } else { - return aom_avg_8x8(src, stride); - } -} -#else -static INLINE int avg_8x8(const uint8_t *const src, const int stride) { - return aom_avg_8x8(src, stride); -} -#endif - -static void init_variance_tree(VAR_TREE *const vt, -#if CONFIG_HIGHBITDEPTH - const int highbd, -#endif - BLOCK_SIZE bsize, BLOCK_SIZE leaf_size, - const int width, const int height, - const uint8_t *const src, const int src_stride, - const uint8_t *const ref, const int ref_stride) { - assert(bsize >= leaf_size); - - vt->bsize = bsize; - - vt->force_split = 0; - - vt->src = src; - vt->src_stride = src_stride; - vt->ref = ref; - vt->ref_stride = ref_stride; - - vt->width = width; - vt->height = height; - -#if CONFIG_HIGHBITDEPTH - vt->highbd = highbd; -#endif // CONFIG_HIGHBITDEPTH - - if (bsize > leaf_size) { - const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); - const int px = block_size_wide[subsize]; - - init_variance_tree(vt->split[0], -#if CONFIG_HIGHBITDEPTH - highbd, -#endif // CONFIG_HIGHBITDEPTH - subsize, leaf_size, AOMMIN(px, width), - AOMMIN(px, height), src, src_stride, ref, ref_stride); - init_variance_tree(vt->split[1], -#if CONFIG_HIGHBITDEPTH - highbd, -#endif // CONFIG_HIGHBITDEPTH - subsize, leaf_size, width - px, AOMMIN(px, height), - src + px, src_stride, ref + px, ref_stride); - init_variance_tree(vt->split[2], -#if CONFIG_HIGHBITDEPTH - highbd, -#endif // CONFIG_HIGHBITDEPTH - subsize, leaf_size, AOMMIN(px, width), height - px, - src + px * src_stride, src_stride, ref + px * ref_stride, - ref_stride); - init_variance_tree(vt->split[3], -#if CONFIG_HIGHBITDEPTH - highbd, -#endif // CONFIG_HIGHBITDEPTH - subsize, leaf_size, width - px, height - px, - src + px * src_stride + px, src_stride, - ref + px * ref_stride + px, ref_stride); - } -} - -// Fill the variance tree based on averaging pixel values (sub-sampling), at -// the leaf node size. 
-static void fill_variance_tree(VAR_TREE *const vt, const BLOCK_SIZE leaf_size) { - if (vt->bsize > leaf_size) { - fill_variance_tree(vt->split[0], leaf_size); - fill_variance_tree(vt->split[1], leaf_size); - fill_variance_tree(vt->split[2], leaf_size); - fill_variance_tree(vt->split[3], leaf_size); - fill_variance_node(vt); - } else if (vt->width <= 0 || vt->height <= 0) { - fill_variance(0, 0, 0, &vt->variances.none); - } else { - unsigned int sse = 0; - int sum = 0; - int src_avg; - int ref_avg; - assert(leaf_size == BLOCK_4X4 || leaf_size == BLOCK_8X8); - if (leaf_size == BLOCK_4X4) { - src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd)); - ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd)); - } else { - src_avg = avg_8x8(vt->src, vt->src_stride IF_HBD(, vt->highbd)); - ref_avg = avg_8x8(vt->ref, vt->ref_stride IF_HBD(, vt->highbd)); - } - sum = src_avg - ref_avg; - sse = sum * sum; - fill_variance(sse, sum, 0, &vt->variances.none); - } -} - -static void refine_variance_tree(VAR_TREE *const vt, const int64_t threshold) { - if (vt->bsize >= BLOCK_8X8) { - if (vt->bsize == BLOCK_16X16) { - if (vt->variances.none.variance <= threshold) - return; - else - vt->force_split = 0; - } - - refine_variance_tree(vt->split[0], threshold); - refine_variance_tree(vt->split[1], threshold); - refine_variance_tree(vt->split[2], threshold); - refine_variance_tree(vt->split[3], threshold); - - if (vt->bsize <= BLOCK_16X16) fill_variance_node(vt); - } else if (vt->width <= 0 || vt->height <= 0) { - fill_variance(0, 0, 0, &vt->variances.none); - } else { - const int src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd)); - const int ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd)); - const int sum = src_avg - ref_avg; - const unsigned int sse = sum * sum; - assert(vt->bsize == BLOCK_4X4); - fill_variance(sse, sum, 0, &vt->variances.none); - } -} - -static int check_split_key_frame(VAR_TREE *const vt, const int64_t threshold) { - if (vt->bsize == BLOCK_32X32) { - vt->force_split = vt->variances.none.variance > threshold; - } else { - vt->force_split |= check_split_key_frame(vt->split[0], threshold); - vt->force_split |= check_split_key_frame(vt->split[1], threshold); - vt->force_split |= check_split_key_frame(vt->split[2], threshold); - vt->force_split |= check_split_key_frame(vt->split[3], threshold); - } - return vt->force_split; -} - -static int check_split(AV1_COMP *const cpi, VAR_TREE *const vt, - const int segment_id, const int64_t *const thresholds) { - if (vt->bsize == BLOCK_16X16) { - vt->force_split = vt->variances.none.variance > thresholds[0]; - if (!vt->force_split && vt->variances.none.variance > thresholds[-1] && - !cyclic_refresh_segment_id_boosted(segment_id)) { - // We have some nominal amount of 16x16 variance (based on average), - // compute the minmax over the 8x8 sub-blocks, and if above threshold, - // force split to 8x8 block for this 16x16 block. 
- int minmax = - compute_minmax_8x8(vt->src, vt->src_stride, vt->ref, vt->ref_stride, -#if CONFIG_HIGHBITDEPTH - vt->highbd, -#endif - vt->width, vt->height); - vt->force_split = minmax > cpi->vbp_threshold_minmax; - } - } else { - vt->force_split |= - check_split(cpi, vt->split[0], segment_id, thresholds + 1); - vt->force_split |= - check_split(cpi, vt->split[1], segment_id, thresholds + 1); - vt->force_split |= - check_split(cpi, vt->split[2], segment_id, thresholds + 1); - vt->force_split |= - check_split(cpi, vt->split[3], segment_id, thresholds + 1); - - if (vt->bsize == BLOCK_32X32 && !vt->force_split) { - vt->force_split = vt->variances.none.variance > thresholds[0]; - } - } - - return vt->force_split; -} - -// This function chooses partitioning based on the variance between source and -// reconstructed last (or golden), where variance is computed for down-sampled -// inputs. -static void choose_partitioning(AV1_COMP *const cpi, ThreadData *const td, - const TileInfo *const tile, MACROBLOCK *const x, - const int mi_row, const int mi_col) { - AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - VAR_TREE *const vt = td->var_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2]; -#if CONFIG_DUAL_FILTER - int i; -#endif - const uint8_t *src; - const uint8_t *ref; - int src_stride; - int ref_stride; - int pixels_wide = MI_SIZE * mi_size_wide[cm->sb_size]; - int pixels_high = MI_SIZE * mi_size_high[cm->sb_size]; - int64_t thresholds[5] = { - cpi->vbp_thresholds[0], cpi->vbp_thresholds[1], cpi->vbp_thresholds[2], - cpi->vbp_thresholds[3], cpi->vbp_thresholds[4], - }; - BLOCK_SIZE bsize_min[5] = { BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, - cpi->vbp_bsize_min, BLOCK_8X8 }; - const int start_level = cm->sb_size == BLOCK_64X64 ? 1 : 0; - const int64_t *const thre = thresholds + start_level; - const BLOCK_SIZE *const bmin = bsize_min + start_level; - - const int is_key_frame = (cm->frame_type == KEY_FRAME); - const int low_res = (cm->width <= 352 && cm->height <= 288); - - int segment_id = CR_SEGMENT_ID_BASE; - - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { - const uint8_t *const map = - cm->seg.update_map ? 
cpi->segmentation_map : cm->last_frame_seg_map; - segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col); - - if (cyclic_refresh_segment_id_boosted(segment_id)) { - int q = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex); - set_vbp_thresholds(cpi, thresholds, q); - } - } - - set_offsets(cpi, tile, x, mi_row, mi_col, cm->sb_size); - - if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); - if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3); - - src = x->plane[0].src.buf; - src_stride = x->plane[0].src.stride; - - if (!is_key_frame) { - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); - const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); - unsigned int y_sad, y_sad_g; - - const int hbs = cm->mib_size / 2; - const int split_vert = mi_col + hbs >= cm->mi_cols; - const int split_horz = mi_row + hbs >= cm->mi_rows; - BLOCK_SIZE bsize; - - if (split_vert && split_horz) - bsize = get_subsize(cm->sb_size, PARTITION_SPLIT); - else if (split_vert) - bsize = get_subsize(cm->sb_size, PARTITION_VERT); - else if (split_horz) - bsize = get_subsize(cm->sb_size, PARTITION_HORZ); - else - bsize = cm->sb_size; - - assert(yv12 != NULL); - - if (yv12_g && yv12_g != yv12) { - av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, - &cm->frame_refs[GOLDEN_FRAME - 1].sf); - y_sad_g = cpi->fn_ptr[bsize].sdf( - x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, - xd->plane[0].pre[0].stride); - } else { - y_sad_g = UINT_MAX; - } - - av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, - &cm->frame_refs[LAST_FRAME - 1].sf); - mbmi->ref_frame[0] = LAST_FRAME; - mbmi->ref_frame[1] = NONE_FRAME; - mbmi->sb_type = cm->sb_size; - mbmi->mv[0].as_int = 0; -#if CONFIG_DUAL_FILTER - for (i = 0; i < 4; ++i) mbmi->interp_filter[i] = BILINEAR; -#else - mbmi->interp_filter = BILINEAR; -#endif - - y_sad = av1_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); - - if (y_sad_g < y_sad) { - av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, - &cm->frame_refs[GOLDEN_FRAME - 1].sf); - mbmi->ref_frame[0] = GOLDEN_FRAME; - mbmi->mv[0].as_int = 0; - y_sad = y_sad_g; - } else { - x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv; - } - - av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, cm->sb_size); - - ref = xd->plane[0].dst.buf; - ref_stride = xd->plane[0].dst.stride; - - // If the y_sad is very small, take the largest partition and exit. - // Don't check on boosted segment for now, as largest is suppressed there. - if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad) { - if (!split_vert && !split_horz) { - set_block_size(cpi, x, xd, mi_row, mi_col, cm->sb_size); - return; - } - } - } else { - ref = AV1_VAR_OFFS; - ref_stride = 0; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - switch (xd->bd) { - case 10: ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10); break; - case 12: ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12); break; - case 8: - default: ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8); break; - } - } -#endif // CONFIG_HIGHBITDEPTH - } - - init_variance_tree( - vt, -#if CONFIG_HIGHBITDEPTH - xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, -#endif // CONFIG_HIGHBITDEPTH - cm->sb_size, (is_key_frame || low_res) ? BLOCK_4X4 : BLOCK_8X8, - pixels_wide, pixels_high, src, src_stride, ref, ref_stride); - - // Fill in the entire tree of variances and compute splits. 
- if (is_key_frame) { - fill_variance_tree(vt, BLOCK_4X4); - check_split_key_frame(vt, thre[1]); - } else { - fill_variance_tree(vt, BLOCK_8X8); - check_split(cpi, vt, segment_id, thre); - if (low_res) { - refine_variance_tree(vt, thre[1] << 1); - } - } - - vt->force_split |= mi_col + cm->mib_size > cm->mi_cols || - mi_row + cm->mib_size > cm->mi_rows; - - // Now go through the entire structure, splitting every block size until - // we get to one that's got a variance lower than our threshold. - set_vt_partitioning(cpi, x, xd, vt, mi_row, mi_col, thre, bmin); -} - #if CONFIG_DUAL_FILTER static void reset_intmv_filter_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, MB_MODE_INFO *mbmi) { @@ -1067,7 +508,6 @@ static void reset_tx_size(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, } } -#if CONFIG_REF_MV static void set_ref_and_pred_mvs(MACROBLOCK *const x, int_mv *const mi_pred_mv, int8_t rf_type) { MACROBLOCKD *const xd = &x->e_mbd; @@ -1116,7 +556,6 @@ static void set_ref_and_pred_mvs(MACROBLOCK *const x, int_mv *const mi_pred_mv, } #endif // CONFIG_EXT_INTER } -#endif // CONFIG_REF_MV static void update_state(const AV1_COMP *const cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, @@ -1144,9 +583,7 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td, const int mi_height = mi_size_high[bsize]; const int unify_bsize = CONFIG_CB4X4; -#if CONFIG_REF_MV int8_t rf_type; -#endif #if !CONFIG_SUPERTX assert(mi->mbmi.sb_type == bsize); @@ -1159,13 +596,11 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td, reset_intmv_filter_type(cm, xd, mbmi); #endif -#if CONFIG_REF_MV rf_type = av1_ref_frame_type(mbmi->ref_frame); if (x->mbmi_ext->ref_mv_count[rf_type] > 1 && (mbmi->sb_type >= BLOCK_8X8 || unify_bsize)) { set_ref_and_pred_mvs(x, mi->mbmi.pred_mv, rf_type); } -#endif // CONFIG_REF_MV // If segmentation in use if (seg->enabled) { @@ -1250,7 +685,11 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td, THR_D63_PRED /*D63_PRED*/, #if CONFIG_ALT_INTRA THR_SMOOTH, /*SMOOTH_PRED*/ -#endif // CONFIG_ALT_INTRA +#if CONFIG_SMOOTH_HV + THR_SMOOTH_V, /*SMOOTH_V_PRED*/ + THR_SMOOTH_H, /*SMOOTH_H_PRED*/ +#endif // CONFIG_SMOOTH_HV +#endif // CONFIG_ALT_INTRA THR_TM /*TM_PRED*/, }; ++mode_chosen_counts[kf_mode_index[mbmi->mode]]; @@ -1339,9 +778,7 @@ static void update_state_supertx(const AV1_COMP *const cpi, ThreadData *td, MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col; int w, h; -#if CONFIG_REF_MV int8_t rf_type; -#endif *mi_addr = *mi; *x->mbmi_ext = ctx->mbmi_ext; @@ -1352,13 +789,11 @@ static void update_state_supertx(const AV1_COMP *const cpi, ThreadData *td, reset_intmv_filter_type(cm, xd, mbmi); #endif -#if CONFIG_REF_MV rf_type = av1_ref_frame_type(mbmi->ref_frame); if (x->mbmi_ext->ref_mv_count[rf_type] > 1 && (mbmi->sb_type >= BLOCK_8X8 || unify_bsize)) { set_ref_and_pred_mvs(x, mi->mbmi.pred_mv, rf_type); } -#endif // CONFIG_REF_MV // If segmentation in use if (seg->enabled) { @@ -1846,6 +1281,29 @@ static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, return av1_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q); } +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +static void daala_dist_set_sub8x8_dst(MACROBLOCK *const x, uint8_t *dst8x8, + BLOCK_SIZE bsize, int bw, int bh, + int mi_row, int mi_col) { + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = pd->dst.buf; + + assert(bsize < 
BLOCK_8X8); + + if (bsize < BLOCK_8X8) { + int i, j; + uint8_t *dst_sub8x8 = &dst8x8[((mi_row & 1) * 8 + (mi_col & 1)) << 2]; + + for (j = 0; j < bh; ++j) + for (i = 0; i < bw; ++i) { + dst_sub8x8[j * 8 + i] = dst[j * dst_stride + i]; + } + } +} +#endif + static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x, int mi_row, int mi_col, RD_STATS *rd_cost, @@ -1865,7 +1323,6 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, struct macroblockd_plane *const pd = xd->plane; const AQ_MODE aq_mode = cpi->oxcf.aq_mode; int i, orig_rdmult; - const int unify_bsize = CONFIG_CB4X4; aom_clear_system_state(); @@ -1915,7 +1372,6 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, #endif // CONFIG_PALETTE ctx->skippable = 0; - ctx->pred_pixel_ready = 0; // Set to zero to make sure we do not use the previous encoded frame stats mbmi->skip = 0; @@ -1967,38 +1423,21 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, *totalrate_nocoef = 0; #endif // CONFIG_SUPERTX } else { - if (bsize >= BLOCK_8X8 || unify_bsize) { - if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { - av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col, - rd_cost, bsize, ctx, best_rd); + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col, + rd_cost, bsize, ctx, best_rd); #if CONFIG_SUPERTX - *totalrate_nocoef = rd_cost->rate; + *totalrate_nocoef = rd_cost->rate; #endif // CONFIG_SUPERTX - } else { - av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, -#if CONFIG_SUPERTX - totalrate_nocoef, -#endif // CONFIG_SUPERTX - bsize, ctx, best_rd); -#if CONFIG_SUPERTX - assert(*totalrate_nocoef >= 0); -#endif // CONFIG_SUPERTX - } } else { - if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { - // The decoder rejects sub8x8 partitions when SEG_LVL_SKIP is set. 
- rd_cost->rate = INT_MAX; - } else { - av1_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, - rd_cost, + av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, #if CONFIG_SUPERTX - totalrate_nocoef, + totalrate_nocoef, #endif // CONFIG_SUPERTX - bsize, ctx, best_rd); + bsize, ctx, best_rd); #if CONFIG_SUPERTX - assert(*totalrate_nocoef >= 0); + assert(*totalrate_nocoef >= 0); #endif // CONFIG_SUPERTX - } } } @@ -2020,7 +1459,6 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, ctx->dist = rd_cost->dist; } -#if CONFIG_REF_MV static void update_inter_mode_stats(FRAME_COUNTS *counts, PREDICTION_MODE mode, int16_t mode_context) { int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; @@ -2050,7 +1488,6 @@ static void update_inter_mode_stats(FRAME_COUNTS *counts, PREDICTION_MODE mode, } } } -#endif static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, int mi_col @@ -2070,7 +1507,6 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, const MB_MODE_INFO *const mbmi = &mi->mbmi; const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const BLOCK_SIZE bsize = mbmi->sb_type; - const int unify_bsize = CONFIG_CB4X4; #if CONFIG_DELTA_Q // delta quant applies to both intra and inter @@ -2125,7 +1561,7 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, if (cm->reference_mode == REFERENCE_MODE_SELECT) { #if !SUB8X8_COMP_REF - if (mbmi->sb_type >= BLOCK_8X8) + if (mbmi->sb_type != BLOCK_4X4) counts->comp_inter[av1_get_reference_mode_context(cm, xd)] [has_second_ref(mbmi)]++; #else @@ -2183,12 +1619,12 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, #endif // CONFIG_EXT_REFS } -#if CONFIG_EXT_INTER +#if CONFIG_EXT_INTER && CONFIG_INTERINTRA if (cm->reference_mode != COMPOUND_REFERENCE && #if CONFIG_SUPERTX !supertx_enabled && #endif - is_interintra_allowed(mbmi)) { + cm->allow_interintra_compound && is_interintra_allowed(mbmi)) { const int bsize_group = size_group_lookup[bsize]; if (mbmi->ref_frame[1] == INTRA_FRAME) { counts->interintra[bsize_group][1]++; @@ -2199,7 +1635,7 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, counts->interintra[bsize_group][0]++; } } -#endif // CONFIG_EXT_INTER +#endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION const MOTION_MODE motion_allowed = motion_mode_allowed( @@ -2242,105 +1678,67 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, if (inter_block && !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { int16_t mode_ctx; -#if !CONFIG_REF_MV - mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]]; -#endif - if (bsize >= BLOCK_8X8 || unify_bsize) { - const PREDICTION_MODE mode = mbmi->mode; -#if CONFIG_REF_MV + const PREDICTION_MODE mode = mbmi->mode; #if CONFIG_EXT_INTER - if (has_second_ref(mbmi)) { - mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; - ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)]; - } else { + if (has_second_ref(mbmi)) { + mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; + ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)]; + } else { #endif // CONFIG_EXT_INTER - mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, - mbmi->ref_frame, bsize, -1); - update_inter_mode_stats(counts, mode, mode_ctx); + mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, + mbmi->ref_frame, 
bsize, -1); + update_inter_mode_stats(counts, mode, mode_ctx); #if CONFIG_EXT_INTER - } + } #endif // CONFIG_EXT_INTER #if CONFIG_EXT_INTER - if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { #else - if (mbmi->mode == NEWMV) { + if (mbmi->mode == NEWMV) { #endif - uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); - int idx; + uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + int idx; - for (idx = 0; idx < 2; ++idx) { - if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { - uint8_t drl_ctx = - av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); - ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx]; + for (idx = 0; idx < 2; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); + ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx]; - if (mbmi->ref_mv_idx == idx) break; - } + if (mbmi->ref_mv_idx == idx) break; } } + } #if CONFIG_EXT_INTER - if (have_nearmv_in_inter_mode(mbmi->mode)) { + if (have_nearmv_in_inter_mode(mbmi->mode)) { #else - if (mbmi->mode == NEARMV) { + if (mbmi->mode == NEARMV) { #endif - uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); - int idx; + uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + int idx; - for (idx = 1; idx < 3; ++idx) { - if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { - uint8_t drl_ctx = - av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); - ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1]; + for (idx = 1; idx < 3; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); + ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1]; - if (mbmi->ref_mv_idx == idx - 1) break; - } - } - } -#else -#if CONFIG_EXT_INTER - if (is_inter_compound_mode(mode)) - ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)]; - else -#endif // CONFIG_EXT_INTER - ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)]; -#endif - } else { - const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; - int idx, idy; - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - const int j = idy * 2 + idx; - const PREDICTION_MODE b_mode = mi->bmi[j].as_mode; -#if CONFIG_REF_MV -#if CONFIG_EXT_INTER - if (has_second_ref(mbmi)) { - mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; - ++counts->inter_compound_mode[mode_ctx] - [INTER_COMPOUND_OFFSET(b_mode)]; - } else { -#endif // CONFIG_EXT_INTER - mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, - mbmi->ref_frame, bsize, j); - update_inter_mode_stats(counts, b_mode, mode_ctx); -#if CONFIG_EXT_INTER - } -#endif // CONFIG_EXT_INTER -#else -#if CONFIG_EXT_INTER - if (is_inter_compound_mode(b_mode)) - ++counts->inter_compound_mode[mode_ctx] - [INTER_COMPOUND_OFFSET(b_mode)]; - else -#endif // CONFIG_EXT_INTER - ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)]; -#endif + if (mbmi->ref_mv_idx == idx - 1) break; } } } } +#if CONFIG_INTRABC + } else { + if (cm->allow_screen_content_tools && bsize >= BLOCK_8X8) { + FRAME_COUNTS *const counts = td->counts; + ++counts->intrabc[mbmi->use_intrabc]; + } else { + assert(!mbmi->use_intrabc); + } +#endif } } @@ -2352,8 +1750,8 @@ typedef struct { #if CONFIG_VAR_TX TXFM_CONTEXT *p_ta; TXFM_CONTEXT *p_tl; - TXFM_CONTEXT ta[MAX_MIB_SIZE]; - TXFM_CONTEXT 
tl[MAX_MIB_SIZE]; + TXFM_CONTEXT ta[2 * MAX_MIB_SIZE]; + TXFM_CONTEXT tl[2 * MAX_MIB_SIZE]; #endif } RD_SEARCH_MACROBLOCK_CONTEXT; @@ -2373,12 +1771,15 @@ static void restore_context(MACROBLOCK *x, int mi_width = mi_size_wide[bsize]; int mi_height = mi_size_high[bsize]; for (p = 0; p < MAX_MB_PLANE; p++) { - memcpy(xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x), + int tx_col; + int tx_row; + tx_col = mi_col << (MI_SIZE_LOG2 - tx_size_wide_log2[0]); + tx_row = (mi_row & MAX_MIB_MASK) << (MI_SIZE_LOG2 - tx_size_high_log2[0]); + memcpy(xd->above_context[p] + (tx_col >> xd->plane[p].subsampling_x), ctx->a + num_4x4_blocks_wide * p, (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> xd->plane[p].subsampling_x); - memcpy(xd->left_context[p] + - ((mi_row & MAX_MIB_MASK) * 2 >> xd->plane[p].subsampling_y), + memcpy(xd->left_context[p] + (tx_row >> xd->plane[p].subsampling_y), ctx->l + num_4x4_blocks_high * p, (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> xd->plane[p].subsampling_y); @@ -2391,9 +1792,9 @@ static void restore_context(MACROBLOCK *x, xd->above_txfm_context = ctx->p_ta; xd->left_txfm_context = ctx->p_tl; memcpy(xd->above_txfm_context, ctx->ta, - sizeof(*xd->above_txfm_context) * mi_width); + sizeof(*xd->above_txfm_context) * (mi_width << TX_UNIT_WIDE_LOG2)); memcpy(xd->left_txfm_context, ctx->tl, - sizeof(*xd->left_txfm_context) * mi_height); + sizeof(*xd->left_txfm_context) * (mi_height << TX_UNIT_HIGH_LOG2)); #endif #if CONFIG_PVQ od_encode_rollback(&x->daala_enc, rdo_buf); @@ -2417,13 +1818,16 @@ static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, // buffer the above/left context information of the block in search. for (p = 0; p < MAX_MB_PLANE; ++p) { + int tx_col; + int tx_row; + tx_col = mi_col << (MI_SIZE_LOG2 - tx_size_wide_log2[0]); + tx_row = (mi_row & MAX_MIB_MASK) << (MI_SIZE_LOG2 - tx_size_high_log2[0]); memcpy(ctx->a + num_4x4_blocks_wide * p, - xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x), + xd->above_context[p] + (tx_col >> xd->plane[p].subsampling_x), (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> xd->plane[p].subsampling_x); memcpy(ctx->l + num_4x4_blocks_high * p, - xd->left_context[p] + - ((mi_row & MAX_MIB_MASK) * 2 >> xd->plane[p].subsampling_y), + xd->left_context[p] + (tx_row >> xd->plane[p].subsampling_y), (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> xd->plane[p].subsampling_y); } @@ -2433,9 +1837,9 @@ static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, sizeof(xd->left_seg_context[0]) * mi_height); #if CONFIG_VAR_TX memcpy(ctx->ta, xd->above_txfm_context, - sizeof(*xd->above_txfm_context) * mi_width); + sizeof(*xd->above_txfm_context) * (mi_width << TX_UNIT_WIDE_LOG2)); memcpy(ctx->tl, xd->left_txfm_context, - sizeof(*xd->left_txfm_context) * mi_height); + sizeof(*xd->left_txfm_context) * (mi_height << TX_UNIT_HIGH_LOG2)); ctx->p_ta = xd->above_txfm_context; ctx->p_tl = xd->left_txfm_context; #endif @@ -2479,7 +1883,7 @@ static void encode_b(const AV1_COMP *const cpi, const TileInfo *const tile, get_frame_new_buffer(&cpi->common), mi_row, mi_col); } #endif - encode_superblock(cpi, td, tp, dry_run, mi_row, mi_col, bsize, ctx, rate); + encode_superblock(cpi, td, tp, dry_run, mi_row, mi_col, bsize, rate); if (!dry_run) { #if CONFIG_EXT_DELTA_Q @@ -2563,12 +1967,13 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td, if (!x->skip) { int this_rate = 0; av1_encode_sb_supertx((AV1_COMMON *)cm, x, bsize); - av1_tokenize_sb_supertx(cpi, td, 
tp, dry_run, bsize, rate); + av1_tokenize_sb_supertx(cpi, td, tp, dry_run, mi_row, mi_col, bsize, + rate); if (rate) *rate += this_rate; } else { xd->mi[0]->mbmi.skip = 1; if (!dry_run) td->counts->skip[av1_get_skip_context(xd)][1]++; - reset_skip_context(xd, bsize); + av1_reset_skip_context(xd, mi_row, mi_col, bsize); } if (!dry_run) { for (y_idx = 0; y_idx < mi_height; y_idx++) @@ -2849,9 +2254,10 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, pc_tree->partitioning = partition; #if CONFIG_VAR_TX - xd->above_txfm_context = cm->above_txfm_context + mi_col; - xd->left_txfm_context = - xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + xd->above_txfm_context = + cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); + xd->left_txfm_context = xd->left_txfm_context_buffer + + ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); #endif #if !CONFIG_PVQ save_context(x, &x_ctx, mi_row, mi_col, bsize); @@ -2943,7 +2349,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, av1_init_rd_stats(&tmp_rdc); update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1); encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, - ctx_h, NULL); + NULL); rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc, #if CONFIG_SUPERTX &rt_nocoef, @@ -2986,7 +2392,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, av1_init_rd_stats(&tmp_rdc); update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1); encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, - ctx_v, NULL); + NULL); rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc, #if CONFIG_SUPERTX &rt_nocoef, @@ -3566,7 +2972,7 @@ static void rd_test_partition3( PICK_MODE_CONTEXT *ctx_0 = &ctxs[0]; update_state(cpi, td, ctx_0, mi_row0, mi_col0, subsize0, 1); encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row0, mi_col0, subsize0, - ctx_0, NULL); + NULL); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_0); @@ -3607,7 +3013,7 @@ static void rd_test_partition3( PICK_MODE_CONTEXT *ctx_1 = &ctxs[1]; update_state(cpi, td, ctx_1, mi_row1, mi_col1, subsize1, 1); encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row1, mi_col1, subsize1, - ctx_1, NULL); + NULL); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_1); @@ -3865,9 +3271,10 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, } #if CONFIG_VAR_TX - xd->above_txfm_context = cm->above_txfm_context + mi_col; - xd->left_txfm_context = - xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + xd->above_txfm_context = + cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); + xd->left_txfm_context = xd->left_txfm_context_buffer + + ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); #endif #if !CONFIG_PVQ save_context(x, &x_ctx, mi_row, mi_col, bsize); @@ -4157,9 +3564,29 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, #if CONFIG_SUPERTX sum_rate_nocoef += this_rate_nocoef; #endif // CONFIG_SUPERTX +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + sum_rdc.dist_y += this_rdc.dist_y; +#endif } } reached_last_index = (idx == 4); + +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (reached_last_index && sum_rdc.rdcost != INT64_MAX && + bsize == BLOCK_8X8) { + int use_activity_masking = 0; + int64_t daala_dist; + const int src_stride = x->plane[0].src.stride; + daala_dist = av1_daala_dist(x->plane[0].src.buf - 4 * src_stride - 4, + src_stride, x->decoded_8x8, 8, 8, 8, 1, + use_activity_masking, x->qindex) + << 4; + sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + daala_dist; + 
sum_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + } +#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4 + #if CONFIG_SUPERTX if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && reached_last_index) { TX_SIZE supertx_size = max_txsize_lookup[bsize]; @@ -4267,7 +3694,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0]; update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1); encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, - ctx_h, NULL); + NULL); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h); @@ -4297,6 +3724,16 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, subsize, &pc_tree->horizontal[1], best_rdc.rdcost - sum_rdc.rdcost); #endif // CONFIG_SUPERTX + +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) { + update_state(cpi, td, &pc_tree->horizontal[1], mi_row + mi_step, mi_col, + subsize, DRY_RUN_NORMAL); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row + mi_step, mi_col, + subsize, NULL); + } +#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; #if CONFIG_SUPERTX @@ -4309,7 +3746,24 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, #if CONFIG_SUPERTX sum_rate_nocoef += this_rate_nocoef; #endif // CONFIG_SUPERTX +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + sum_rdc.dist_y += this_rdc.dist_y; +#endif } +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { + int use_activity_masking = 0; + int64_t daala_dist; + const int src_stride = x->plane[0].src.stride; + daala_dist = av1_daala_dist(x->plane[0].src.buf - 4 * src_stride, + src_stride, x->decoded_8x8, 8, 8, 8, 1, + use_activity_masking, x->qindex) + << 4; + sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + daala_dist; + sum_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + } +#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4 } #if CONFIG_SUPERTX @@ -4413,7 +3867,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, !force_vert_split && (bsize > BLOCK_8X8 || unify_bsize)) { update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1); encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, - &pc_tree->vertical[0], NULL); + NULL); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); @@ -4444,6 +3898,16 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, subsize, &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost); #endif // CONFIG_SUPERTX + +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) { + update_state(cpi, td, &pc_tree->vertical[1], mi_row, mi_col + mi_step, + subsize, DRY_RUN_NORMAL); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col + mi_step, + subsize, NULL); + } +#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; #if CONFIG_SUPERTX @@ -4456,7 +3920,24 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, #if CONFIG_SUPERTX sum_rate_nocoef += this_rate_nocoef; #endif // CONFIG_SUPERTX +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + sum_rdc.dist_y += this_rdc.dist_y; +#endif } +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { + int use_activity_masking = 0; + int64_t daala_dist; + const int src_stride = x->plane[0].src.stride; + 
daala_dist = + av1_daala_dist(x->plane[0].src.buf - 4, src_stride, x->decoded_8x8, + 8, 8, 8, 1, use_activity_masking, x->qindex) + << 4; + sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + daala_dist; + sum_rdc.rdcost = + RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + } +#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4 } #if CONFIG_SUPERTX if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) { @@ -4612,6 +4093,14 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, x->cfl_store_y = 0; #endif +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && + bsize == BLOCK_4X4 && pc_tree->index == 3) { + encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } +#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (bsize == cm->sb_size) { #if !CONFIG_PVQ && !CONFIG_LV_MAP assert(tp_orig < *tp || (tp_orig == *tp && xd->mi[0]->mbmi.skip)); @@ -4762,14 +4251,6 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, &dummy_rate_nocoef, #endif // CONFIG_SUPERTX 1, pc_root); - } else if (sf->partition_search_type == VAR_BASED_PARTITION) { - choose_partitioning(cpi, td, tile_info, x, mi_row, mi_col); - rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size, - &dummy_rate, &dummy_dist, -#if CONFIG_SUPERTX - &dummy_rate_nocoef, -#endif // CONFIG_SUPERTX - 1, pc_root); } else { // If required set upper and lower partition size limits if (sf->auto_min_max_partition_size) { @@ -4785,32 +4266,6 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, INT64_MAX, pc_root); } } -#if CONFIG_SUBFRAME_PROB_UPDATE - if (cm->do_subframe_update && - cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { - const int mi_rows_per_update = - MI_SIZE * AOMMAX(cm->mi_rows / MI_SIZE / COEF_PROBS_BUFS, 1); - if ((mi_row + MI_SIZE) % mi_rows_per_update == 0 && - mi_row + MI_SIZE < cm->mi_rows && - cm->coef_probs_update_idx < COEF_PROBS_BUFS - 1) { - TX_SIZE t; - SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats; - - for (t = 0; t < TX_SIZES; ++t) - av1_full_to_model_counts(cpi->td.counts->coef[t], - cpi->td.rd_counts.coef_counts[t]); - av1_partial_adapt_probs(cm, mi_row, mi_col); - ++cm->coef_probs_update_idx; - av1_copy(subframe_stats->coef_probs_buf[cm->coef_probs_update_idx], - cm->fc->coef_probs); - av1_copy(subframe_stats->coef_counts_buf[cm->coef_probs_update_idx], - cpi->td.rd_counts.coef_counts); - av1_copy(subframe_stats->eob_counts_buf[cm->coef_probs_update_idx], - cm->counts.eob_branch); - av1_fill_token_costs(x->token_costs, cm->fc->coef_probs); - } - } -#endif // CONFIG_SUBFRAME_PROB_UPDATE } static void init_encode_frame_mb_context(AV1_COMP *cpi) { @@ -5041,16 +4496,11 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, } } -#if CONFIG_DAALA_EC +#if !CONFIG_ANS od_ec_enc_init(&td->mb.daala_enc.w.ec, 65025); -#else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." -#endif - -#if CONFIG_DAALA_EC od_ec_enc_reset(&td->mb.daala_enc.w.ec); #else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#error "CONFIG_PVQ currently requires !CONFIG_ANS." 
#endif #endif // #if CONFIG_PVQ @@ -5079,10 +4529,10 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]); assert(cpi->tok_count[tile_row][tile_col] <= allocated_tokens(*tile_info)); #if CONFIG_PVQ -#if CONFIG_DAALA_EC +#if !CONFIG_ANS od_ec_enc_clear(&td->mb.daala_enc.w.ec); #else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#error "CONFIG_PVQ currently requires !CONFIG_ANS." #endif td->mb.pvq_q->last_pos = td->mb.pvq_q->curr_pos; @@ -5186,6 +4636,24 @@ static int gm_get_params_cost(WarpedMotionParams *gm, } return (params_cost << AV1_PROB_COST_SHIFT); } + +static int do_gm_search_logic(SPEED_FEATURES *const sf, int num_refs_using_gm, + int frame) { + (void)num_refs_using_gm; + (void)frame; + switch (sf->gm_search_type) { + case GM_FULL_SEARCH: return 1; + case GM_REDUCED_REF_SEARCH: +#if CONFIG_EXT_REFS + return !(frame == LAST2_FRAME || frame == LAST3_FRAME); +#else + return (num_refs_using_gm < 2); +#endif // CONFIG_EXT_REFS + case GM_DISABLE_SEARCH: return 0; + default: assert(0); + } + return 1; +} #endif // CONFIG_GLOBAL_MOTION static void encode_frame_internal(AV1_COMP *cpi) { @@ -5205,9 +4673,7 @@ static void encode_frame_internal(AV1_COMP *cpi) { x->min_partition_size = AOMMIN(x->min_partition_size, cm->sb_size); x->max_partition_size = AOMMIN(x->max_partition_size, cm->sb_size); -#if CONFIG_REF_MV cm->setup_mi(cm); -#endif xd->mi = cm->mi_grid_visible; xd->mi[0] = cm->mi; @@ -5218,27 +4684,46 @@ static void encode_frame_internal(AV1_COMP *cpi) { #if CONFIG_GLOBAL_MOTION av1_zero(rdc->global_motion_used); + av1_zero(cpi->gmparams_cost); if (cpi->common.frame_type == INTER_FRAME && cpi->source && !cpi->global_motion_search_done) { - YV12_BUFFER_CONFIG *ref_buf; + YV12_BUFFER_CONFIG *ref_buf[TOTAL_REFS_PER_FRAME]; int frame; double params_by_motion[RANSAC_NUM_MOTIONS * (MAX_PARAMDIM - 1)]; const double *params_this_motion; int inliers_by_motion[RANSAC_NUM_MOTIONS]; WarpedMotionParams tmp_wm_params; - static const double kInfiniteErrAdv = 1e12; static const double kIdentityParams[MAX_PARAMDIM - 1] = { 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0 }; + int num_refs_using_gm = 0; for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { - ref_buf = get_ref_frame_buffer(cpi, frame); - if (ref_buf) { + ref_buf[frame] = get_ref_frame_buffer(cpi, frame); + int pframe; + // check for duplicate buffer + for (pframe = LAST_FRAME; pframe < frame; ++pframe) { + if (ref_buf[frame] == ref_buf[pframe]) break; + } + if (pframe < frame) { + memcpy(&cm->global_motion[frame], &cm->global_motion[pframe], + sizeof(WarpedMotionParams)); + } else if (ref_buf[frame] && + do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame)) { TransformationType model; + const int64_t ref_frame_error = av1_frame_error( +#if CONFIG_HIGHBITDEPTH + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, +#endif // CONFIG_HIGHBITDEPTH + ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride, + cpi->source->y_buffer, 0, 0, cpi->source->y_width, + cpi->source->y_height, cpi->source->y_stride); + + if (ref_frame_error == 0) continue; + aom_clear_system_state(); for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) { - double best_erroradvantage = kInfiniteErrAdv; - + int64_t best_warp_error = INT64_MAX; // Initially set all params to identity. 
for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) { memcpy(params_by_motion + (MAX_PARAMDIM - 1) * i, kIdentityParams, @@ -5246,7 +4731,7 @@ static void encode_frame_internal(AV1_COMP *cpi) { } compute_global_motion_feature_based( - model, cpi->source, ref_buf, + model, cpi->source, ref_buf[frame], #if CONFIG_HIGHBITDEPTH cpi->common.bit_depth, #endif // CONFIG_HIGHBITDEPTH @@ -5259,17 +4744,17 @@ static void encode_frame_internal(AV1_COMP *cpi) { convert_model_to_params(params_this_motion, &tmp_wm_params); if (tmp_wm_params.wmtype != IDENTITY) { - const double erroradv_this_motion = refine_integerized_param( + const int64_t warp_error = refine_integerized_param( &tmp_wm_params, tmp_wm_params.wmtype, #if CONFIG_HIGHBITDEPTH xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, #endif // CONFIG_HIGHBITDEPTH - ref_buf->y_buffer, ref_buf->y_width, ref_buf->y_height, - ref_buf->y_stride, cpi->source->y_buffer, - cpi->source->y_width, cpi->source->y_height, - cpi->source->y_stride, 3); - if (erroradv_this_motion < best_erroradvantage) { - best_erroradvantage = erroradv_this_motion; + ref_buf[frame]->y_buffer, ref_buf[frame]->y_width, + ref_buf[frame]->y_height, ref_buf[frame]->y_stride, + cpi->source->y_buffer, cpi->source->y_width, + cpi->source->y_height, cpi->source->y_stride, 3); + if (warp_error < best_warp_error) { + best_warp_error = warp_error; // Save the wm_params modified by refine_integerized_param() // rather than motion index to avoid rerunning refine() below. memcpy(&(cm->global_motion[frame]), &tmp_wm_params, @@ -5295,17 +4780,17 @@ static void encode_frame_internal(AV1_COMP *cpi) { // If the best error advantage found doesn't meet the threshold for // this motion type, revert to IDENTITY. if (!is_enough_erroradvantage( - best_erroradvantage, + (double)best_warp_error / ref_frame_error, gm_get_params_cost(&cm->global_motion[frame], &cm->prev_frame->global_motion[frame], cm->allow_high_precision_mv))) { set_default_warp_params(&cm->global_motion[frame]); } - if (cm->global_motion[frame].wmtype != IDENTITY) break; } aom_clear_system_state(); } + if (cm->global_motion[frame].wmtype != IDENTITY) num_refs_using_gm++; cpi->gmparams_cost[frame] = gm_get_params_cost(&cm->global_motion[frame], &cm->prev_frame->global_motion[frame], @@ -5352,21 +4837,8 @@ static void encode_frame_internal(AV1_COMP *cpi) { av1_initialize_rd_consts(cpi); av1_initialize_me_consts(cpi, x, cm->base_qindex); init_encode_frame_mb_context(cpi); -#if CONFIG_TEMPMV_SIGNALING - if (last_fb_buf_idx != INVALID_IDX) { - cm->prev_frame = &cm->buffer_pool->frame_bufs[last_fb_buf_idx]; - cm->use_prev_frame_mvs &= !cm->error_resilient_mode && - cm->width == cm->prev_frame->buf.y_width && - cm->height == cm->prev_frame->buf.y_height && - !cm->intra_only && !cm->prev_frame->intra_only; - } -#else - cm->use_prev_frame_mvs = - !cm->error_resilient_mode && cm->width == cm->last_width && - cm->height == cm->last_height && !cm->intra_only && cm->last_show_frame; -#endif -#if CONFIG_EXT_REFS +#if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING // NOTE(zoeliu): As cm->prev_frame can take neither a frame of // show_exisiting_frame=1, nor can it take a frame not used as // a reference, it is probable that by the time it is being @@ -5377,11 +4849,29 @@ static void encode_frame_internal(AV1_COMP *cpi) { // (1) Simply disable the use of previous frame mvs; or // (2) Have cm->prev_frame point to one reference frame buffer, // e.g. LAST_FRAME. 
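
/*
 * The global-motion hunk above moves from a floating-point error-advantage
 * metric to integer warp error: every candidate model is refined, the lowest
 * warp error wins, and the winner is kept only if warp_error/ref_frame_error
 * clears the error-advantage check (duplicate reference buffers reuse the
 * parameters already computed, and do_gm_search_logic() may skip a reference
 * altogether). A condensed sketch of that selection follows; the struct, the
 * helper name and the bare threshold are simplified stand-ins, not the
 * encoder's actual API.
 */
#include <stdint.h>

typedef struct {
  int wmtype;          /* 0 stands in for IDENTITY */
  int64_t warp_error;  /* error returned by refine_integerized_param() */
} gm_candidate_sketch;

/* Returns the index of the accepted model, or -1 to keep IDENTITY. */
static int select_global_motion_sketch(const gm_candidate_sketch *cand,
                                       int num_candidates,
                                       int64_t ref_frame_error,
                                       double erroradv_threshold) {
  int64_t best_warp_error = INT64_MAX;
  int best = -1;
  if (ref_frame_error == 0) return -1; /* mirrors the early 'continue' above */
  for (int m = 0; m < num_candidates; ++m) {
    if (cand[m].wmtype != 0 && cand[m].warp_error < best_warp_error) {
      best_warp_error = cand[m].warp_error;
      best = m;
    }
  }
  /* Keep the warp only if it beats the unwarped reference by enough; the
   * real is_enough_erroradvantage() also folds in the parameter cost. */
  if (best >= 0 &&
      (double)best_warp_error / (double)ref_frame_error < erroradv_threshold)
    return best;
  return -1;
}
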
- if (cm->use_prev_frame_mvs && !enc_is_ref_frame_buf(cpi, cm->prev_frame)) { + if (!enc_is_ref_frame_buf(cpi, cm->prev_frame)) { // Reassign the LAST_FRAME buffer to cm->prev_frame. - cm->prev_frame = &cm->buffer_pool->frame_bufs[last_fb_buf_idx]; + cm->prev_frame = last_fb_buf_idx != INVALID_IDX + ? &cm->buffer_pool->frame_bufs[last_fb_buf_idx] + : NULL; } -#endif // CONFIG_EXT_REFS +#endif // CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING + +#if CONFIG_TEMPMV_SIGNALING + if (cm->prev_frame) { + cm->use_prev_frame_mvs &= !cm->error_resilient_mode && + cm->width == cm->prev_frame->buf.y_width && + cm->height == cm->prev_frame->buf.y_height && + !cm->intra_only && !cm->prev_frame->intra_only; + } else { + cm->use_prev_frame_mvs = 0; + } +#else + cm->use_prev_frame_mvs = !cm->error_resilient_mode && cm->prev_frame && + cm->width == cm->prev_frame->buf.y_crop_width && + cm->height == cm->prev_frame->buf.y_crop_height && + !cm->intra_only && cm->last_show_frame; +#endif // CONFIG_TEMPMV_SIGNALING // Special case: set prev_mi to NULL when the previous mode info // context cannot be used. @@ -5390,14 +4880,8 @@ static void encode_frame_internal(AV1_COMP *cpi) { #if CONFIG_VAR_TX x->txb_split_count = 0; -#if CONFIG_REF_MV av1_zero(x->blk_skip_drl); #endif -#endif - - if (cpi->sf.partition_search_type == VAR_BASED_PARTITION && - cpi->td.var_root[0] == NULL) - av1_setup_var_tree(&cpi->common, &cpi->td); { struct aom_usec_timer emr_timer; @@ -5429,6 +4913,20 @@ static void encode_frame_internal(AV1_COMP *cpi) { #endif } +#if CONFIG_EXT_INTER +static void make_consistent_compound_tools(AV1_COMMON *cm) { + (void)cm; +#if CONFIG_INTERINTRA + if (frame_is_intra_only(cm) || cm->reference_mode == COMPOUND_REFERENCE) + cm->allow_interintra_compound = 0; +#endif // CONFIG_INTERINTRA +#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE + if (frame_is_intra_only(cm) || cm->reference_mode == SINGLE_REFERENCE) + cm->allow_masked_compound = 0; +#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE +} +#endif // CONFIG_EXT_INTER + void av1_encode_frame(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; #if CONFIG_EXT_TX @@ -5444,29 +4942,15 @@ void av1_encode_frame(AV1_COMP *cpi) { // side behavior is where the ALT ref buffer has opposite sign bias to // the other two. 
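
/*
 * The CONFIG_TEMPMV_SIGNALING hunk above first repoints cm->prev_frame at the
 * LAST_FRAME buffer (or clears it) when the current prev_frame is not a valid
 * reference, and only then decides whether previous-frame MVs may be used.
 * The resulting condition, pulled out into a small sketch with simplified
 * stand-in fields (not the real AV1_COMMON layout):
 */
struct prev_frame_sketch {
  int y_width, y_height;
  int intra_only;
};

static int can_use_prev_frame_mvs_sketch(const struct prev_frame_sketch *prev,
                                         int cur_width, int cur_height,
                                         int error_resilient,
                                         int cur_intra_only) {
  if (!prev) return 0; /* no usable previous frame buffer */
  return !error_resilient && !cur_intra_only && !prev->intra_only &&
         cur_width == prev->y_width && cur_height == prev->y_height;
}
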
if (!frame_is_intra_only(cm)) { -#if CONFIG_LOWDELAY_COMPOUND // Normative in encoder - cpi->allow_comp_inter_inter = 1; -#if CONFIG_EXT_REFS - cm->comp_fwd_ref[0] = LAST_FRAME; - cm->comp_fwd_ref[1] = LAST2_FRAME; - cm->comp_fwd_ref[2] = LAST3_FRAME; - cm->comp_fwd_ref[3] = GOLDEN_FRAME; - cm->comp_bwd_ref[0] = BWDREF_FRAME; - cm->comp_bwd_ref[1] = ALTREF_FRAME; -#else - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; -#endif // CONFIG_EXT_REFS -#else +#if !CONFIG_ONE_SIDED_COMPOUND if ((cm->ref_frame_sign_bias[ALTREF_FRAME] == cm->ref_frame_sign_bias[GOLDEN_FRAME]) || (cm->ref_frame_sign_bias[ALTREF_FRAME] == cm->ref_frame_sign_bias[LAST_FRAME])) { cpi->allow_comp_inter_inter = 0; } else { +#endif cpi->allow_comp_inter_inter = 1; - #if CONFIG_EXT_REFS cm->comp_fwd_ref[0] = LAST_FRAME; cm->comp_fwd_ref[1] = LAST2_FRAME; @@ -5475,10 +4959,11 @@ void av1_encode_frame(AV1_COMP *cpi) { cm->comp_bwd_ref[0] = BWDREF_FRAME; cm->comp_bwd_ref[1] = ALTREF_FRAME; #else - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; -#endif // CONFIG_EXT_REFS + cm->comp_fixed_ref = ALTREF_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = GOLDEN_FRAME; +#endif // CONFIG_EXT_REFS +#if !CONFIG_ONE_SIDED_COMPOUND // Normative in encoder } #endif } else { @@ -5529,6 +5014,9 @@ void av1_encode_frame(AV1_COMP *cpi) { cm->interp_filter = SWITCHABLE; #endif +#if CONFIG_EXT_INTER + make_consistent_compound_tools(cm); +#endif // CONFIG_EXT_INTER encode_frame_internal(cpi); for (i = 0; i < REFERENCE_MODES; ++i) @@ -5553,12 +5041,19 @@ void av1_encode_frame(AV1_COMP *cpi) { #endif // !CONFIG_REF_ADAPT } } +#if CONFIG_EXT_INTER + make_consistent_compound_tools(cm); +#endif // CONFIG_EXT_INTER #if CONFIG_VAR_TX if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0) cm->tx_mode = ALLOW_32X32 + CONFIG_TX64X64; #else +#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT + if (cm->tx_mode == TX_MODE_SELECT && counts->quarter_tx_size[1] == 0) { +#else if (cm->tx_mode == TX_MODE_SELECT) { +#endif #if CONFIG_TX64X64 int count4x4 = 0; int count8x8_8x8p = 0, count8x8_lp = 0; @@ -5566,41 +5061,50 @@ void av1_encode_frame(AV1_COMP *cpi) { int count32x32_32x32p = 0, count32x32_lp = 0; int count64x64_64x64p = 0; for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + int depth; // counts->tx_size[max_depth][context_idx][this_depth_level] - count4x4 += counts->tx_size[0][i][0]; - count4x4 += counts->tx_size[1][i][0]; - count4x4 += counts->tx_size[2][i][0]; - count4x4 += counts->tx_size[3][i][0]; - - count8x8_8x8p += counts->tx_size[0][i][1]; - count8x8_lp += counts->tx_size[1][i][1]; - count8x8_lp += counts->tx_size[2][i][1]; - count8x8_lp += counts->tx_size[3][i][1]; - - count16x16_16x16p += counts->tx_size[1][i][2]; - count16x16_lp += counts->tx_size[2][i][2]; - count16x16_lp += counts->tx_size[3][i][2]; - - count32x32_32x32p += counts->tx_size[2][i][3]; - count32x32_lp += counts->tx_size[3][i][3]; - - count64x64_64x64p += counts->tx_size[3][i][4]; + depth = tx_size_to_depth(TX_4X4); + count4x4 += counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth]; + count4x4 += counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; + count4x4 += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; + count4x4 += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth]; + + depth = tx_size_to_depth(TX_8X8); + count8x8_8x8p += counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth]; + count8x8_lp += 
counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; + count8x8_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; + count8x8_lp += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth]; + + depth = tx_size_to_depth(TX_16X16); + count16x16_16x16p += + counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; + count16x16_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; + count16x16_lp += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth]; + + depth = tx_size_to_depth(TX_32X32); + count32x32_32x32p += + counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; + count32x32_lp += counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth]; + + depth = tx_size_to_depth(TX_64X64); + count64x64_64x64p += + counts->tx_size[TX_64X64 - TX_SIZE_CTX_MIN][i][depth]; } #if CONFIG_EXT_TX && CONFIG_RECT_TX - count4x4 += counts->tx_size_implied[0][TX_4X4]; - count4x4 += counts->tx_size_implied[1][TX_4X4]; - count4x4 += counts->tx_size_implied[2][TX_4X4]; - count4x4 += counts->tx_size_implied[3][TX_4X4]; - count8x8_8x8p += counts->tx_size_implied[1][TX_8X8]; - count8x8_lp += counts->tx_size_implied[2][TX_8X8]; - count8x8_lp += counts->tx_size_implied[3][TX_8X8]; - count8x8_lp += counts->tx_size_implied[4][TX_8X8]; - count16x16_16x16p += counts->tx_size_implied[2][TX_16X16]; - count16x16_lp += counts->tx_size_implied[3][TX_16X16]; - count16x16_lp += counts->tx_size_implied[4][TX_16X16]; - count32x32_32x32p += counts->tx_size_implied[3][TX_32X32]; - count32x32_lp += counts->tx_size_implied[4][TX_32X32]; - count64x64_64x64p += counts->tx_size_implied[4][TX_64X64]; + count4x4 += counts->tx_size_implied[TX_4X4][TX_4X4]; + count4x4 += counts->tx_size_implied[TX_8X8][TX_4X4]; + count4x4 += counts->tx_size_implied[TX_16X16][TX_4X4]; + count4x4 += counts->tx_size_implied[TX_32X32][TX_4X4]; + count8x8_8x8p += counts->tx_size_implied[TX_8X8][TX_8X8]; + count8x8_lp += counts->tx_size_implied[TX_16X16][TX_8X8]; + count8x8_lp += counts->tx_size_implied[TX_32X32][TX_8X8]; + count8x8_lp += counts->tx_size_implied[TX_64X64][TX_8X8]; + count16x16_16x16p += counts->tx_size_implied[TX_16X16][TX_16X16]; + count16x16_lp += counts->tx_size_implied[TX_32X32][TX_16X16]; + count16x16_lp += counts->tx_size_implied[TX_64X64][TX_16X16]; + count32x32_32x32p += counts->tx_size_implied[TX_32X32][TX_32X32]; + count32x32_lp += counts->tx_size_implied[TX_64X64][TX_32X32]; + count64x64_64x64p += counts->tx_size_implied[TX_64X64][TX_64X64]; #endif // CONFIG_EXT_TX && CONFIG_RECT_TX if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 && count32x32_lp == 0 && count32x32_32x32p == 0 && @@ -5652,30 +5156,37 @@ void av1_encode_frame(AV1_COMP *cpi) { int count16x16_16x16p = 0, count16x16_lp = 0; int count32x32 = 0; for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + int depth; // counts->tx_size[max_depth][context_idx][this_depth_level] - count4x4 += counts->tx_size[0][i][0]; - count4x4 += counts->tx_size[1][i][0]; - count4x4 += counts->tx_size[2][i][0]; - - count8x8_8x8p += counts->tx_size[0][i][1]; - count8x8_lp += counts->tx_size[1][i][1]; - count8x8_lp += counts->tx_size[2][i][1]; - - count16x16_16x16p += counts->tx_size[1][i][2]; - count16x16_lp += counts->tx_size[2][i][2]; - count32x32 += counts->tx_size[2][i][3]; + depth = tx_size_to_depth(TX_4X4); + count4x4 += counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth]; + count4x4 += counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; + count4x4 += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; + + depth = tx_size_to_depth(TX_8X8); + count8x8_8x8p += 
counts->tx_size[TX_8X8 - TX_SIZE_CTX_MIN][i][depth]; + count8x8_lp += counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; + count8x8_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; + + depth = tx_size_to_depth(TX_16X16); + count16x16_16x16p += + counts->tx_size[TX_16X16 - TX_SIZE_CTX_MIN][i][depth]; + count16x16_lp += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; + + depth = tx_size_to_depth(TX_32X32); + count32x32 += counts->tx_size[TX_32X32 - TX_SIZE_CTX_MIN][i][depth]; } #if CONFIG_EXT_TX && CONFIG_RECT_TX - count4x4 += counts->tx_size_implied[0][TX_4X4]; - count4x4 += counts->tx_size_implied[1][TX_4X4]; - count4x4 += counts->tx_size_implied[2][TX_4X4]; - count4x4 += counts->tx_size_implied[3][TX_4X4]; - count8x8_8x8p += counts->tx_size_implied[1][TX_8X8]; - count8x8_lp += counts->tx_size_implied[2][TX_8X8]; - count8x8_lp += counts->tx_size_implied[3][TX_8X8]; - count16x16_lp += counts->tx_size_implied[3][TX_16X16]; - count16x16_16x16p += counts->tx_size_implied[2][TX_16X16]; - count32x32 += counts->tx_size_implied[3][TX_32X32]; + count4x4 += counts->tx_size_implied[TX_4X4][TX_4X4]; + count4x4 += counts->tx_size_implied[TX_8X8][TX_4X4]; + count4x4 += counts->tx_size_implied[TX_16X16][TX_4X4]; + count4x4 += counts->tx_size_implied[TX_32X32][TX_4X4]; + count8x8_8x8p += counts->tx_size_implied[TX_8X8][TX_8X8]; + count8x8_lp += counts->tx_size_implied[TX_16X16][TX_8X8]; + count8x8_lp += counts->tx_size_implied[TX_32X32][TX_8X8]; + count16x16_16x16p += counts->tx_size_implied[TX_16X16][TX_16X16]; + count16x16_lp += counts->tx_size_implied[TX_32X32][TX_16X16]; + count32x32 += counts->tx_size_implied[TX_32X32][TX_32X32]; #endif // CONFIG_EXT_TX && CONFIG_RECT_TX if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 && #if CONFIG_SUPERTX @@ -5709,6 +5220,9 @@ void av1_encode_frame(AV1_COMP *cpi) { } #endif } else { +#if CONFIG_EXT_INTER + make_consistent_compound_tools(cm); +#endif // CONFIG_EXT_INTER encode_frame_internal(cpi); } } @@ -5758,6 +5272,11 @@ static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd, ++counts->filter_intra[0][use_filter_intra_mode]; } if (mbmi->uv_mode == DC_PRED +#if CONFIG_CB4X4 + && + is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y) +#endif #if CONFIG_PALETTE && mbmi->palette_mode_info.palette_size[1] == 0 #endif // CONFIG_PALETTE @@ -5799,8 +5318,8 @@ static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd, const int tx_col = blk_col >> 1; const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0); - int ctx = txfm_partition_context(xd->above_txfm_context + tx_col, - xd->left_txfm_context + tx_row, + int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, mbmi->sb_type, tx_size); const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col]; @@ -5809,8 +5328,8 @@ static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd, if (tx_size == plane_tx_size) { ++counts->txfm_partition[ctx][0]; mbmi->tx_size = tx_size; - txfm_partition_update(xd->above_txfm_context + tx_col, - xd->left_txfm_context + tx_row, tx_size, tx_size); + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bs = tx_size_wide_unit[sub_txs]; @@ -5822,8 +5341,8 @@ static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd, if (tx_size == TX_8X8) { 
mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4; mbmi->tx_size = TX_4X4; - txfm_partition_update(xd->above_txfm_context + tx_col, - xd->left_txfm_context + tx_row, TX_4X4, tx_size); + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, TX_4X4, tx_size); return; } @@ -5847,9 +5366,10 @@ static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x, const int bw = tx_size_wide_unit[max_tx_size]; int idx, idy; - xd->above_txfm_context = cm->above_txfm_context + mi_col; - xd->left_txfm_context = - xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + xd->above_txfm_context = + cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); + xd->left_txfm_context = xd->left_txfm_context_buffer + + ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); for (idy = 0; idy < mi_height; idy += bh) for (idx = 0; idx < mi_width; idx += bw) @@ -5870,8 +5390,8 @@ static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row, if (tx_size == plane_tx_size) { mbmi->tx_size = tx_size; - txfm_partition_update(xd->above_txfm_context + tx_col, - xd->left_txfm_context + tx_row, tx_size, tx_size); + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; @@ -5881,8 +5401,8 @@ static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row, if (tx_size == TX_8X8) { mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4; mbmi->tx_size = TX_4X4; - txfm_partition_update(xd->above_txfm_context + tx_col, - xd->left_txfm_context + tx_row, TX_4X4, tx_size); + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, TX_4X4, tx_size); return; } @@ -5905,9 +5425,10 @@ static void tx_partition_set_contexts(const AV1_COMMON *const cm, const int bw = tx_size_wide_unit[max_tx_size]; int idx, idy; - xd->above_txfm_context = cm->above_txfm_context + mi_col; - xd->left_txfm_context = - xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + xd->above_txfm_context = + cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2); + xd->left_txfm_context = xd->left_txfm_context_buffer + + ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); for (idy = 0; idy < mi_height; idy += bh) for (idx = 0; idx < mi_width; idx += bw) @@ -5964,8 +5485,7 @@ void av1_update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd, static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, - int mi_col, BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx, int *rate) { + int mi_col, BLOCK_SIZE bsize, int *rate) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -6039,10 +5559,9 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, &xd->block_refs[ref]->sf); } - if (!(cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready) || seg_skip) - av1_build_inter_predictors_sby(xd, mi_row, mi_col, NULL, block_size); + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, block_size); - av1_build_inter_predictors_sbuv(xd, mi_row, mi_col, NULL, block_size); + av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, NULL, block_size); #if CONFIG_MOTION_VAR if (mbmi->motion_mode == OBMC_CAUSAL) { #if CONFIG_NCOBMC @@ -6068,6 +5587,13 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, #endif } +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) { + 
daala_dist_set_sub8x8_dst(x, x->decoded_8x8, bsize, block_size_wide[bsize], + block_size_high[bsize], mi_row, mi_col); + } +#endif + if (!dry_run) { #if CONFIG_VAR_TX TX_SIZE tx_size = @@ -6092,7 +5618,7 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; const int depth = tx_size_to_depth(coded_tx_size); ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth]; - if (tx_size != max_txsize_lookup[bsize]) ++x->txb_split_count; + if (tx_size != max_txsize_rect_lookup[bsize]) ++x->txb_split_count; } #else const int tx_size_ctx = get_tx_size_context(xd); @@ -6103,6 +5629,13 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth]; #endif +#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT + if (is_quarter_tx_allowed(xd, mbmi, is_inter) && + mbmi->tx_size != txsize_sqr_up_map[mbmi->tx_size]) { + ++td->counts->quarter_tx_size[mbmi->tx_size == + quarter_txsize_lookup[mbmi->sb_type]]; + } +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT #if CONFIG_EXT_TX && CONFIG_RECT_TX assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi))); #endif // CONFIG_EXT_TX && CONFIG_RECT_TX @@ -6135,7 +5668,7 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, #if CONFIG_VAR_TX mbmi->min_tx_size = get_min_tx_size(intra_tx_size); - if (intra_tx_size != max_txsize_lookup[bsize]) ++x->txb_split_count; + if (intra_tx_size != max_txsize_rect_lookup[bsize]) ++x->txb_split_count; #endif } @@ -6327,13 +5860,13 @@ static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td, } if (!b_sub8x8) - av1_build_inter_predictors_sb_extend(xd, + av1_build_inter_predictors_sb_extend(cm, xd, #if CONFIG_EXT_INTER mi_row_ori, mi_col_ori, #endif // CONFIG_EXT_INTER mi_row_pred, mi_col_pred, bsize_pred); else - av1_build_inter_predictors_sb_sub8x8_extend(xd, + av1_build_inter_predictors_sb_sub8x8_extend(cm, xd, #if CONFIG_EXT_INTER mi_row_ori, mi_col_ori, #endif // CONFIG_EXT_INTER diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h index 08d6d20de..46a99e1cf 100644 --- a/third_party/aom/av1/encoder/encodeframe.h +++ b/third_party/aom/av1/encoder/encodeframe.h @@ -25,13 +25,6 @@ struct yv12_buffer_config; struct AV1_COMP; struct ThreadData; -// Constants used in SOURCE_VAR_BASED_PARTITION -#define VAR_HIST_MAX_BG_VAR 1000 -#define VAR_HIST_FACTOR 10 -#define VAR_HIST_BINS (VAR_HIST_MAX_BG_VAR / VAR_HIST_FACTOR + 1) -#define VAR_HIST_LARGE_CUT_OFF 75 -#define VAR_HIST_SMALL_CUT_OFF 45 - void av1_setup_src_planes(struct macroblock *x, const struct yv12_buffer_config *src, int mi_row, int mi_col); @@ -42,8 +35,6 @@ void av1_init_tile_data(struct AV1_COMP *cpi); void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row, int tile_col); -void av1_set_variance_partition_thresholds(struct AV1_COMP *cpi, int q); - void av1_update_tx_type_count(const struct AV1Common *cm, MACROBLOCKD *xd, #if CONFIG_TXK_SEL int block, int plane, diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c index c450244b1..7c9781533 100644 --- a/third_party/aom/av1/encoder/encodemb.c +++ b/third_party/aom/av1/encoder/encodemb.c @@ -115,7 +115,7 @@ static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { #if CONFIG_EC_ADAPT { 10, 7 }, { 8, 5 }, #else - { 10, 6 }, { 8, 5 }, + { 10, 6 }, { 8, 6 }, #endif }; @@ -125,35 +125,31 @@ static const int 
plane_rd_mult[REF_TYPES][PLANE_TYPES] = { rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \ } -static INLINE int64_t -get_token_bit_costs(unsigned int token_costs[2][COEFF_CONTEXTS][ENTROPY_TOKENS], - int skip_eob, int ctx, int token) { -#if CONFIG_NEW_TOKENSET +static INLINE unsigned int get_token_bit_costs( + unsigned int token_costs[2][COEFF_CONTEXTS][ENTROPY_TOKENS], int skip_eob, + int ctx, int token) { (void)skip_eob; return token_costs[token == ZERO_TOKEN || token == EOB_TOKEN][ctx][token]; -#else - return token_costs[skip_eob][ctx][token]; -#endif } +#if !CONFIG_LV_MAP #define USE_GREEDY_OPTIMIZE_B 0 #if USE_GREEDY_OPTIMIZE_B -typedef struct av1_token_state { +typedef struct av1_token_state_greedy { int16_t token; tran_low_t qc; tran_low_t dqc; -} av1_token_state; +} av1_token_state_greedy; -int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, - TX_SIZE tx_size, int ctx) { -#if !CONFIG_PVQ +static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, + int block, TX_SIZE tx_size, int ctx) { MACROBLOCKD *const xd = &mb->e_mbd; struct macroblock_plane *const p = &mb->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; const int ref = is_inter_block(&xd->mi[0]->mbmi); - av1_token_state tokens[MAX_TX_SQUARE + 1][2]; + av1_token_state_greedy tokens[MAX_TX_SQUARE + 1][2]; uint8_t token_cache[MAX_TX_SQUARE]; const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); @@ -176,38 +172,23 @@ int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, #if CONFIG_NEW_QUANT int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type); const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq]; -#elif !CONFIG_AOM_QM - const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift }; #endif // CONFIG_NEW_QUANT int sz = 0; const int64_t rddiv = mb->rddiv; int64_t rd_cost0, rd_cost1; int16_t t0, t1; int i, final_eob; -#if CONFIG_HIGHBITDEPTH const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd); -#else - const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8); -#endif unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = mb->token_costs[txsize_sqr_map[tx_size]][plane_type][ref]; const int default_eob = tx_size_2d[tx_size]; - assert((mb->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)); + assert(mb->qindex > 0); assert((!plane_type && !plane) || (plane_type && plane)); assert(eob <= default_eob); int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1; -/* CpuSpeedTest uses "--min-q=0 --max-q=0" and expects 100dB psnr -* This creates conflict with search for a better EOB position -* The line below is to make sure EOB search is disabled at this corner case. -*/ -#if !CONFIG_NEW_QUANT && !CONFIG_AOM_QM - if (dq_step[1] <= 4) { - rdmult = 1; - } -#endif int64_t rate0, rate1; for (i = 0; i < eob; i++) { @@ -402,22 +383,10 @@ int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, dqc_a = shift ? ROUND_POWER_OF_TWO(dqc_a, shift) : dqc_a; if (sz) dqc_a = -dqc_a; #else -// The 32x32 transform coefficient uses half quantization step size. -// Account for the rounding difference in the dequantized coefficeint -// value when the quantization index is dropped from an even number -// to an odd number. 
- -#if CONFIG_AOM_QM - tran_low_t offset = dqv >> shift; -#else - tran_low_t offset = dq_step[rc != 0]; -#endif - if (shift & x_a) offset += (dqv & 0x01); - - if (sz == 0) - dqc_a = dqcoeff[rc] - offset; + if (x_a < 0) + dqc_a = -((-x_a * dqv) >> shift); else - dqc_a = dqcoeff[rc] + offset; + dqc_a = (x_a * dqv) >> shift; #endif // CONFIG_NEW_QUANT } else { dqc_a = 0; @@ -483,19 +452,11 @@ int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, mb->plane[plane].eobs[block] = final_eob; return final_eob; - -#else // !CONFIG_PVQ - (void)cm; - (void)tx_size; - (void)ctx; - struct macroblock_plane *const p = &mb->plane[plane]; - return p->eobs[block]; -#endif // !CONFIG_PVQ } #else // USE_GREEDY_OPTIMIZE_B -typedef struct av1_token_state { +typedef struct av1_token_state_org { int64_t error; int rate; int16_t next; @@ -503,16 +464,15 @@ typedef struct av1_token_state { tran_low_t qc; tran_low_t dqc; uint8_t best_index; -} av1_token_state; +} av1_token_state_org; -int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, - TX_SIZE tx_size, int ctx) { -#if !CONFIG_PVQ +static int optimize_b_org(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, + int block, TX_SIZE tx_size, int ctx) { MACROBLOCKD *const xd = &mb->e_mbd; struct macroblock_plane *const p = &mb->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; const int ref = is_inter_block(&xd->mi[0]->mbmi); - av1_token_state tokens[MAX_TX_SQUARE + 1][2]; + av1_token_state_org tokens[MAX_TX_SQUARE + 1][2]; uint8_t token_cache[MAX_TX_SQUARE]; const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); @@ -536,8 +496,6 @@ int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, #if CONFIG_NEW_QUANT int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type); const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq]; -#elif !CONFIG_AOM_QM - const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift }; #endif // CONFIG_NEW_QUANT int next = eob, sz = 0; const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1; @@ -549,11 +507,7 @@ int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, int best, band = (eob < default_eob) ? band_translate[eob] : band_translate[eob - 1]; int pt, i, final_eob; -#if CONFIG_HIGHBITDEPTH const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd); -#else - const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8); -#endif unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = mb->token_costs[txsize_sqr_map[tx_size]][plane_type][ref]; const uint16_t *band_counts = &band_count_table[tx_size][band]; @@ -566,11 +520,10 @@ int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, ? av1_get_qindex(&cm->seg, xd->mi[0]->mbmi.segment_id, cm->base_qindex) : cm->base_qindex; - if (qindex == 0) { - assert((qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)); - } + assert(qindex > 0); + (void)qindex; #else - assert((mb->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)); + assert(mb->qindex > 0); #endif token_costs += band; @@ -777,22 +730,10 @@ int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, : tokens[i][1].dqc; if (sz) tokens[i][1].dqc = -tokens[i][1].dqc; #else -// The 32x32 transform coefficient uses half quantization step size. 
-// Account for the rounding difference in the dequantized coefficeint -// value when the quantization index is dropped from an even number -// to an odd number. - -#if CONFIG_AOM_QM - tran_low_t offset = dqv >> shift; -#else - tran_low_t offset = dq_step[rc != 0]; -#endif - if (shift & x) offset += (dqv & 0x01); - - if (sz == 0) - tokens[i][1].dqc = dqcoeff[rc] - offset; + if (x < 0) + tokens[i][1].dqc = -((-x * dqv) >> shift); else - tokens[i][1].dqc = dqcoeff[rc] + offset; + tokens[i][1].dqc = (x * dqv) >> shift; #endif // CONFIG_NEW_QUANT } else { tokens[i][1].dqc = 0; @@ -858,16 +799,47 @@ int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, mb->plane[plane].eobs[block] = final_eob; assert(final_eob <= default_eob); return final_eob; -#else // !CONFIG_PVQ +} + +#endif // USE_GREEDY_OPTIMIZE_B +#endif // !CONFIG_LV_MAP + +int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l) { + MACROBLOCKD *const xd = &mb->e_mbd; + struct macroblock_plane *const p = &mb->plane[plane]; + const int eob = p->eobs[block]; + assert((mb->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)); + if (eob == 0) return eob; + if (xd->lossless[xd->mi[0]->mbmi.segment_id]) return eob; +#if CONFIG_PVQ (void)cm; (void)tx_size; - (void)ctx; - struct macroblock_plane *const p = &mb->plane[plane]; - return p->eobs[block]; -#endif // !CONFIG_PVQ -} + (void)a; + (void)l; + return eob; +#endif + +#if !CONFIG_LV_MAP + (void)plane_bsize; +#if CONFIG_VAR_TX + int ctx = get_entropy_context(tx_size, a, l); +#else + int ctx = combine_entropy_contexts(*a, *l); +#endif +#if USE_GREEDY_OPTIMIZE_B + return optimize_b_greedy(cm, mb, plane, block, tx_size, ctx); +#else // USE_GREEDY_OPTIMIZE_B + return optimize_b_org(cm, mb, plane, block, tx_size, ctx); #endif // USE_GREEDY_OPTIMIZE_B +#else // !CONFIG_LV_MAP + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + return av1_optimize_txb(cm, mb, plane, block, tx_size, &txb_ctx); +#endif // !CONFIG_LV_MAP +} #if !CONFIG_PVQ #if CONFIG_HIGHBITDEPTH @@ -1158,8 +1130,7 @@ static void encode_block(int plane, int block, int blk_row, int blk_col, #endif #if !CONFIG_PVQ - if (p->eobs[block] && !xd->lossless[xd->mi[0]->mbmi.segment_id]) - av1_optimize_b(cm, x, plane, block, tx_size, ctx); + av1_optimize_b(cm, x, plane, block, plane_bsize, tx_size, a, l); av1_set_txb_context(x, plane, block, tx_size, a, l); @@ -1202,12 +1173,13 @@ static void encode_block_inter(int plane, int block, int blk_row, int blk_col, if (tx_size == plane_tx_size) { encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); } else { + assert(tx_size < TX_SIZES_ALL); const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + assert(sub_txs < tx_size); // This is the square transform block partition entry point. 
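
/*
 * With the refactor above, av1_optimize_b() no longer relies on its callers
 * to filter out empty or lossless blocks; it bails out itself and then routes
 * to the PVQ, LV_MAP (av1_optimize_txb) or trellis (greedy / original)
 * implementation. The decision order is flattened below into runtime flags
 * purely to illustrate the compile-time #if structure; the enum and the
 * function are illustrative only.
 */
enum optimize_path_sketch {
  PATH_NONE,    /* eob == 0 or lossless segment: leave coefficients as-is */
  PATH_PVQ,     /* CONFIG_PVQ: PVQ already finalized the coefficients */
  PATH_TXB,     /* CONFIG_LV_MAP: av1_optimize_txb() with a TXB context */
  PATH_GREEDY,  /* USE_GREEDY_OPTIMIZE_B: optimize_b_greedy() */
  PATH_TRELLIS  /* default: optimize_b_org() */
};

static enum optimize_path_sketch choose_optimize_path(int eob, int lossless,
                                                      int cfg_pvq,
                                                      int cfg_lv_map,
                                                      int use_greedy) {
  if (eob == 0 || lossless) return PATH_NONE;
  if (cfg_pvq) return PATH_PVQ;
  if (cfg_lv_map) return PATH_TXB;
  return use_greedy ? PATH_GREEDY : PATH_TRELLIS;
}
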
int bsl = tx_size_wide_unit[sub_txs]; int i; assert(bsl > 0); - assert(tx_size < TX_SIZES_ALL); for (i = 0; i < 4; ++i) { const int offsetr = blk_row + ((i >> 1) * bsl); @@ -1301,8 +1273,8 @@ void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) { encode_block_pass1, &args); } -void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, - const int mi_row, const int mi_col) { +void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, + int mi_col) { MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; @@ -1433,6 +1405,301 @@ static void encode_block_intra_and_set_context(int plane, int block, #endif } +#if CONFIG_DPCM_INTRA +static int get_eob(const tran_low_t *qcoeff, intptr_t n_coeffs, + const int16_t *scan) { + int eob = -1; + for (int i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff[rc]) { + eob = i; + break; + } + } + return eob + 1; +} + +static void quantize_scaler(int coeff, int16_t zbin, int16_t round_value, + int16_t quant, int16_t quant_shift, int16_t dequant, + int log_scale, tran_low_t *const qcoeff, + tran_low_t *const dqcoeff) { + zbin = ROUND_POWER_OF_TWO(zbin, log_scale); + round_value = ROUND_POWER_OF_TWO(round_value, log_scale); + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + if (abs_coeff >= zbin) { + int tmp = clamp(abs_coeff + round_value, INT16_MIN, INT16_MAX); + tmp = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> (16 - log_scale); + *qcoeff = (tmp ^ coeff_sign) - coeff_sign; + *dqcoeff = (*qcoeff * dequant) / (1 << log_scale); + } +} + +typedef void (*dpcm_fwd_tx_func)(const int16_t *input, int stride, + TX_TYPE_1D tx_type, tran_low_t *output); + +static dpcm_fwd_tx_func get_dpcm_fwd_tx_func(int tx_length) { + switch (tx_length) { + case 4: return av1_dpcm_ft4_c; + case 8: return av1_dpcm_ft8_c; + case 16: return av1_dpcm_ft16_c; + case 32: + return av1_dpcm_ft32_c; + // TODO(huisu): add support for TX_64X64. + default: assert(0); return NULL; + } +} + +static void process_block_dpcm_vert(TX_SIZE tx_size, TX_TYPE_1D tx_type_1d, + struct macroblockd_plane *const pd, + struct macroblock_plane *const p, + uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int16_t *src_diff, + int diff_stride, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff) { + const int tx1d_width = tx_size_wide[tx_size]; + dpcm_fwd_tx_func forward_tx = get_dpcm_fwd_tx_func(tx1d_width); + dpcm_inv_txfm_add_func inverse_tx = + av1_get_dpcm_inv_txfm_add_func(tx1d_width); + const int tx1d_height = tx_size_high[tx_size]; + const int log_scale = av1_get_tx_scale(tx_size); + int q_idx = 0; + for (int r = 0; r < tx1d_height; ++r) { + // Update prediction. + if (r > 0) memcpy(dst, dst - dst_stride, tx1d_width * sizeof(dst[0])); + // Subtraction. + for (int c = 0; c < tx1d_width; ++c) src_diff[c] = src[c] - dst[c]; + // Forward transform. + forward_tx(src_diff, 1, tx_type_1d, coeff); + // Quantization. + for (int c = 0; c < tx1d_width; ++c) { + quantize_scaler(coeff[c], p->zbin[q_idx], p->round[q_idx], + p->quant[q_idx], p->quant_shift[q_idx], + pd->dequant[q_idx], log_scale, &qcoeff[c], &dqcoeff[c]); + q_idx = 1; + } + // Inverse transform. + inverse_tx(dqcoeff, 1, tx_type_1d, dst); + // Move to the next row. 
+ coeff += tx1d_width; + qcoeff += tx1d_width; + dqcoeff += tx1d_width; + src_diff += diff_stride; + dst += dst_stride; + src += src_stride; + } +} + +static void process_block_dpcm_horz(TX_SIZE tx_size, TX_TYPE_1D tx_type_1d, + struct macroblockd_plane *const pd, + struct macroblock_plane *const p, + uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int16_t *src_diff, + int diff_stride, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff) { + const int tx1d_height = tx_size_high[tx_size]; + dpcm_fwd_tx_func forward_tx = get_dpcm_fwd_tx_func(tx1d_height); + dpcm_inv_txfm_add_func inverse_tx = + av1_get_dpcm_inv_txfm_add_func(tx1d_height); + const int tx1d_width = tx_size_wide[tx_size]; + const int log_scale = av1_get_tx_scale(tx_size); + int q_idx = 0; + for (int c = 0; c < tx1d_width; ++c) { + for (int r = 0; r < tx1d_height; ++r) { + // Update prediction. + if (c > 0) dst[r * dst_stride] = dst[r * dst_stride - 1]; + // Subtraction. + src_diff[r * diff_stride] = src[r * src_stride] - dst[r * dst_stride]; + } + // Forward transform. + tran_low_t tx_buff[64]; + forward_tx(src_diff, diff_stride, tx_type_1d, tx_buff); + for (int r = 0; r < tx1d_height; ++r) coeff[r * tx1d_width] = tx_buff[r]; + // Quantization. + for (int r = 0; r < tx1d_height; ++r) { + quantize_scaler(coeff[r * tx1d_width], p->zbin[q_idx], p->round[q_idx], + p->quant[q_idx], p->quant_shift[q_idx], + pd->dequant[q_idx], log_scale, &qcoeff[r * tx1d_width], + &dqcoeff[r * tx1d_width]); + q_idx = 1; + } + // Inverse transform. + for (int r = 0; r < tx1d_height; ++r) tx_buff[r] = dqcoeff[r * tx1d_width]; + inverse_tx(tx_buff, dst_stride, tx_type_1d, dst); + // Move to the next column. + ++coeff, ++qcoeff, ++dqcoeff, ++src_diff, ++dst, ++src; + } +} + +#if CONFIG_HIGHBITDEPTH +static void hbd_process_block_dpcm_vert( + TX_SIZE tx_size, TX_TYPE_1D tx_type_1d, int bd, + struct macroblockd_plane *const pd, struct macroblock_plane *const p, + uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, + int16_t *src_diff, int diff_stride, tran_low_t *coeff, tran_low_t *qcoeff, + tran_low_t *dqcoeff) { + const int tx1d_width = tx_size_wide[tx_size]; + dpcm_fwd_tx_func forward_tx = get_dpcm_fwd_tx_func(tx1d_width); + hbd_dpcm_inv_txfm_add_func inverse_tx = + av1_get_hbd_dpcm_inv_txfm_add_func(tx1d_width); + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int tx1d_height = tx_size_high[tx_size]; + const int log_scale = av1_get_tx_scale(tx_size); + int q_idx = 0; + for (int r = 0; r < tx1d_height; ++r) { + // Update prediction. + if (r > 0) memcpy(dst, dst - dst_stride, tx1d_width * sizeof(dst[0])); + // Subtraction. + for (int c = 0; c < tx1d_width; ++c) src_diff[c] = src[c] - dst[c]; + // Forward transform. + forward_tx(src_diff, 1, tx_type_1d, coeff); + // Quantization. + for (int c = 0; c < tx1d_width; ++c) { + quantize_scaler(coeff[c], p->zbin[q_idx], p->round[q_idx], + p->quant[q_idx], p->quant_shift[q_idx], + pd->dequant[q_idx], log_scale, &qcoeff[c], &dqcoeff[c]); + q_idx = 1; + } + // Inverse transform. + inverse_tx(dqcoeff, 1, tx_type_1d, bd, dst); + // Move to the next row. 
+ coeff += tx1d_width; + qcoeff += tx1d_width; + dqcoeff += tx1d_width; + src_diff += diff_stride; + dst += dst_stride; + src += src_stride; + } +} + +static void hbd_process_block_dpcm_horz( + TX_SIZE tx_size, TX_TYPE_1D tx_type_1d, int bd, + struct macroblockd_plane *const pd, struct macroblock_plane *const p, + uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, + int16_t *src_diff, int diff_stride, tran_low_t *coeff, tran_low_t *qcoeff, + tran_low_t *dqcoeff) { + const int tx1d_height = tx_size_high[tx_size]; + dpcm_fwd_tx_func forward_tx = get_dpcm_fwd_tx_func(tx1d_height); + hbd_dpcm_inv_txfm_add_func inverse_tx = + av1_get_hbd_dpcm_inv_txfm_add_func(tx1d_height); + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int tx1d_width = tx_size_wide[tx_size]; + const int log_scale = av1_get_tx_scale(tx_size); + int q_idx = 0; + for (int c = 0; c < tx1d_width; ++c) { + for (int r = 0; r < tx1d_height; ++r) { + // Update prediction. + if (c > 0) dst[r * dst_stride] = dst[r * dst_stride - 1]; + // Subtraction. + src_diff[r * diff_stride] = src[r * src_stride] - dst[r * dst_stride]; + } + // Forward transform. + tran_low_t tx_buff[64]; + forward_tx(src_diff, diff_stride, tx_type_1d, tx_buff); + for (int r = 0; r < tx1d_height; ++r) coeff[r * tx1d_width] = tx_buff[r]; + // Quantization. + for (int r = 0; r < tx1d_height; ++r) { + quantize_scaler(coeff[r * tx1d_width], p->zbin[q_idx], p->round[q_idx], + p->quant[q_idx], p->quant_shift[q_idx], + pd->dequant[q_idx], log_scale, &qcoeff[r * tx1d_width], + &dqcoeff[r * tx1d_width]); + q_idx = 1; + } + // Inverse transform. + for (int r = 0; r < tx1d_height; ++r) tx_buff[r] = dqcoeff[r * tx1d_width]; + inverse_tx(tx_buff, dst_stride, tx_type_1d, bd, dst); + // Move to the next column. 
+ ++coeff, ++qcoeff, ++dqcoeff, ++src_diff, ++dst, ++src; + } +} +#endif // CONFIG_HIGHBITDEPTH + +void av1_encode_block_intra_dpcm(const AV1_COMMON *cm, MACROBLOCK *x, + PREDICTION_MODE mode, int plane, int block, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + TX_TYPE tx_type, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, int8_t *skip) { + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const int diff_stride = block_size_wide[plane_bsize]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + const int tx1d_width = tx_size_wide[tx_size]; + const int tx1d_height = tx_size_high[tx_size]; + const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, 0); + tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); + uint8_t *dst = + &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + uint8_t *src = + &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; + int16_t *src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; + uint16_t *eob = &p->eobs[block]; + *eob = 0; + memset(qcoeff, 0, tx1d_height * tx1d_width * sizeof(*qcoeff)); + memset(dqcoeff, 0, tx1d_height * tx1d_width * sizeof(*dqcoeff)); + + if (LIKELY(!x->skip_block)) { + TX_TYPE_1D tx_type_1d = DCT_1D; + switch (tx_type) { + case IDTX: tx_type_1d = IDTX_1D; break; + case V_DCT: + assert(mode == H_PRED); + tx_type_1d = DCT_1D; + break; + case H_DCT: + assert(mode == V_PRED); + tx_type_1d = DCT_1D; + break; + default: assert(0); + } + switch (mode) { + case V_PRED: +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + hbd_process_block_dpcm_vert(tx_size, tx_type_1d, xd->bd, pd, p, src, + src_stride, dst, dst_stride, src_diff, + diff_stride, coeff, qcoeff, dqcoeff); + } else { +#endif // CONFIG_HIGHBITDEPTH + process_block_dpcm_vert(tx_size, tx_type_1d, pd, p, src, src_stride, + dst, dst_stride, src_diff, diff_stride, coeff, + qcoeff, dqcoeff); +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + break; + case H_PRED: +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + hbd_process_block_dpcm_horz(tx_size, tx_type_1d, xd->bd, pd, p, src, + src_stride, dst, dst_stride, src_diff, + diff_stride, coeff, qcoeff, dqcoeff); + } else { +#endif // CONFIG_HIGHBITDEPTH + process_block_dpcm_horz(tx_size, tx_type_1d, pd, p, src, src_stride, + dst, dst_stride, src_diff, diff_stride, coeff, + qcoeff, dqcoeff); +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + break; + default: assert(0); + } + *eob = get_eob(qcoeff, tx1d_height * tx1d_width, scan_order->scan); + } + + ta[blk_col] = tl[blk_row] = *eob > 0; + if (*eob) *skip = 0; +} +#endif // CONFIG_DPCM_INTRA + void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { @@ -1449,7 +1716,33 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, const int dst_stride = pd->dst.stride; uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; +#if CONFIG_CFL + +#if CONFIG_EC_ADAPT + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; +#else + FRAME_CONTEXT *const ec_ctx = cm->fc; +#endif // CONFIG_EC_ADAPT + + av1_predict_intra_block_encoder_facade(x, ec_ctx, plane, block, blk_col, + 
blk_row, tx_size, plane_bsize); +#else av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size); +#endif + +#if CONFIG_DPCM_INTRA + const int block_raster_idx = av1_block_index_to_raster_order(tx_size, block); + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const PREDICTION_MODE mode = + (plane == 0) ? get_y_mode(xd->mi[0], block_raster_idx) : mbmi->uv_mode; + if (av1_use_dpcm_intra(plane, mode, tx_type, mbmi)) { + av1_encode_block_intra_dpcm(cm, x, mode, plane, block, blk_row, blk_col, + plane_bsize, tx_size, tx_type, args->ta, + args->tl, args->skip); + return; + } +#endif // CONFIG_DPCM_INTRA + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); const ENTROPY_CONTEXT *a = &args->ta[blk_col]; @@ -1458,9 +1751,7 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, if (args->enable_optimize_b) { av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, ctx, AV1_XFORM_QUANT_FP); - if (p->eobs[block]) { - av1_optimize_b(cm, x, plane, block, tx_size, ctx); - } + av1_optimize_b(cm, x, plane, block, plane_bsize, tx_size, a, l); } else { av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, ctx, AV1_XFORM_QUANT_B); @@ -1480,16 +1771,216 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, // Note : *(args->skip) == mbmi->skip #endif #if CONFIG_CFL + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; if (plane == AOM_PLANE_Y && x->cfl_store_y) { cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size); } + + if (mbmi->uv_mode == DC_PRED) { + // TODO(ltrudeau) find a cleaner way to detect last transform block + if (plane == AOM_PLANE_U) { + xd->cfl->num_tx_blk[CFL_PRED_U] = + (blk_row == 0 && blk_col == 0) ? 1 + : xd->cfl->num_tx_blk[CFL_PRED_U] + 1; + } + + if (plane == AOM_PLANE_V) { + xd->cfl->num_tx_blk[CFL_PRED_V] = + (blk_row == 0 && blk_col == 0) ? 
1 + : xd->cfl->num_tx_blk[CFL_PRED_V] + 1; + + if (mbmi->skip && + xd->cfl->num_tx_blk[CFL_PRED_U] == xd->cfl->num_tx_blk[CFL_PRED_V]) { + assert(plane_bsize != BLOCK_INVALID); + const int block_width = block_size_wide[plane_bsize]; + const int block_height = block_size_high[plane_bsize]; + + // if SKIP is chosen at the block level, and ind != 0, we must change + // the prediction + if (mbmi->cfl_alpha_idx != 0) { + const struct macroblockd_plane *const pd_cb = &xd->plane[AOM_PLANE_U]; + uint8_t *const dst_cb = pd_cb->dst.buf; + const int dst_stride_cb = pd_cb->dst.stride; + uint8_t *const dst_cr = pd->dst.buf; + const int dst_stride_cr = pd->dst.stride; + for (int j = 0; j < block_height; j++) { + for (int i = 0; i < block_width; i++) { + dst_cb[dst_stride_cb * j + i] = + (uint8_t)(xd->cfl->dc_pred[CFL_PRED_U] + 0.5); + dst_cr[dst_stride_cr * j + i] = + (uint8_t)(xd->cfl->dc_pred[CFL_PRED_V] + 0.5); + } + } + mbmi->cfl_alpha_idx = 0; + mbmi->cfl_alpha_signs[CFL_PRED_U] = CFL_SIGN_POS; + mbmi->cfl_alpha_signs[CFL_PRED_V] = CFL_SIGN_POS; + } + } + } + } #endif } +#if CONFIG_CFL +static int cfl_alpha_dist(const uint8_t *y_pix, int y_stride, double y_avg, + const uint8_t *src, int src_stride, int blk_width, + int blk_height, double dc_pred, double alpha, + int *dist_neg_out) { + const double dc_pred_bias = dc_pred + 0.5; + int dist = 0; + int diff; + + if (alpha == 0.0) { + const int dc_pred_i = (int)dc_pred_bias; + for (int j = 0; j < blk_height; j++) { + for (int i = 0; i < blk_width; i++) { + diff = src[i] - dc_pred_i; + dist += diff * diff; + } + src += src_stride; + } + + if (dist_neg_out) *dist_neg_out = dist; + + return dist; + } + + int dist_neg = 0; + for (int j = 0; j < blk_height; j++) { + for (int i = 0; i < blk_width; i++) { + const double scaled_luma = alpha * (y_pix[i] - y_avg); + const int uv = src[i]; + diff = uv - (int)(scaled_luma + dc_pred_bias); + dist += diff * diff; + diff = uv + (int)(scaled_luma - dc_pred_bias); + dist_neg += diff * diff; + } + y_pix += y_stride; + src += src_stride; + } + + if (dist_neg_out) *dist_neg_out = dist_neg; + + return dist; +} + +static int cfl_compute_alpha_ind(MACROBLOCK *const x, const CFL_CTX *const cfl, + BLOCK_SIZE bsize, + CFL_SIGN_TYPE signs_out[CFL_SIGNS]) { + const struct macroblock_plane *const p_u = &x->plane[AOM_PLANE_U]; + const struct macroblock_plane *const p_v = &x->plane[AOM_PLANE_V]; + const uint8_t *const src_u = p_u->src.buf; + const uint8_t *const src_v = p_v->src.buf; + const int src_stride_u = p_u->src.stride; + const int src_stride_v = p_v->src.stride; + const int block_width = block_size_wide[bsize]; + const int block_height = block_size_high[bsize]; + const double dc_pred_u = cfl->dc_pred[CFL_PRED_U]; + const double dc_pred_v = cfl->dc_pred[CFL_PRED_V]; + + // Temporary pixel buffer used to store the CfL prediction when we compute the + // alpha index. 
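cfl_alpha_dist() above evaluates one alpha magnitude for both signs in a single pass over the block: the prediction with -alpha is just dc_pred minus the same scaled-luma term, so the two SSEs can share the per-pixel computation. A small self-contained sketch of that idea on toy floating-point data (the 0.5 bias and integer truncation of the real function are simplified away):

#include <stdio.h>

/* Returns SSE of chroma against (dc + alpha * (luma - avg)) and reports the
 * SSE of the opposite sign, (dc - alpha * (luma - avg)), through *sse_neg,
 * reusing the scaled-luma term computed for each pixel. */
static double alpha_dist_sketch(const double *luma, const double *chroma,
                                int n, double avg, double dc, double alpha,
                                double *sse_neg) {
  double pos = 0.0, neg = 0.0;
  for (int i = 0; i < n; ++i) {
    const double scaled = alpha * (luma[i] - avg);
    const double dp = chroma[i] - (dc + scaled);
    const double dn = chroma[i] - (dc - scaled);
    pos += dp * dp;
    neg += dn * dn;
  }
  *sse_neg = neg;
  return pos;
}

int main(void) {
  const double luma[4] = { 100, 110, 120, 130 };
  const double chroma[4] = { 60, 64, 68, 72 };
  double neg;
  const double pos =
      alpha_dist_sketch(luma, chroma, 4, /*avg=*/115.0, /*dc=*/66.0, 0.5, &neg);
  printf("SSE(+alpha) = %.1f, SSE(-alpha) = %.1f\n", pos, neg);
  return 0;
}

The caller then RD-costs each (magnitude, sign) pair against the signalling cost and keeps the cheapest index, as cfl_compute_alpha_ind() does below.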
+ uint8_t tmp_pix[MAX_SB_SQUARE]; + // Load CfL Prediction over the entire block + const double y_avg = + cfl_load(cfl, tmp_pix, MAX_SB_SIZE, 0, 0, block_width, block_height); + + int sse[CFL_PRED_PLANES][CFL_MAGS_SIZE]; + sse[CFL_PRED_U][0] = + cfl_alpha_dist(tmp_pix, MAX_SB_SIZE, y_avg, src_u, src_stride_u, + block_width, block_height, dc_pred_u, 0, NULL); + sse[CFL_PRED_V][0] = + cfl_alpha_dist(tmp_pix, MAX_SB_SIZE, y_avg, src_v, src_stride_v, + block_width, block_height, dc_pred_v, 0, NULL); + for (int m = 1; m < CFL_MAGS_SIZE; m += 2) { + assert(cfl_alpha_mags[m + 1] == -cfl_alpha_mags[m]); + sse[CFL_PRED_U][m] = cfl_alpha_dist( + tmp_pix, MAX_SB_SIZE, y_avg, src_u, src_stride_u, block_width, + block_height, dc_pred_u, cfl_alpha_mags[m], &sse[CFL_PRED_U][m + 1]); + sse[CFL_PRED_V][m] = cfl_alpha_dist( + tmp_pix, MAX_SB_SIZE, y_avg, src_v, src_stride_v, block_width, + block_height, dc_pred_v, cfl_alpha_mags[m], &sse[CFL_PRED_V][m + 1]); + } + + int dist; + int64_t cost; + int64_t best_cost; + + // Compute least squares parameter of the entire block + // IMPORTANT: We assume that the first code is 0,0 + int ind = 0; + signs_out[CFL_PRED_U] = CFL_SIGN_POS; + signs_out[CFL_PRED_V] = CFL_SIGN_POS; + + dist = sse[CFL_PRED_U][0] + sse[CFL_PRED_V][0]; + dist *= 16; + best_cost = RDCOST(x->rdmult, x->rddiv, cfl->costs[0], dist); + + for (int c = 1; c < CFL_ALPHABET_SIZE; c++) { + const int idx_u = cfl_alpha_codes[c][CFL_PRED_U]; + const int idx_v = cfl_alpha_codes[c][CFL_PRED_V]; + for (CFL_SIGN_TYPE sign_u = idx_u == 0; sign_u < CFL_SIGNS; sign_u++) { + for (CFL_SIGN_TYPE sign_v = idx_v == 0; sign_v < CFL_SIGNS; sign_v++) { + dist = sse[CFL_PRED_U][idx_u + (sign_u == CFL_SIGN_NEG)] + + sse[CFL_PRED_V][idx_v + (sign_v == CFL_SIGN_NEG)]; + dist *= 16; + cost = RDCOST(x->rdmult, x->rddiv, cfl->costs[c], dist); + if (cost < best_cost) { + best_cost = cost; + ind = c; + signs_out[CFL_PRED_U] = sign_u; + signs_out[CFL_PRED_V] = sign_v; + } + } + } + } + + return ind; +} + +static inline void cfl_update_costs(CFL_CTX *cfl, FRAME_CONTEXT *ec_ctx) { + assert(ec_ctx->cfl_alpha_cdf[CFL_ALPHABET_SIZE - 1] == + AOM_ICDF(CDF_PROB_TOP)); + const int prob_den = CDF_PROB_TOP; + + int prob_num = AOM_ICDF(ec_ctx->cfl_alpha_cdf[0]); + cfl->costs[0] = av1_cost_zero(get_prob(prob_num, prob_den)); + + for (int c = 1; c < CFL_ALPHABET_SIZE; c++) { + int sign_bit_cost = (cfl_alpha_codes[c][CFL_PRED_U] != 0) + + (cfl_alpha_codes[c][CFL_PRED_V] != 0); + prob_num = AOM_ICDF(ec_ctx->cfl_alpha_cdf[c]) - + AOM_ICDF(ec_ctx->cfl_alpha_cdf[c - 1]); + cfl->costs[c] = av1_cost_zero(get_prob(prob_num, prob_den)) + + av1_cost_literal(sign_bit_cost); + } +} + +void av1_predict_intra_block_encoder_facade(MACROBLOCK *x, + FRAME_CONTEXT *ec_ctx, int plane, + int block_idx, int blk_col, + int blk_row, TX_SIZE tx_size, + BLOCK_SIZE plane_bsize) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + if (plane != AOM_PLANE_Y && mbmi->uv_mode == DC_PRED) { + if (blk_col == 0 && blk_row == 0 && plane == AOM_PLANE_U) { + CFL_CTX *const cfl = xd->cfl; + cfl_update_costs(cfl, ec_ctx); + cfl_dc_pred(xd, plane_bsize, tx_size); + mbmi->cfl_alpha_idx = + cfl_compute_alpha_ind(x, cfl, plane_bsize, mbmi->cfl_alpha_signs); + } + } + av1_predict_intra_block_facade(xd, plane, block_idx, blk_col, blk_row, + tx_size); +} +#endif + void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int plane, - int enable_optimize_b, const int mi_row, - const int mi_col) { + int enable_optimize_b, int mi_row, + 
int mi_col) { const MACROBLOCKD *const xd = &x->e_mbd; ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE] = { 0 }; ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE] = { 0 }; @@ -1545,9 +2036,7 @@ PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff, DECLARE_ALIGNED(16, int32_t, ref_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); DECLARE_ALIGNED(16, int32_t, out_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]); -#if CONFIG_HIGHBITDEPTH hbd_downshift = x->e_mbd.bd - 8; -#endif assert(OD_COEFF_SHIFT >= 4); // DC quantizer for PVQ @@ -1563,10 +2052,10 @@ PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff, *eob = 0; -#if CONFIG_DAALA_EC +#if !CONFIG_ANS tell = od_ec_enc_tell_frac(&daala_enc->w.ec); #else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#error "CONFIG_PVQ currently requires !CONFIG_ANS." #endif // Change coefficient ordering for pvq encoding. @@ -1635,11 +2124,11 @@ PVQ_SKIP_TYPE av1_pvq_encode_helper(MACROBLOCK *x, tran_low_t *const coeff, *eob = tx_blk_size * tx_blk_size; -#if CONFIG_DAALA_EC +#if !CONFIG_ANS *rate = (od_ec_enc_tell_frac(&daala_enc->w.ec) - tell) << (AV1_PROB_COST_SHIFT - OD_BITRES); #else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#error "CONFIG_PVQ currently requires !CONFIG_ANS." #endif assert(*rate >= 0); diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h index 73fde1d88..35a2c1570 100644 --- a/third_party/aom/av1/encoder/encodemb.h +++ b/third_party/aom/av1/encoder/encodemb.h @@ -54,7 +54,8 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, int ctx, AV1_XFORM_QUANT xform_quant_idx); int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, - TX_SIZE tx_size, int ctx); + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l); void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int blk_col, int blk_row, TX_SIZE tx_size); @@ -85,6 +86,23 @@ void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k, int *size, int skip_rest, int skip_dir, int bs); #endif +#if CONFIG_CFL +void av1_predict_intra_block_encoder_facade(MACROBLOCK *x, + FRAME_CONTEXT *ec_ctx, int plane, + int block_idx, int blk_col, + int blk_row, TX_SIZE tx_size, + BLOCK_SIZE plane_bsize); +#endif + +#if CONFIG_DPCM_INTRA +void av1_encode_block_intra_dpcm(const AV1_COMMON *cm, MACROBLOCK *x, + PREDICTION_MODE mode, int plane, int block, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + TX_TYPE tx_type, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, int8_t *skip); +#endif // CONFIG_DPCM_INTRA + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c index a2a53f840..eb0ff88c4 100644 --- a/third_party/aom/av1/encoder/encodemv.c +++ b/third_party/aom/av1/encoder/encodemv.c @@ -45,13 +45,8 @@ static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, // Sign aom_write(w, sign, mvcomp->sign); -// Class -#if CONFIG_EC_MULTISYMBOL + // Class aom_write_symbol(w, mv_class, mvcomp->class_cdf, MV_CLASSES); -#else - av1_write_token(w, av1_mv_class_tree, mvcomp->classes, - &mv_class_encodings[mv_class]); -#endif // Integer bits if (mv_class == MV_CLASS_0) { @@ -62,16 +57,10 @@ static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, for (i = 0; i < n; ++i) aom_write(w, (d >> i) & 1, mvcomp->bits[i]); } -// Fractional bits -#if CONFIG_EC_MULTISYMBOL + // 
Fractional bits aom_write_symbol( w, fr, mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf, MV_FP_SIZE); -#else - av1_write_token(w, av1_mv_fp_tree, - mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp, - &mv_fp_encodings[fr]); -#endif // High precision bit if (usehp) @@ -171,7 +160,6 @@ static void write_mv_update(const aom_tree_index *tree, void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w, nmv_context_counts *const nmv_counts) { int i; -#if CONFIG_REF_MV int nmv_ctx = 0; for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) { nmv_context *const mvc = &cm->fc->nmvc[nmv_ctx]; @@ -213,57 +201,13 @@ void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w, } } } -#else - nmv_context *const mvc = &cm->fc->nmvc; - nmv_context_counts *const counts = nmv_counts; - -#if !CONFIG_EC_ADAPT - write_mv_update(av1_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w); - - for (i = 0; i < 2; ++i) { - int j; - nmv_component *comp = &mvc->comps[i]; - nmv_component_counts *comp_counts = &counts->comps[i]; - - update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB); - write_mv_update(av1_mv_class_tree, comp->classes, comp_counts->classes, - MV_CLASSES, w); - write_mv_update(av1_mv_class0_tree, comp->class0, comp_counts->class0, - CLASS0_SIZE, w); - for (j = 0; j < MV_OFFSET_BITS; ++j) - update_mv(w, comp_counts->bits[j], &comp->bits[j], MV_UPDATE_PROB); - } - - for (i = 0; i < 2; ++i) { - int j; - for (j = 0; j < CLASS0_SIZE; ++j) { - write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j], - counts->comps[i].class0_fp[j], MV_FP_SIZE, w); - } - write_mv_update(av1_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp, - MV_FP_SIZE, w); - } -#endif // !CONFIG_EC_ADAPT - - if (usehp) { - for (i = 0; i < 2; ++i) { - update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp, - MV_UPDATE_PROB); - update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB); - } - } -#endif } void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, nmv_context *mvctx, int usehp) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS); -#else - av1_write_token(w, av1_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]); -#endif if (mv_joint_vertical(j)) encode_mv_component(w, diff.row, &mvctx->comps[0], usehp); @@ -284,11 +228,7 @@ void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, const MV diff = { mv->row - ref->row, mv->col - ref->col }; const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); -#if CONFIG_EC_MULTISYMBOL aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS); -#else - av1_write_token(w, av1_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]); -#endif if (mv_joint_vertical(j)) encode_mv_component(w, diff.row, &mvctx->comps[0], 0); @@ -306,135 +246,101 @@ void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], #if CONFIG_EXT_INTER static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext, - const int_mv mvs[2], -#if CONFIG_REF_MV - const int_mv pred_mvs[2], -#endif + const int_mv mvs[2], const int_mv pred_mvs[2], nmv_context_counts *nmv_counts) { int i; PREDICTION_MODE mode = mbmi->mode; -#if !CONFIG_REF_MV - nmv_context_counts *counts = nmv_counts; -#endif if (mode == NEWMV || mode == NEW_NEWMV) { for (i = 0; i < 1 + has_second_ref(mbmi); ++i) { const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv; const MV diff = { mvs[i].as_mv.row - ref->row, 
mvs[i].as_mv.col - ref->col }; -#if CONFIG_REF_MV int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx); nmv_context_counts *counts = &nmv_counts[nmv_ctx]; (void)pred_mvs; -#endif av1_inc_mv(&diff, counts, 1); } } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv; const MV diff = { mvs[1].as_mv.row - ref->row, mvs[1].as_mv.col - ref->col }; -#if CONFIG_REF_MV int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx); nmv_context_counts *counts = &nmv_counts[nmv_ctx]; -#endif av1_inc_mv(&diff, counts, 1); } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv; const MV diff = { mvs[0].as_mv.row - ref->row, mvs[0].as_mv.col - ref->col }; -#if CONFIG_REF_MV int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); nmv_context_counts *counts = &nmv_counts[nmv_ctx]; -#endif av1_inc_mv(&diff, counts, 1); } } static void inc_mvs_sub8x8(const MODE_INFO *mi, int block, const int_mv mvs[2], -#if CONFIG_REF_MV const MB_MODE_INFO_EXT *mbmi_ext, -#endif nmv_context_counts *nmv_counts) { int i; PREDICTION_MODE mode = mi->bmi[block].as_mode; -#if CONFIG_REF_MV const MB_MODE_INFO *mbmi = &mi->mbmi; -#else - nmv_context_counts *counts = nmv_counts; -#endif if (mode == NEWMV || mode == NEW_NEWMV) { for (i = 0; i < 1 + has_second_ref(&mi->mbmi); ++i) { const MV *ref = &mi->bmi[block].ref_mv[i].as_mv; const MV diff = { mvs[i].as_mv.row - ref->row, mvs[i].as_mv.col - ref->col }; -#if CONFIG_REF_MV int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx); nmv_context_counts *counts = &nmv_counts[nmv_ctx]; -#endif av1_inc_mv(&diff, counts, 1); } } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { const MV *ref = &mi->bmi[block].ref_mv[1].as_mv; const MV diff = { mvs[1].as_mv.row - ref->row, mvs[1].as_mv.col - ref->col }; -#if CONFIG_REF_MV int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx); nmv_context_counts *counts = &nmv_counts[nmv_ctx]; -#endif av1_inc_mv(&diff, counts, 1); } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { const MV *ref = &mi->bmi[block].ref_mv[0].as_mv; const MV diff = { mvs[0].as_mv.row - ref->row, mvs[0].as_mv.col - ref->col }; -#if CONFIG_REF_MV int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); nmv_context_counts *counts = &nmv_counts[nmv_ctx]; -#endif av1_inc_mv(&diff, counts, 1); } } #else static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext, - const int_mv mvs[2], -#if CONFIG_REF_MV - const int_mv pred_mvs[2], -#endif + const int_mv mvs[2], const int_mv pred_mvs[2], nmv_context_counts *nmv_counts) { int i; -#if !CONFIG_REF_MV - nmv_context_counts *counts = nmv_counts; -#endif for (i = 0; i < 1 + has_second_ref(mbmi); ++i) { -#if CONFIG_REF_MV int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); int nmv_ctx = 
av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx); nmv_context_counts *counts = &nmv_counts[nmv_ctx]; const MV *ref = &pred_mvs[i].as_mv; -#else - const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv; -#endif const MV diff = { mvs[i].as_mv.row - ref->row, mvs[i].as_mv.col - ref->col }; av1_inc_mv(&diff, counts, 1); @@ -464,20 +370,11 @@ void av1_update_mv_count(ThreadData *td) { #if CONFIG_EXT_INTER if (have_newmv_in_inter_mode(mi->bmi[i].as_mode)) - inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv, -#if CONFIG_REF_MV - mbmi_ext, td->counts->mv); -#else - &td->counts->mv); -#endif + inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv, mbmi_ext, td->counts->mv); #else if (mi->bmi[i].as_mode == NEWMV) - inc_mvs(mbmi, mbmi_ext, mi->bmi[i].as_mv, -#if CONFIG_REF_MV - mi->bmi[i].pred_mv, td->counts->mv); -#else - &td->counts->mv); -#endif + inc_mvs(mbmi, mbmi_ext, mi->bmi[i].as_mv, mi->bmi[i].pred_mv, + td->counts->mv); #endif // CONFIG_EXT_INTER } } @@ -487,11 +384,6 @@ void av1_update_mv_count(ThreadData *td) { #else if (mbmi->mode == NEWMV) #endif // CONFIG_EXT_INTER - inc_mvs(mbmi, mbmi_ext, mbmi->mv, -#if CONFIG_REF_MV - mbmi->pred_mv, td->counts->mv); -#else - &td->counts->mv); -#endif + inc_mvs(mbmi, mbmi_ext, mbmi->mv, mbmi->pred_mv, td->counts->mv); } } diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c index 027109151..4782ce2b7 100644 --- a/third_party/aom/av1/encoder/encoder.c +++ b/third_party/aom/av1/encoder/encoder.c @@ -246,29 +246,17 @@ void av1_set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv) { MACROBLOCK *const mb = &cpi->td.mb; cpi->common.allow_high_precision_mv = allow_high_precision_mv; -#if CONFIG_REF_MV if (cpi->common.allow_high_precision_mv) { int i; for (i = 0; i < NMV_CONTEXTS; ++i) { mb->mv_cost_stack[i] = mb->nmvcost_hp[i]; - mb->mvsadcost = mb->nmvsadcost_hp; } } else { int i; for (i = 0; i < NMV_CONTEXTS; ++i) { mb->mv_cost_stack[i] = mb->nmvcost[i]; - mb->mvsadcost = mb->nmvsadcost; } } -#else - if (cpi->common.allow_high_precision_mv) { - mb->mvcost = mb->nmvcost_hp; - mb->mvsadcost = mb->nmvcost_hp; - } else { - mb->mvcost = mb->nmvcost; - mb->mvsadcost = mb->nmvcost; - } -#endif } static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) { @@ -334,13 +322,14 @@ static void setup_frame(AV1_COMP *cpi) { av1_zero(cpi->interp_filter_selected[0]); } #if CONFIG_EXT_REFS -#if CONFIG_LOWDELAY_COMPOUND // No change to bitstream +#if CONFIG_ONE_SIDED_COMPOUND // No change to bitstream if (cpi->sf.recode_loop == DISALLOW_RECODE) { cpi->refresh_bwd_ref_frame = cpi->refresh_last_frame; cpi->rc.is_bipred_frame = 1; } #endif #endif + cm->pre_fc = &cm->frame_contexts[cm->frame_context_idx]; cpi->vaq_refresh = 0; @@ -464,6 +453,20 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { aom_free(cpi->active_map.map); cpi->active_map.map = NULL; +#if CONFIG_MOTION_VAR + aom_free(cpi->td.mb.above_pred_buf); + cpi->td.mb.above_pred_buf = NULL; + + aom_free(cpi->td.mb.left_pred_buf); + cpi->td.mb.left_pred_buf = NULL; + + aom_free(cpi->td.mb.wsrc_buf); + cpi->td.mb.wsrc_buf = NULL; + + aom_free(cpi->td.mb.mask_buf); + cpi->td.mb.mask_buf = NULL; +#endif + // Free up-sampled reference buffers. 
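The buffer teardown added to dealloc_compressor_data() above follows the usual aom_free()-then-NULL pattern so a repeated call, or cleanup after a partially failed init, stays harmless. A generic sketch of the same idiom with plain malloc/free and a hypothetical context struct (the member names are illustrative, not the encoder's):

#include <stdlib.h>

struct enc_bufs {
  unsigned char *above_pred;
  unsigned char *left_pred;
  int *wsrc;
};

/* free() tolerates NULL, and resetting the pointers keeps the struct in a
 * state where this function can safely run again. */
static void free_enc_bufs(struct enc_bufs *b) {
  free(b->above_pred);
  b->above_pred = NULL;
  free(b->left_pred);
  b->left_pred = NULL;
  free(b->wsrc);
  b->wsrc = NULL;
}

int main(void) {
  struct enc_bufs b = { malloc(1024), malloc(1024), malloc(256 * sizeof(int)) };
  free_enc_bufs(&b);
  free_enc_bufs(&b); /* second call is a no-op */
  return 0;
}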
for (i = 0; i < (REF_FRAMES + 1); i++) aom_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf); @@ -492,17 +495,12 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { cpi->tile_tok[0][0] = 0; av1_free_pc_tree(&cpi->td); - av1_free_var_tree(&cpi->td); #if CONFIG_PALETTE if (cpi->common.allow_screen_content_tools) aom_free(cpi->td.mb.palette_buffer); #endif // CONFIG_PALETTE - if (cpi->source_diff_var != NULL) { - aom_free(cpi->source_diff_var); - cpi->source_diff_var = NULL; - } #if CONFIG_ANS aom_buf_ans_free(&cpi->buf_ans); #endif // CONFIG_ANS @@ -511,26 +509,17 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { static void save_coding_context(AV1_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; AV1_COMMON *cm = &cpi->common; -#if CONFIG_REF_MV int i; -#endif -// Stores a snapshot of key state variables which can subsequently be -// restored with a call to av1_restore_coding_context. These functions are -// intended for use in a re-code loop in av1_compress_frame where the -// quantizer value is adjusted between loop iterations. -#if CONFIG_REF_MV + // Stores a snapshot of key state variables which can subsequently be + // restored with a call to av1_restore_coding_context. These functions are + // intended for use in a re-code loop in av1_compress_frame where the + // quantizer value is adjusted between loop iterations. for (i = 0; i < NMV_CONTEXTS; ++i) { av1_copy(cc->nmv_vec_cost[i], cpi->td.mb.nmv_vec_cost[i]); av1_copy(cc->nmv_costs, cpi->nmv_costs); av1_copy(cc->nmv_costs_hp, cpi->nmv_costs_hp); } -#else - av1_copy(cc->nmvjointcost, cpi->td.mb.nmvjointcost); -#endif - - av1_copy(cc->nmvcosts, cpi->nmvcosts); - av1_copy(cc->nmvcosts_hp, cpi->nmvcosts_hp); av1_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas); av1_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas); @@ -541,24 +530,15 @@ static void save_coding_context(AV1_COMP *cpi) { static void restore_coding_context(AV1_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; AV1_COMMON *cm = &cpi->common; -#if CONFIG_REF_MV int i; -#endif -// Restore key state variables to the snapshot state stored in the -// previous call to av1_save_coding_context. -#if CONFIG_REF_MV + // Restore key state variables to the snapshot state stored in the + // previous call to av1_save_coding_context. 
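save_coding_context()/restore_coding_context() above snapshot the adaptive MV cost tables so a recode loop can retry the frame at a different quantizer without the discarded trial polluting the encoder state. A stripped-down sketch of that snapshot/restore pattern wrapped around a trial encode; the cost array, the encode step, and the size model are placeholders, not the real recode logic:

#include <string.h>

#define NCTX 3
#define NCOSTS 8

struct coding_ctx {
  int mv_costs[NCTX][NCOSTS];
};

static void save_ctx(struct coding_ctx *snap, const struct coding_ctx *live) {
  memcpy(snap, live, sizeof(*snap));
}

static void restore_ctx(struct coding_ctx *live, const struct coding_ctx *snap) {
  memcpy(live, snap, sizeof(*live));
}

/* Placeholder encode pass: mutates the adaptive costs, returns a fake size. */
static int trial_encode(struct coding_ctx *live, int q) {
  live->mv_costs[0][0] += q;
  return 1000 - q;
}

static int encode_with_recode(struct coding_ctx *live, int max_size) {
  struct coding_ctx snap;
  int q = 40;
  for (;;) {
    save_ctx(&snap, live);
    const int size = trial_encode(live, q);
    if (size <= max_size || q >= 63) return size;
    /* Frame too large: roll back the adapted state and retry at higher q. */
    restore_ctx(live, &snap);
    q += 4;
  }
}

int main(void) {
  struct coding_ctx live = { { { 0 } } };
  return encode_with_recode(&live, 940) > 0 ? 0 : 1;
}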
for (i = 0; i < NMV_CONTEXTS; ++i) { av1_copy(cpi->td.mb.nmv_vec_cost[i], cc->nmv_vec_cost[i]); av1_copy(cpi->nmv_costs, cc->nmv_costs); av1_copy(cpi->nmv_costs_hp, cc->nmv_costs_hp); } -#else - av1_copy(cpi->td.mb.nmvjointcost, cc->nmvjointcost); -#endif - - av1_copy(cpi->nmvcosts, cc->nmvcosts); - av1_copy(cpi->nmvcosts_hp, cc->nmvcosts_hp); av1_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas); av1_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas); @@ -795,14 +775,12 @@ static void alloc_util_frame_buffers(AV1_COMP *cpi) { "Failed to allocate scaled last source buffer"); } -static int alloc_context_buffers_ext(AV1_COMP *cpi) { +static void alloc_context_buffers_ext(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; int mi_size = cm->mi_cols * cm->mi_rows; - cpi->mbmi_ext_base = aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)); - if (!cpi->mbmi_ext_base) return 1; - - return 0; + CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base, + aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base))); } void av1_alloc_compressor_data(AV1_COMP *cpi) { @@ -902,7 +880,11 @@ static void set_tile_info(AV1_COMP *cpi) { #if CONFIG_DEPENDENT_HORZTILES cm->dependent_horz_tiles = cpi->oxcf.dependent_horz_tiles; +#if CONFIG_EXT_TILE + if (cm->tile_rows <= 1) cm->dependent_horz_tiles = 0; +#else if (cm->log2_tile_rows == 0) cm->dependent_horz_tiles = 0; +#endif #if CONFIG_TILE_GROUPS if (cpi->oxcf.mtu == 0) { cm->num_tg = cpi->oxcf.num_tile_groups; @@ -1194,48 +1176,53 @@ MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad4x4x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d) #if CONFIG_EXT_INTER -#define HIGHBD_MBFP(BT, MSDF, MVF, MSVF) \ - cpi->fn_ptr[BT].msdf = MSDF; \ - cpi->fn_ptr[BT].mvf = MVF; \ - cpi->fn_ptr[BT].msvf = MSVF; - -#define MAKE_MBFP_SAD_WRAPPER(fnname) \ - static unsigned int fnname##_bits8( \ - const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ - int ref_stride, const uint8_t *m, int m_stride) { \ - return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride); \ - } \ - static unsigned int fnname##_bits10( \ - const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ - int ref_stride, const uint8_t *m, int m_stride) { \ - return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride) >> \ - 2; \ - } \ - static unsigned int fnname##_bits12( \ - const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ - int ref_stride, const uint8_t *m, int m_stride) { \ - return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride) >> \ - 4; \ +#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \ + cpi->fn_ptr[BT].msdf = MCSDF; \ + cpi->fn_ptr[BT].msvf = MCSVF; + +#define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ + int m_stride, int invert_mask) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ + second_pred_ptr, m, m_stride, invert_mask); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ + int m_stride, int invert_mask) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ + second_pred_ptr, m, m_stride, invert_mask) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ + int m_stride, int 
invert_mask) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ + second_pred_ptr, m, m_stride, invert_mask) >> \ + 4; \ } #if CONFIG_EXT_PARTITION -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad128x128) -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad128x64) -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x128) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128) #endif // CONFIG_EXT_PARTITION -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x64) -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x32) -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x64) -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x32) -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x16) -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x32) -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x16) -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x8) -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x16) -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x8) -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x4) -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad4x8) -MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad4x4) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4) #endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR @@ -1401,54 +1388,38 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { #if CONFIG_EXT_INTER #if CONFIG_EXT_PARTITION HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8, - aom_highbd_masked_variance128x128, - aom_highbd_masked_sub_pixel_variance128x128) + aom_highbd_8_masked_sub_pixel_variance128x128) HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8, - aom_highbd_masked_variance128x64, - aom_highbd_masked_sub_pixel_variance128x64) + aom_highbd_8_masked_sub_pixel_variance128x64) HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8, - aom_highbd_masked_variance64x128, - aom_highbd_masked_sub_pixel_variance64x128) + aom_highbd_8_masked_sub_pixel_variance64x128) #endif // CONFIG_EXT_PARTITION HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8, - aom_highbd_masked_variance64x64, - aom_highbd_masked_sub_pixel_variance64x64) + aom_highbd_8_masked_sub_pixel_variance64x64) HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8, - aom_highbd_masked_variance64x32, - aom_highbd_masked_sub_pixel_variance64x32) + aom_highbd_8_masked_sub_pixel_variance64x32) HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits8, - aom_highbd_masked_variance32x64, - aom_highbd_masked_sub_pixel_variance32x64) + aom_highbd_8_masked_sub_pixel_variance32x64) HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits8, - aom_highbd_masked_variance32x32, - aom_highbd_masked_sub_pixel_variance32x32) + aom_highbd_8_masked_sub_pixel_variance32x32) HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits8, - 
aom_highbd_masked_variance32x16, - aom_highbd_masked_sub_pixel_variance32x16) + aom_highbd_8_masked_sub_pixel_variance32x16) HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits8, - aom_highbd_masked_variance16x32, - aom_highbd_masked_sub_pixel_variance16x32) + aom_highbd_8_masked_sub_pixel_variance16x32) HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits8, - aom_highbd_masked_variance16x16, - aom_highbd_masked_sub_pixel_variance16x16) + aom_highbd_8_masked_sub_pixel_variance16x16) HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits8, - aom_highbd_masked_variance8x16, - aom_highbd_masked_sub_pixel_variance8x16) + aom_highbd_8_masked_sub_pixel_variance8x16) HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits8, - aom_highbd_masked_variance16x8, - aom_highbd_masked_sub_pixel_variance16x8) + aom_highbd_8_masked_sub_pixel_variance16x8) HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits8, - aom_highbd_masked_variance8x8, - aom_highbd_masked_sub_pixel_variance8x8) + aom_highbd_8_masked_sub_pixel_variance8x8) HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits8, - aom_highbd_masked_variance4x8, - aom_highbd_masked_sub_pixel_variance4x8) + aom_highbd_8_masked_sub_pixel_variance4x8) HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits8, - aom_highbd_masked_variance8x4, - aom_highbd_masked_sub_pixel_variance8x4) + aom_highbd_8_masked_sub_pixel_variance8x4) HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8, - aom_highbd_masked_variance4x4, - aom_highbd_masked_sub_pixel_variance4x4) + aom_highbd_8_masked_sub_pixel_variance4x4) #endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR #if CONFIG_EXT_PARTITION @@ -1624,53 +1595,37 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { #if CONFIG_EXT_INTER #if CONFIG_EXT_PARTITION HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10, - aom_highbd_10_masked_variance128x128, aom_highbd_10_masked_sub_pixel_variance128x128) HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10, - aom_highbd_10_masked_variance128x64, aom_highbd_10_masked_sub_pixel_variance128x64) HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10, - aom_highbd_10_masked_variance64x128, aom_highbd_10_masked_sub_pixel_variance64x128) #endif // CONFIG_EXT_PARTITION HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10, - aom_highbd_10_masked_variance64x64, aom_highbd_10_masked_sub_pixel_variance64x64) HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits10, - aom_highbd_10_masked_variance64x32, aom_highbd_10_masked_sub_pixel_variance64x32) HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits10, - aom_highbd_10_masked_variance32x64, aom_highbd_10_masked_sub_pixel_variance32x64) HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits10, - aom_highbd_10_masked_variance32x32, aom_highbd_10_masked_sub_pixel_variance32x32) HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits10, - aom_highbd_10_masked_variance32x16, aom_highbd_10_masked_sub_pixel_variance32x16) HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits10, - aom_highbd_10_masked_variance16x32, aom_highbd_10_masked_sub_pixel_variance16x32) HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits10, - aom_highbd_10_masked_variance16x16, aom_highbd_10_masked_sub_pixel_variance16x16) HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits10, - aom_highbd_10_masked_variance8x16, aom_highbd_10_masked_sub_pixel_variance8x16) HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits10, - aom_highbd_10_masked_variance16x8, aom_highbd_10_masked_sub_pixel_variance16x8) HIGHBD_MBFP(BLOCK_8X8, 
aom_highbd_masked_sad8x8_bits10, - aom_highbd_10_masked_variance8x8, aom_highbd_10_masked_sub_pixel_variance8x8) HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits10, - aom_highbd_10_masked_variance4x8, aom_highbd_10_masked_sub_pixel_variance4x8) HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits10, - aom_highbd_10_masked_variance8x4, aom_highbd_10_masked_sub_pixel_variance8x4) HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10, - aom_highbd_10_masked_variance4x4, aom_highbd_10_masked_sub_pixel_variance4x4) #endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR @@ -1847,53 +1802,37 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { #if CONFIG_EXT_INTER #if CONFIG_EXT_PARTITION HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12, - aom_highbd_12_masked_variance128x128, aom_highbd_12_masked_sub_pixel_variance128x128) HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12, - aom_highbd_12_masked_variance128x64, aom_highbd_12_masked_sub_pixel_variance128x64) HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12, - aom_highbd_12_masked_variance64x128, aom_highbd_12_masked_sub_pixel_variance64x128) #endif // CONFIG_EXT_PARTITION HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12, - aom_highbd_12_masked_variance64x64, aom_highbd_12_masked_sub_pixel_variance64x64) HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits12, - aom_highbd_12_masked_variance64x32, aom_highbd_12_masked_sub_pixel_variance64x32) HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits12, - aom_highbd_12_masked_variance32x64, aom_highbd_12_masked_sub_pixel_variance32x64) HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits12, - aom_highbd_12_masked_variance32x32, aom_highbd_12_masked_sub_pixel_variance32x32) HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits12, - aom_highbd_12_masked_variance32x16, aom_highbd_12_masked_sub_pixel_variance32x16) HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits12, - aom_highbd_12_masked_variance16x32, aom_highbd_12_masked_sub_pixel_variance16x32) HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits12, - aom_highbd_12_masked_variance16x16, aom_highbd_12_masked_sub_pixel_variance16x16) HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits12, - aom_highbd_12_masked_variance8x16, aom_highbd_12_masked_sub_pixel_variance8x16) HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits12, - aom_highbd_12_masked_variance16x8, aom_highbd_12_masked_sub_pixel_variance16x8) HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits12, - aom_highbd_12_masked_variance8x8, aom_highbd_12_masked_sub_pixel_variance8x8) HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits12, - aom_highbd_12_masked_variance4x8, aom_highbd_12_masked_sub_pixel_variance4x8) HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits12, - aom_highbd_12_masked_variance8x4, aom_highbd_12_masked_sub_pixel_variance8x4) HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12, - aom_highbd_12_masked_variance4x4, aom_highbd_12_masked_sub_pixel_variance4x4) #endif // CONFIG_EXT_INTER @@ -1979,6 +1918,18 @@ static void realloc_segmentation_maps(AV1_COMP *cpi) { aom_calloc(cm->mi_rows * cm->mi_cols, 1)); } +#if CONFIG_EXT_INTER +void set_compound_tools(AV1_COMMON *cm) { + (void)cm; +#if CONFIG_INTERINTRA + cm->allow_interintra_compound = 1; +#endif // CONFIG_INTERINTRA +#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT + cm->allow_masked_compound = 1; +#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT +} +#endif // CONFIG_EXT_INTER + void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig 
*oxcf) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -1994,9 +1945,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { assert(cm->bit_depth > AOM_BITS_8); cpi->oxcf = *oxcf; -#if CONFIG_HIGHBITDEPTH cpi->td.mb.e_mbd.bd = (int)cm->bit_depth; -#endif // CONFIG_HIGHBITDEPTH #if CONFIG_GLOBAL_MOTION cpi->td.mb.e_mbd.global_motion = cm->global_motion; #endif // CONFIG_GLOBAL_MOTION @@ -2033,7 +1982,9 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { av1_setup_pc_tree(&cpi->common, &cpi->td); } #endif // CONFIG_PALETTE - +#if CONFIG_EXT_INTER + set_compound_tools(cm); +#endif // CONFIG_EXT_INTER av1_reset_segment_features(cm); av1_set_high_precision_mv(cpi, 0); @@ -2107,50 +2058,6 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { #endif // CONFIG_ANS && ANS_MAX_SYMBOLS } -#ifndef M_LOG2_E -#define M_LOG2_E 0.693147180559945309417 -#endif -#define log2f(x) (log(x) / (float)M_LOG2_E) - -#if !CONFIG_REF_MV -static void cal_nmvjointsadcost(int *mvjointsadcost) { - mvjointsadcost[0] = 600; - mvjointsadcost[1] = 300; - mvjointsadcost[2] = 300; - mvjointsadcost[3] = 300; -} -#endif - -static void cal_nmvsadcosts(int *mvsadcost[2]) { - int i = 1; - - mvsadcost[0][0] = 0; - mvsadcost[1][0] = 0; - - do { - double z = 256 * (2 * (log2f(8 * i) + .6)); - mvsadcost[0][i] = (int)z; - mvsadcost[1][i] = (int)z; - mvsadcost[0][-i] = (int)z; - mvsadcost[1][-i] = (int)z; - } while (++i <= MV_MAX); -} - -static void cal_nmvsadcosts_hp(int *mvsadcost[2]) { - int i = 1; - - mvsadcost[0][0] = 0; - mvsadcost[1][0] = 0; - - do { - double z = 256 * (2 * (log2f(8 * i) + .6)); - mvsadcost[0][i] = (int)z; - mvsadcost[1][i] = (int)z; - mvsadcost[0][-i] = (int)z; - mvsadcost[1][-i] = (int)z; - } while (++i <= MV_MAX); -} - static INLINE void init_upsampled_ref_frame_bufs(AV1_COMP *cpi) { int i; @@ -2192,6 +2099,11 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, cpi->resize_state = 0; cpi->resize_avg_qp = 0; cpi->resize_buffer_underflow = 0; + cpi->resize_scale_num = 16; + cpi->resize_scale_den = 16; + cpi->resize_next_scale_num = 16; + cpi->resize_next_scale_den = 16; + cpi->common.buffer_pool = pool; init_config(cpi, oxcf); @@ -2223,17 +2135,10 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, realloc_segmentation_maps(cpi); -#if CONFIG_REF_MV for (i = 0; i < NMV_CONTEXTS; ++i) { memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs)); memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp)); } -#endif - - memset(cpi->nmvcosts, 0, sizeof(cpi->nmvcosts)); - memset(cpi->nmvcosts_hp, 0, sizeof(cpi->nmvcosts_hp)); - memset(cpi->nmvsadcosts, 0, sizeof(cpi->nmvsadcosts)); - memset(cpi->nmvsadcosts_hp, 0, sizeof(cpi->nmvsadcosts_hp)); for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0])); i++) { @@ -2296,27 +2201,12 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, cpi->first_time_stamp_ever = INT64_MAX; -#if CONFIG_REF_MV for (i = 0; i < NMV_CONTEXTS; ++i) { cpi->td.mb.nmvcost[i][0] = &cpi->nmv_costs[i][0][MV_MAX]; cpi->td.mb.nmvcost[i][1] = &cpi->nmv_costs[i][1][MV_MAX]; cpi->td.mb.nmvcost_hp[i][0] = &cpi->nmv_costs_hp[i][0][MV_MAX]; cpi->td.mb.nmvcost_hp[i][1] = &cpi->nmv_costs_hp[i][1][MV_MAX]; } -#else - cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost); - cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX]; - cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX]; - cpi->td.mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX]; - cpi->td.mb.nmvcost_hp[1] = 
&cpi->nmvcosts_hp[1][MV_MAX]; -#endif - cpi->td.mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX]; - cpi->td.mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX]; - cal_nmvsadcosts(cpi->td.mb.nmvsadcost); - - cpi->td.mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX]; - cpi->td.mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX]; - cal_nmvsadcosts_hp(cpi->td.mb.nmvsadcost_hp); #ifdef OUTPUT_YUV_SKINMAP yuv_skinmap_file = fopen("skinmap.yuv", "ab"); @@ -2363,17 +2253,36 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, } #endif +#if CONFIG_MOTION_VAR +#if CONFIG_HIGHBITDEPTH + int buf_scaler = 2; +#else + int buf_scaler = 1; +#endif + CHECK_MEM_ERROR( + cm, cpi->td.mb.above_pred_buf, + (uint8_t *)aom_memalign(16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*cpi->td.mb.above_pred_buf))); + CHECK_MEM_ERROR( + cm, cpi->td.mb.left_pred_buf, + (uint8_t *)aom_memalign(16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*cpi->td.mb.left_pred_buf))); + + CHECK_MEM_ERROR(cm, cpi->td.mb.wsrc_buf, + (int32_t *)aom_memalign( + 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf))); + + CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf, + (int32_t *)aom_memalign( + 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf))); + +#endif + init_upsampled_ref_frame_bufs(cpi); av1_set_speed_features_framesize_independent(cpi); av1_set_speed_features_framesize_dependent(cpi); - // Allocate memory to store variances for a frame. - CHECK_MEM_ERROR(cm, cpi->source_diff_var, - aom_calloc(cm->MBs, sizeof(*cpi->source_diff_var))); - cpi->source_var_thresh = 0; - cpi->frames_till_next_var_check = 0; - #define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ cpi->fn_ptr[BT].sdf = SDF; \ cpi->fn_ptr[BT].sdaf = SDAF; \ @@ -2499,45 +2408,29 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, #endif // CONFIG_MOTION_VAR #if CONFIG_EXT_INTER -#define MBFP(BT, MSDF, MVF, MSVF) \ - cpi->fn_ptr[BT].msdf = MSDF; \ - cpi->fn_ptr[BT].mvf = MVF; \ - cpi->fn_ptr[BT].msvf = MSVF; +#define MBFP(BT, MCSDF, MCSVF) \ + cpi->fn_ptr[BT].msdf = MCSDF; \ + cpi->fn_ptr[BT].msvf = MCSVF; #if CONFIG_EXT_PARTITION - MBFP(BLOCK_128X128, aom_masked_sad128x128, aom_masked_variance128x128, + MBFP(BLOCK_128X128, aom_masked_sad128x128, aom_masked_sub_pixel_variance128x128) - MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_variance128x64, - aom_masked_sub_pixel_variance128x64) - MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_variance64x128, - aom_masked_sub_pixel_variance64x128) + MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_sub_pixel_variance128x64) + MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_sub_pixel_variance64x128) #endif // CONFIG_EXT_PARTITION - MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_variance64x64, - aom_masked_sub_pixel_variance64x64) - MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_variance64x32, - aom_masked_sub_pixel_variance64x32) - MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_variance32x64, - aom_masked_sub_pixel_variance32x64) - MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_variance32x32, - aom_masked_sub_pixel_variance32x32) - MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_variance32x16, - aom_masked_sub_pixel_variance32x16) - MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_variance16x32, - aom_masked_sub_pixel_variance16x32) - MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_variance16x16, - aom_masked_sub_pixel_variance16x16) - MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_variance16x8, - aom_masked_sub_pixel_variance16x8) - MBFP(BLOCK_8X16, aom_masked_sad8x16, 
aom_masked_variance8x16, - aom_masked_sub_pixel_variance8x16) - MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_variance8x8, - aom_masked_sub_pixel_variance8x8) - MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_variance4x8, - aom_masked_sub_pixel_variance4x8) - MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_variance8x4, - aom_masked_sub_pixel_variance8x4) - MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_variance4x4, - aom_masked_sub_pixel_variance4x4) + MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_sub_pixel_variance64x64) + MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_sub_pixel_variance64x32) + MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_sub_pixel_variance32x64) + MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_sub_pixel_variance32x32) + MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_sub_pixel_variance32x16) + MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_sub_pixel_variance16x32) + MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_sub_pixel_variance16x16) + MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_sub_pixel_variance16x8) + MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_sub_pixel_variance8x16) + MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_sub_pixel_variance8x8) + MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_sub_pixel_variance4x8) + MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4) + MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4) #endif // CONFIG_EXT_INTER #if CONFIG_HIGHBITDEPTH @@ -2555,6 +2448,9 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, #endif av1_loop_filter_init(cm); +#if CONFIG_FRAME_SUPERRES + cm->superres_scale_numerator = SUPERRES_SCALE_DENOMINATOR; +#endif // CONFIG_FRAME_SUPERRES #if CONFIG_LOOP_RESTORATION av1_loop_restoration_precal(); #endif // CONFIG_LOOP_RESTORATION @@ -2671,11 +2567,16 @@ void av1_remove_compressor(AV1_COMP *cpi) { if (t < cpi->num_workers - 1) { #if CONFIG_PALETTE if (cpi->common.allow_screen_content_tools) - aom_free(thread_data->td->mb.palette_buffer); + aom_free(thread_data->td->palette_buffer); #endif // CONFIG_PALETTE +#if CONFIG_MOTION_VAR + aom_free(thread_data->td->above_pred_buf); + aom_free(thread_data->td->left_pred_buf); + aom_free(thread_data->td->wsrc_buf); + aom_free(thread_data->td->mask_buf); +#endif // CONFIG_MOTION_VAR aom_free(thread_data->td->counts); av1_free_pc_tree(thread_data->td); - av1_free_var_tree(thread_data->td); aom_free(thread_data->td); } } @@ -2935,48 +2836,6 @@ void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) { #endif // OUTPUT_YUV_REC #if CONFIG_HIGHBITDEPTH -static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, - int bd) { -#else -static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { -#endif // CONFIG_HIGHBITDEPTH - // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t - int i; - const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, - src->v_buffer }; - const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; - const int src_widths[3] = { src->y_crop_width, src->uv_crop_width, - src->uv_crop_width }; - const int src_heights[3] = { src->y_crop_height, src->uv_crop_height, - src->uv_crop_height }; - uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; - const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; - const int dst_widths[3] = { dst->y_crop_width, dst->uv_crop_width, - dst->uv_crop_width }; - const int dst_heights[3] = { 
dst->y_crop_height, dst->uv_crop_height, - dst->uv_crop_height }; - - for (i = 0; i < MAX_MB_PLANE; ++i) { -#if CONFIG_HIGHBITDEPTH - if (src->flags & YV12_FLAG_HIGHBITDEPTH) { - av1_highbd_resize_plane(srcs[i], src_heights[i], src_widths[i], - src_strides[i], dsts[i], dst_heights[i], - dst_widths[i], dst_strides[i], bd); - } else { - av1_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i], - dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]); - } -#else - av1_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i], - dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]); -#endif // CONFIG_HIGHBITDEPTH - } - aom_extend_frame_borders(dst); -} - -#if CONFIG_HIGHBITDEPTH static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int planes, int bd) { @@ -3041,22 +2900,6 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, aom_extend_frame_borders(dst); } -static int scale_down(AV1_COMP *cpi, int q) { - RATE_CONTROL *const rc = &cpi->rc; - GF_GROUP *const gf_group = &cpi->twopass.gf_group; - int scale = 0; - assert(frame_is_kf_gf_arf(cpi)); - - if (rc->frame_size_selector == UNSCALED && - q >= rc->rf_level_maxq[gf_group->rf_level[gf_group->index]]) { - const int max_size_thresh = - (int)(rate_thresh_mult[SCALE_STEP1] * - AOMMAX(rc->this_frame_target, rc->avg_frame_bandwidth)); - scale = rc->projected_frame_size > max_size_thresh ? 1 : 0; - } - return scale; -} - #if CONFIG_GLOBAL_MOTION #define GM_RECODE_LOOP_NUM4X4_FACTOR 192 static int recode_loop_test_global_motion(AV1_COMP *cpi) { @@ -3070,11 +2913,8 @@ static int recode_loop_test_global_motion(AV1_COMP *cpi) { cpi->gmparams_cost[i]) { set_default_warp_params(&cm->global_motion[i]); cpi->gmparams_cost[i] = 0; -#if CONFIG_REF_MV recode = 1; -#else recode |= (rdc->global_motion_used[i] > 0); -#endif } } return recode; @@ -3093,13 +2933,6 @@ static int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, int q, if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || (cpi->sf.recode_loop == ALLOW_RECODE) || (frame_is_kfgfarf && (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) { - if (frame_is_kfgfarf && (oxcf->resize_mode == RESIZE_DYNAMIC) && - scale_down(cpi, q)) { - // Code this group at a lower resolution. - cpi->resize_pending = 1; - return 1; - } - // TODO(agrange) high_limit could be greater than the scale-down threshold. 
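The size check that immediately follows is the core of the two-sided recode test: re-encode when the projected frame size overshoots the high limit and q can still rise, or undershoots the low limit and q can still fall. A toy standalone version of that decision, with all thresholds as illustrative parameters:

#include <stdio.h>

/* Returns 1 if the frame should be re-encoded: it overshot and q can still
 * go up, or it undershot and q can still come down. */
static int recode_needed(int projected_size, int high_limit, int low_limit,
                         int q, int maxq, int minq) {
  const int overshoot = projected_size > high_limit && q < maxq;
  const int undershoot = projected_size < low_limit && q > minq;
  return overshoot || undershoot;
}

int main(void) {
  printf("%d\n", recode_needed(120000, 100000, 60000, 40, 63, 4)); /* 1 */
  printf("%d\n", recode_needed(80000, 100000, 60000, 40, 63, 4));  /* 0 */
  printf("%d\n", recode_needed(30000, 100000, 60000, 4, 63, 4));   /* 0: q at floor */
  return 0;
}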
if ((rc->projected_frame_size > high_limit && q < maxq) || (rc->projected_frame_size < low_limit && q > minq)) { @@ -3863,6 +3696,9 @@ static void set_size_independent_vars(AV1_COMP *cpi) { av1_set_rd_speed_thresholds(cpi); av1_set_rd_speed_thresholds_sub8x8(cpi); cpi->common.interp_filter = cpi->sf.default_interp_filter; +#if CONFIG_EXT_INTER + if (!frame_is_intra_only(&cpi->common)) set_compound_tools(&cpi->common); +#endif // CONFIG_EXT_INTER } static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, @@ -3916,43 +3752,52 @@ static void set_restoration_tilesize(int width, int height, } #endif // CONFIG_LOOP_RESTORATION -static void set_frame_size(AV1_COMP *cpi) { - int ref_frame; +static void set_scaled_size(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; AV1EncoderConfig *const oxcf = &cpi->oxcf; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - if (oxcf->pass == 2 && oxcf->rc_mode == AOM_VBR && - ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) || - (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) { - av1_calculate_coded_size(cpi, &oxcf->scaled_frame_width, - &oxcf->scaled_frame_height); - - // There has been a change in frame size. - av1_set_size_literal(cpi, oxcf->scaled_frame_width, - oxcf->scaled_frame_height); + // TODO(afergs): Replace with call to av1_resize_pending? Could replace + // scaled_size_set as well. + // TODO(afergs): Realistically, if resize_pending is true, then the other + // conditions must already be satisfied. + // Try this first: + // av1_resize_pending && + // (DYNAMIC && (1 Pass CBR || 2 Pass VBR) + // STATIC && FIRST_FRAME) + // Really, av1_resize_pending should just reflect the above. + // TODO(afergs): Allow fixed resizing in AOM_CBR mode? + // 2 Pass VBR: Resize if fixed resize and first frame, or dynamic resize and + // a resize is pending. + // 1 Pass CBR: Resize if dynamic resize and resize pending. + if ((oxcf->pass == 2 && oxcf->rc_mode == AOM_VBR && + ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) || + (oxcf->resize_mode == RESIZE_DYNAMIC && av1_resize_pending(cpi)))) || + (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR && + oxcf->resize_mode == RESIZE_DYNAMIC && av1_resize_pending(cpi))) { + // TODO(afergs): This feels hacky... Should it just set? Should + // av1_set_next_scaled_size be a library function? + av1_calculate_next_scaled_size(cpi, &oxcf->scaled_frame_width, + &oxcf->scaled_frame_height); } +} - if (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR && - oxcf->resize_mode == RESIZE_DYNAMIC) { - if (cpi->resize_pending == 1) { - oxcf->scaled_frame_width = - (cm->width * cpi->resize_scale_num) / cpi->resize_scale_den; - oxcf->scaled_frame_height = - (cm->height * cpi->resize_scale_num) / cpi->resize_scale_den; - } else if (cpi->resize_pending == -1) { - // Go back up to original size. - oxcf->scaled_frame_width = oxcf->width; - oxcf->scaled_frame_height = oxcf->height; - } - if (cpi->resize_pending != 0) { - // There has been a change in frame size. - av1_set_size_literal(cpi, oxcf->scaled_frame_width, - oxcf->scaled_frame_height); +static void set_frame_size(AV1_COMP *cpi, int width, int height) { + int ref_frame; + AV1_COMMON *const cm = &cpi->common; + AV1EncoderConfig *const oxcf = &cpi->oxcf; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. 
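Dynamic resizing in this file works from a rational scale factor (resize_scale_num over resize_scale_den, with 16/16 meaning unscaled), as in the removed width * resize_scale_num / resize_scale_den computation above. A minimal sketch of deriving a coded dimension from that ratio; the round-to-nearest behaviour and the even/minimum-size clamp are assumptions for illustration, not the exact rules of av1_calculate_next_scaled_size():

#include <stdio.h>

/* Scale a dimension by num/den with rounding to nearest, keeping it even
 * and at least 2 so chroma subsampling still works. */
static int scale_dim(int dim, int num, int den) {
  int scaled = (dim * num + den / 2) / den;
  scaled &= ~1; /* keep it even */
  return scaled < 2 ? 2 : scaled;
}

int main(void) {
  const int num = 12, den = 16; /* 3/4 downscale */
  printf("1920x1080 -> %dx%d\n", scale_dim(1920, num, den),
         scale_dim(1080, num, den));
  return 0;
}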
- set_mv_search_params(cpi); - } + if (width != cm->width || height != cm->height) { + // There has been a change in the encoded frame size + av1_set_size_literal(cpi, width, height); + + // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. + // TODO(afergs): Make condition just (pass == 0) or (rc_mode == CBR) - + // UNLESS CBR starts allowing FIXED resizing. Then the resize + // mode will need to get checked too. + if (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR && + oxcf->resize_mode == RESIZE_DYNAMIC) + set_mv_search_params(cpi); // TODO(afergs): Needed? Caller calls after... } #if !CONFIG_XIPHRC @@ -4012,10 +3857,33 @@ static void set_frame_size(AV1_COMP *cpi) { ref_buf->buf = NULL; } } +#if CONFIG_INTRABC +#if CONFIG_HIGHBITDEPTH + av1_setup_scale_factors_for_frame(&xd->sf_identity, cm->width, cm->height, + cm->width, cm->height, + cm->use_highbitdepth); +#else + av1_setup_scale_factors_for_frame(&xd->sf_identity, cm->width, cm->height, + cm->width, cm->height); +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_INTRABC set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } +static void setup_frame_size(AV1_COMP *cpi) { + set_scaled_size(cpi); +#if CONFIG_FRAME_SUPERRES + int encode_width; + int encode_height; + av1_calculate_superres_size(cpi, &encode_width, &encode_height); + set_frame_size(cpi, encode_width, encode_height); +#else + set_frame_size(cpi, cpi->oxcf.scaled_frame_width, + cpi->oxcf.scaled_frame_height); +#endif // CONFIG_FRAME_SUPERRES +} + static void reset_use_upsampled_references(AV1_COMP *cpi) { MV_REFERENCE_FRAME ref_frame; @@ -4039,7 +3907,15 @@ static void encode_without_recode_loop(AV1_COMP *cpi) { aom_clear_system_state(); - set_frame_size(cpi); +#if CONFIG_FRAME_SUPERRES + // TODO(afergs): Figure out when is actually a good time to do superres + cm->superres_scale_numerator = SUPERRES_SCALE_DENOMINATOR; + // (uint8_t)(rand() % 9 + SUPERRES_SCALE_NUMERATOR_MIN); + cpi->superres_pending = cpi->oxcf.superres_enabled && 0; +#endif // CONFIG_FRAME_SUPERRES + + setup_frame_size(cpi); + av1_resize_step(cpi); // For 1 pass CBR under dynamic resize mode: use faster scaling for source. // Only for 2x2 scaling for now. @@ -4075,19 +3951,9 @@ static void encode_without_recode_loop(AV1_COMP *cpi) { reset_use_upsampled_references(cpi); av1_set_quantizer(cm, q); - av1_set_variance_partition_thresholds(cpi, q); - setup_frame(cpi); - -#if CONFIG_SUBFRAME_PROB_UPDATE - cm->do_subframe_update = cm->tile_cols == 1 && cm->tile_rows == 1; - av1_copy(cm->starting_coef_probs, cm->fc->coef_probs); - av1_copy(cpi->subframe_stats.enc_starting_coef_probs, cm->fc->coef_probs); - cm->coef_probs_update_idx = 0; - av1_copy(cpi->subframe_stats.coef_probs_buf[0], cm->fc->coef_probs); -#endif // CONFIG_SUBFRAME_PROB_UPDATE - suppress_active_map(cpi); + // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. if (cpi->oxcf.aq_mode == VARIANCE_AQ) { @@ -4102,6 +3968,11 @@ static void encode_without_recode_loop(AV1_COMP *cpi) { // transform / motion compensation build reconstruction frame av1_encode_frame(cpi); +#if CONFIG_FRAME_SUPERRES + // TODO(afergs): Upscale the frame to show + cpi->superres_pending = 0; +#endif // CONFIG_FRAME_SUPERRES + // Update some stats from cyclic refresh, and check if we should not update // golden reference, for 1 pass CBR. 
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->frame_type != KEY_FRAME && @@ -4136,9 +4007,13 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, do { aom_clear_system_state(); - set_frame_size(cpi); + setup_frame_size(cpi); - if (loop_count == 0 || cpi->resize_pending != 0) { +#if CONFIG_FRAME_SUPERRES + if (loop_count == 0 || av1_resize_pending(cpi) || cpi->superres_pending) { +#else + if (loop_count == 0 || av1_resize_pending(cpi)) { +#endif // CONFIG_FRAME_SUPERRES set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); // cpi->sf.use_upsampled_references can be different from frame to frame. @@ -4159,8 +4034,8 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, undershoot_seen = 0; #endif - // Reconfiguration for change in frame size has concluded. - cpi->resize_pending = 0; + // Advance resize to next state now that updates are done + av1_resize_step(cpi); q_low = bottom_index; q_high = top_index; @@ -4208,26 +4083,6 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, } #endif // CONFIG_Q_ADAPT_PROBS -#if CONFIG_SUBFRAME_PROB_UPDATE - cm->do_subframe_update = cm->tile_cols == 1 && cm->tile_rows == 1; - if (loop_count == 0 || frame_is_intra_only(cm) || - cm->error_resilient_mode) { - av1_copy(cm->starting_coef_probs, cm->fc->coef_probs); - av1_copy(cpi->subframe_stats.enc_starting_coef_probs, cm->fc->coef_probs); - } else { - if (cm->do_subframe_update) { - av1_copy(cm->fc->coef_probs, - cpi->subframe_stats.enc_starting_coef_probs); - av1_copy(cm->starting_coef_probs, - cpi->subframe_stats.enc_starting_coef_probs); - av1_zero(cpi->subframe_stats.coef_counts_buf); - av1_zero(cpi->subframe_stats.eob_counts_buf); - } - } - cm->coef_probs_update_idx = 0; - av1_copy(cpi->subframe_stats.coef_probs_buf[0], cm->fc->coef_probs); -#endif // CONFIG_SUBFRAME_PROB_UPDATE - // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. if (cpi->oxcf.aq_mode == VARIANCE_AQ) { @@ -4318,23 +4173,9 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, int last_q = q; #if !CONFIG_XIPHRC int retries = 0; -#endif - if (cpi->resize_pending == 1) { - // Change in frame size so go back around the recode loop. - cpi->rc.frame_size_selector = - SCALE_STEP1 - cpi->rc.frame_size_selector; - cpi->rc.next_frame_size_selector = cpi->rc.frame_size_selector; + // TODO(afergs): Replace removed recode when av1_resize_pending is true -#if CONFIG_INTERNAL_STATS - ++cpi->tot_recode_hits; -#endif - ++loop_count; - loop = 1; - continue; - } - -#if !CONFIG_XIPHRC // Frame size out of permitted range: // Update correction factor & compute new Q to try... // Frame is too large @@ -4438,7 +4279,7 @@ static int get_ref_frame_flags(const AV1_COMP *cpi) { const int last3_is_last = map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[0]]; const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[0]]; -#if CONFIG_LOWDELAY_COMPOUND +#if CONFIG_ONE_SIDED_COMPOUND const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]]; const int last3_is_last2 = map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]]; @@ -4491,7 +4332,7 @@ static int get_ref_frame_flags(const AV1_COMP *cpi) { if (gld_is_last2 || gld_is_last3) flags &= ~AOM_GOLD_FLAG; -#if CONFIG_LOWDELAY_COMPOUND // Changes LL & HL bitstream +#if CONFIG_ONE_SIDED_COMPOUND // Changes LL & HL bitstream /* Allow biprediction between two identical frames (e.g. 
bwd_is_last = 1) */ if (bwd_is_alt && (flags & AOM_BWD_FLAG)) flags &= ~AOM_BWD_FLAG; #else @@ -4522,36 +4363,6 @@ static void set_ext_overrides(AV1_COMP *cpi) { } } -YV12_BUFFER_CONFIG *av1_scale_if_required_fast(AV1_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled) { - if (cm->mi_cols * MI_SIZE != unscaled->y_width || - cm->mi_rows * MI_SIZE != unscaled->y_height) { - // For 2x2 scaling down. - aom_scale_frame(unscaled, scaled, unscaled->y_buffer, 9, 2, 1, 2, 1, 0); - aom_extend_frame_borders(scaled); - return scaled; - } else { - return unscaled; - } -} - -YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled) { - if (cm->mi_cols * MI_SIZE != unscaled->y_width || - cm->mi_rows * MI_SIZE != unscaled->y_height) { -#if CONFIG_HIGHBITDEPTH - scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth); -#else - scale_and_extend_frame_nonnormative(unscaled, scaled); -#endif // CONFIG_HIGHBITDEPTH - return scaled; - } else { - return unscaled; - } -} - static void set_arf_sign_bias(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; int arf_sign_bias; @@ -5014,9 +4825,6 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, av1_accumulate_frame_counts(&aggregate_fc, &cm->counts); #endif // CONFIG_ENTROPY_STATS if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { -#if CONFIG_SUBFRAME_PROB_UPDATE - cm->partial_prob_update = 0; -#endif // CONFIG_SUBFRAME_PROB_UPDATE av1_adapt_coef_probs(cm); av1_adapt_intra_frame_probs(cm); #if CONFIG_EC_ADAPT @@ -5767,7 +5575,8 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, #else av1_rc_get_second_pass_params(cpi); } else if (oxcf->pass == 1) { - set_frame_size(cpi); + setup_frame_size(cpi); + av1_resize_step(cpi); } #endif @@ -5900,8 +5709,7 @@ int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode, return 0; } -int av1_set_size_literal(AV1_COMP *cpi, unsigned int width, - unsigned int height) { +int av1_set_size_literal(AV1_COMP *cpi, int width, int height) { AV1_COMMON *cm = &cpi->common; #if CONFIG_HIGHBITDEPTH check_initial_width(cpi, cm->use_highbitdepth, 1, 1); @@ -5909,21 +5717,20 @@ int av1_set_size_literal(AV1_COMP *cpi, unsigned int width, check_initial_width(cpi, 1, 1); #endif // CONFIG_HIGHBITDEPTH - if (width) { - cm->width = width; - if (cm->width > cpi->initial_width) { - cm->width = cpi->initial_width; - printf("Warning: Desired width too large, changed to %d\n", cm->width); - } + if (width <= 0 || height <= 0) return 1; + + cm->width = width; + if (cm->width > cpi->initial_width) { + cm->width = cpi->initial_width; + printf("Warning: Desired width too large, changed to %d\n", cm->width); } - if (height) { - cm->height = height; - if (cm->height > cpi->initial_height) { - cm->height = cpi->initial_height; - printf("Warning: Desired height too large, changed to %d\n", cm->height); - } + cm->height = height; + if (cm->height > cpi->initial_height) { + cm->height = cpi->initial_height; + printf("Warning: Desired height too large, changed to %d\n", cm->height); } + assert(cm->width <= cpi->initial_width); assert(cm->height <= cpi->initial_height); diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h index 4e7aef8fc..ee1257c2d 100644 --- a/third_party/aom/av1/encoder/encoder.h +++ b/third_party/aom/av1/encoder/encoder.h @@ -37,7 +37,6 @@ #include "av1/encoder/rd.h" #include "av1/encoder/speed_features.h" #include "av1/encoder/tokenize.h" -#include 
"av1/encoder/variance_tree.h" #if CONFIG_XIPHRC #include "av1/encoder/ratectrl_xiph.h" #endif @@ -54,15 +53,9 @@ extern "C" { #endif typedef struct { - int nmvjointcost[MV_JOINTS]; - int nmvcosts[2][MV_VALS]; - int nmvcosts_hp[2][MV_VALS]; - -#if CONFIG_REF_MV int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS]; int nmv_costs[NMV_CONTEXTS][2][MV_VALS]; int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS]; -#endif // 0 = Intra, Last, GF, ARF signed char last_ref_lf_deltas[TOTAL_REFS_PER_FRAME]; @@ -210,6 +203,11 @@ typedef struct AV1EncoderConfig { int scaled_frame_width; int scaled_frame_height; +#if CONFIG_FRAME_SUPERRES + // Frame Super-Resolution size scaling + int superres_enabled; +#endif // CONFIG_FRAME_SUPERRES + // Enable feature to reduce the frame quantization every x frames. int frame_periodic_boost; @@ -323,9 +321,16 @@ typedef struct ThreadData { PICK_MODE_CONTEXT *leaf_tree; PC_TREE *pc_tree; PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1]; +#if CONFIG_MOTION_VAR + int32_t *wsrc_buf; + int32_t *mask_buf; + uint8_t *above_pred_buf; + uint8_t *left_pred_buf; +#endif - VAR_TREE *var_tree; - VAR_TREE *var_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1]; +#if CONFIG_PALETTE + PALETTE_BUFFER *palette_buffer; +#endif // CONFIG_PALETTE } ThreadData; struct EncWorkerData; @@ -350,16 +355,6 @@ typedef struct { YV12_BUFFER_CONFIG buf; } EncRefCntBuffer; -#if CONFIG_SUBFRAME_PROB_UPDATE -typedef struct SUBFRAME_STATS { - av1_coeff_probs_model coef_probs_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES]; - av1_coeff_count coef_counts_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES]; - unsigned int eob_counts_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES][REF_TYPES] - [COEF_BANDS][COEFF_CONTEXTS]; - av1_coeff_probs_model enc_starting_coef_probs[TX_SIZES][PLANE_TYPES]; -} SUBFRAME_STATS; -#endif // CONFIG_SUBFRAME_PROB_UPDATE - typedef struct TileBufferEnc { uint8_t *data; size_t size; @@ -369,14 +364,7 @@ typedef struct AV1_COMP { QUANTS quants; ThreadData td; MB_MODE_INFO_EXT *mbmi_ext_base; - DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); // 8: SIMD width - DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); // 8: SIMD width -#if CONFIG_NEW_QUANT - DECLARE_ALIGNED(16, dequant_val_type_nuq, - y_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]); - DECLARE_ALIGNED(16, dequant_val_type_nuq, - uv_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]); -#endif // CONFIG_NEW_QUANT + Dequants dequants; AV1_COMMON common; AV1EncoderConfig oxcf; struct lookahead_ctx *lookahead; @@ -443,15 +431,8 @@ typedef struct AV1_COMP { CODING_CONTEXT coding_context; -#if CONFIG_REF_MV int nmv_costs[NMV_CONTEXTS][2][MV_VALS]; int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS]; -#endif - - int nmvcosts[2][MV_VALS]; - int nmvcosts_hp[2][MV_VALS]; - int nmvsadcosts[2][MV_VALS]; - int nmvsadcosts_hp[2][MV_VALS]; int64_t last_time_stamp_seen; int64_t last_end_time_stamp_seen; @@ -543,29 +524,23 @@ typedef struct AV1_COMP { // number of MBs in the current frame when the frame is // scaled. - // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type. - DIFF *source_diff_var; - // The threshold used in SOURCE_VAR_BASED_PARTITION search type. 
- unsigned int source_var_thresh; - int frames_till_next_var_check; - int frame_flags; search_site_config ss_cfg; int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES]; -#if CONFIG_REF_MV int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2]; int zeromv_mode_cost[ZEROMV_MODE_CONTEXTS][2]; int refmv_mode_cost[REFMV_MODE_CONTEXTS][2]; int drl_mode_cost0[DRL_MODE_CONTEXTS][2]; -#endif unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES]; #if CONFIG_EXT_INTER unsigned int inter_compound_mode_cost[INTER_MODE_CONTEXTS] [INTER_COMPOUND_MODES]; +#if CONFIG_INTERINTRA unsigned int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; +#endif // CONFIG_INTERINTRA #endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION int motion_mode_cost[BLOCK_SIZES][MOTION_MODES]; @@ -625,24 +600,18 @@ typedef struct AV1_COMP { TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; - int resize_pending; int resize_state; int resize_scale_num; int resize_scale_den; + int resize_next_scale_num; + int resize_next_scale_den; int resize_avg_qp; int resize_buffer_underflow; int resize_count; - // VAR_BASED_PARTITION thresholds - // 0 - threshold_128x128; - // 1 - threshold_64x64; - // 2 - threshold_32x32; - // 3 - threshold_16x16; - // 4 - threshold_8x8; - int64_t vbp_thresholds[5]; - int64_t vbp_threshold_minmax; - int64_t vbp_threshold_sad; - BLOCK_SIZE vbp_bsize_min; +#if CONFIG_FRAME_SUPERRES + int superres_pending; +#endif // CONFIG_FRAME_SUPERRES // VARIANCE_AQ segment map refresh int vaq_refresh; @@ -652,12 +621,6 @@ typedef struct AV1_COMP { AVxWorker *workers; struct EncWorkerData *tile_thr_data; AV1LfSync lf_row_sync; -#if CONFIG_SUBFRAME_PROB_UPDATE - SUBFRAME_STATS subframe_stats; - // TODO(yaowu): minimize the size of count buffers - SUBFRAME_STATS wholeframe_stats; - av1_coeff_stats branch_ct_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES]; -#endif // CONFIG_SUBFRAME_PROB_UPDATE #if CONFIG_ANS struct BufAnsCoder buf_ans; #endif @@ -720,8 +683,8 @@ int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols); int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode, AOM_SCALING vert_mode); -int av1_set_size_literal(AV1_COMP *cpi, unsigned int width, - unsigned int height); +// Returns 1 if the assigned width or height was <= 0. +int av1_set_size_literal(AV1_COMP *cpi, int width, int height); int av1_get_quantizer(struct AV1_COMP *cpi); @@ -774,7 +737,7 @@ static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref( return &cpi->upsampled_ref_bufs[buf_idx].buf; } -#if CONFIG_EXT_REFS +#if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) { MV_REFERENCE_FRAME ref_frame; AV1_COMMON *const cm = &cpi->common; @@ -819,14 +782,6 @@ void av1_set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv); void av1_set_temporal_mv_prediction(AV1_COMP *cpi, int allow_tempmv_prediction); #endif -YV12_BUFFER_CONFIG *av1_scale_if_required_fast(AV1_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled); - -YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled); - void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags); static INLINE int is_altref_enabled(const AV1_COMP *const cpi) { @@ -876,6 +831,25 @@ static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx, ubufs[new_uidx].ref_count++; } +// Returns 1 if a resize is pending and 0 otherwise. 
+static INLINE int av1_resize_pending(const struct AV1_COMP *cpi) { + return cpi->resize_scale_num != cpi->resize_next_scale_num || + cpi->resize_scale_den != cpi->resize_next_scale_den; +} + +// Returns 1 if a frame is unscaled and 0 otherwise. +static INLINE int av1_resize_unscaled(const struct AV1_COMP *cpi) { + return cpi->resize_scale_num == cpi->resize_scale_den; +} + +// Moves resizing to the next state. This is just setting the numerator and +// denominator to the next numerator and denominator, causing +// av1_resize_pending to subsequently return false. +static INLINE void av1_resize_step(struct AV1_COMP *cpi) { + cpi->resize_scale_num = cpi->resize_next_scale_num; + cpi->resize_scale_den = cpi->resize_next_scale_den; +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c index 3f71a4472..731642064 100644 --- a/third_party/aom/av1/encoder/encodetxb.c +++ b/third_party/aom/av1/encoder/encodetxb.c @@ -21,6 +21,8 @@ #include "av1/encoder/subexp.h" #include "av1/encoder/tokenize.h" +#define TEST_OPTIMIZE_TXB 0 + void av1_alloc_txb_buf(AV1_COMP *cpi) { #if 0 AV1_COMMON *cm = &cpi->common; @@ -159,7 +161,7 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, } // level is above 1. - ctx = get_level_ctx(tcoeff, scan[c], bwl); + ctx = get_br_ctx(tcoeff, scan[c], bwl); for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) { if (level == (idx + 1 + NUM_BASE_LEVELS)) { aom_write(w, 1, cm->fc->coeff_lps[tx_size][plane_type][ctx]); @@ -251,6 +253,32 @@ static INLINE void get_base_ctx_set(const tran_low_t *tcoeffs, return; } +static INLINE int get_br_cost(tran_low_t abs_qc, int ctx, + const aom_prob *coeff_lps) { + const tran_low_t min_level = 1 + NUM_BASE_LEVELS; + const tran_low_t max_level = 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE; + if (abs_qc >= min_level) { + const int cost0 = av1_cost_bit(coeff_lps[ctx], 0); + const int cost1 = av1_cost_bit(coeff_lps[ctx], 1); + if (abs_qc >= max_level) + return COEFF_BASE_RANGE * cost0; + else + return (abs_qc - min_level) * cost0 + cost1; + } else { + return 0; + } +} + +static INLINE int get_base_cost(tran_low_t abs_qc, int ctx, + aom_prob (*coeff_base)[COEFF_BASE_CONTEXTS], + int base_idx) { + const int level = base_idx + 1; + if (abs_qc < level) + return 0; + else + return av1_cost_bit(coeff_base[base_idx][ctx], abs_qc == level); +} + int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, int block, TXB_CTX *txb_ctx) { const AV1_COMMON *const cm = &cpi->common; @@ -331,7 +359,7 @@ int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, int idx; int ctx; - ctx = get_level_ctx(qcoeff, scan[c], bwl); + ctx = get_br_ctx(qcoeff, scan[c], bwl); for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) { if (level == (idx + 1 + NUM_BASE_LEVELS)) { @@ -373,12 +401,1085 @@ int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, return cost; } -typedef struct TxbParams { - const AV1_COMP *cpi; - ThreadData *td; - int rate; -} TxbParams; +static INLINE int has_base(tran_low_t qc, int base_idx) { + const int level = base_idx + 1; + return abs(qc) >= level; +} + +static void gen_base_count_mag_arr(int (*base_count_arr)[MAX_TX_SQUARE], + int (*base_mag_arr)[2], + const tran_low_t *qcoeff, int stride, + int eob, const int16_t *scan) { + for (int c = 0; c < eob; ++c) { + const int coeff_idx = scan[c]; // raster order + if (!has_base(qcoeff[coeff_idx], 0)) continue; + const int row = coeff_idx / stride; + const int col 
= coeff_idx % stride; + int *mag = base_mag_arr[coeff_idx]; + get_mag(mag, qcoeff, stride, row, col, base_ref_offset, + BASE_CONTEXT_POSITION_NUM); + for (int i = 0; i < NUM_BASE_LEVELS; ++i) { + if (!has_base(qcoeff[coeff_idx], i)) continue; + int *count = base_count_arr[i] + coeff_idx; + *count = get_level_count(qcoeff, stride, row, col, i, base_ref_offset, + BASE_CONTEXT_POSITION_NUM); + } + } +} + +static void gen_nz_count_arr(int(*nz_count_arr), const tran_low_t *qcoeff, + int stride, int eob, + const SCAN_ORDER *scan_order) { + const int16_t *scan = scan_order->scan; + const int16_t *iscan = scan_order->iscan; + for (int c = 0; c < eob; ++c) { + const int coeff_idx = scan[c]; // raster order + const int row = coeff_idx / stride; + const int col = coeff_idx % stride; + nz_count_arr[coeff_idx] = get_nz_count(qcoeff, stride, row, col, iscan); + } +} + +static void gen_nz_ctx_arr(int (*nz_ctx_arr)[2], int(*nz_count_arr), + const tran_low_t *qcoeff, int bwl, int eob, + const SCAN_ORDER *scan_order) { + const int16_t *scan = scan_order->scan; + const int16_t *iscan = scan_order->iscan; + for (int c = 0; c < eob; ++c) { + const int coeff_idx = scan[c]; // raster order + const int count = nz_count_arr[coeff_idx]; + nz_ctx_arr[coeff_idx][0] = + get_nz_map_ctx_from_count(count, qcoeff, coeff_idx, bwl, iscan); + } +} + +static void gen_base_ctx_arr(int (*base_ctx_arr)[MAX_TX_SQUARE][2], + int (*base_count_arr)[MAX_TX_SQUARE], + int (*base_mag_arr)[2], const tran_low_t *qcoeff, + int stride, int eob, const int16_t *scan) { + (void)qcoeff; + for (int i = 0; i < NUM_BASE_LEVELS; ++i) { + for (int c = 0; c < eob; ++c) { + const int coeff_idx = scan[c]; // raster order + if (!has_base(qcoeff[coeff_idx], i)) continue; + const int row = coeff_idx / stride; + const int col = coeff_idx % stride; + const int count = base_count_arr[i][coeff_idx]; + const int *mag = base_mag_arr[coeff_idx]; + const int level = i + 1; + base_ctx_arr[i][coeff_idx][0] = + get_base_ctx_from_count_mag(row, col, count, mag[0], level); + } + } +} + +static INLINE int has_br(tran_low_t qc) { + return abs(qc) >= 1 + NUM_BASE_LEVELS; +} + +static void gen_br_count_mag_arr(int *br_count_arr, int (*br_mag_arr)[2], + const tran_low_t *qcoeff, int stride, int eob, + const int16_t *scan) { + for (int c = 0; c < eob; ++c) { + const int coeff_idx = scan[c]; // raster order + if (!has_br(qcoeff[coeff_idx])) continue; + const int row = coeff_idx / stride; + const int col = coeff_idx % stride; + int *count = br_count_arr + coeff_idx; + int *mag = br_mag_arr[coeff_idx]; + *count = get_level_count(qcoeff, stride, row, col, NUM_BASE_LEVELS, + br_ref_offset, BR_CONTEXT_POSITION_NUM); + get_mag(mag, qcoeff, stride, row, col, br_ref_offset, + BR_CONTEXT_POSITION_NUM); + } +} + +static void gen_br_ctx_arr(int (*br_ctx_arr)[2], const int *br_count_arr, + int (*br_mag_arr)[2], const tran_low_t *qcoeff, + int stride, int eob, const int16_t *scan) { + (void)qcoeff; + for (int c = 0; c < eob; ++c) { + const int coeff_idx = scan[c]; // raster order + if (!has_br(qcoeff[coeff_idx])) continue; + const int row = coeff_idx / stride; + const int col = coeff_idx % stride; + const int count = br_count_arr[coeff_idx]; + const int *mag = br_mag_arr[coeff_idx]; + br_ctx_arr[coeff_idx][0] = + get_br_ctx_from_count_mag(row, col, count, mag[0]); + } +} + +static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx, + const aom_prob *dc_sign_prob, + int dc_sign_ctx) { + const int sign = (qc < 0) ? 
1 : 0; + // sign bit cost + if (coeff_idx == 0) { + return av1_cost_bit(dc_sign_prob[dc_sign_ctx], sign); + } else { + return av1_cost_bit(128, sign); + } +} +static INLINE int get_golomb_cost(int abs_qc) { + if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { + // residual cost + int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS; + int ri = r; + int length = 0; + + while (ri) { + ri >>= 1; + ++length; + } + + return av1_cost_literal(2 * length - 1); + } else { + return 0; + } +} + +// TODO(angiebird): add static once this function is called +void gen_txb_cache(TxbCache *txb_cache, TxbInfo *txb_info) { + const int16_t *scan = txb_info->scan_order->scan; + gen_nz_count_arr(txb_cache->nz_count_arr, txb_info->qcoeff, txb_info->stride, + txb_info->eob, txb_info->scan_order); + gen_nz_ctx_arr(txb_cache->nz_ctx_arr, txb_cache->nz_count_arr, + txb_info->qcoeff, txb_info->bwl, txb_info->eob, + txb_info->scan_order); + gen_base_count_mag_arr(txb_cache->base_count_arr, txb_cache->base_mag_arr, + txb_info->qcoeff, txb_info->stride, txb_info->eob, + scan); + gen_base_ctx_arr(txb_cache->base_ctx_arr, txb_cache->base_count_arr, + txb_cache->base_mag_arr, txb_info->qcoeff, txb_info->stride, + txb_info->eob, scan); + gen_br_count_mag_arr(txb_cache->br_count_arr, txb_cache->br_mag_arr, + txb_info->qcoeff, txb_info->stride, txb_info->eob, scan); + gen_br_ctx_arr(txb_cache->br_ctx_arr, txb_cache->br_count_arr, + txb_cache->br_mag_arr, txb_info->qcoeff, txb_info->stride, + txb_info->eob, scan); +} + +static INLINE aom_prob get_level_prob(int level, int coeff_idx, + const TxbCache *txb_cache, + const TxbProbs *txb_probs) { + if (level == 0) { + const int ctx = txb_cache->nz_ctx_arr[coeff_idx][0]; + return txb_probs->nz_map[ctx]; + } else if (level >= 1 && level < 1 + NUM_BASE_LEVELS) { + const int idx = level - 1; + const int ctx = txb_cache->base_ctx_arr[idx][coeff_idx][0]; + return txb_probs->coeff_base[idx][ctx]; + } else if (level >= 1 + NUM_BASE_LEVELS && + level < 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { + const int ctx = txb_cache->br_ctx_arr[coeff_idx][0]; + return txb_probs->coeff_lps[ctx]; + } else if (level >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { + printf("get_level_prob does not support golomb\n"); + assert(0); + return 0; + } else { + assert(0); + return 0; + } +} + +static INLINE tran_low_t get_lower_coeff(tran_low_t qc) { + if (qc == 0) { + return 0; + } + return qc > 0 ? 
qc - 1 : qc + 1; +} + +static INLINE void update_mag_arr(int *mag_arr, int abs_qc) { + if (mag_arr[0] == abs_qc) { + mag_arr[1] -= 1; + assert(mag_arr[1] >= 0); + } +} + +static INLINE int get_mag_from_mag_arr(const int *mag_arr) { + int mag; + if (mag_arr[1] > 0) { + mag = mag_arr[0]; + } else if (mag_arr[0] > 0) { + mag = mag_arr[0] - 1; + } else { + // no neighbor + assert(mag_arr[0] == 0 && mag_arr[1] == 0); + mag = 0; + } + return mag; +} + +static int neighbor_level_down_update(int *new_count, int *new_mag, int count, + const int *mag, int coeff_idx, + tran_low_t abs_nb_coeff, int nb_coeff_idx, + int level, const TxbInfo *txb_info) { + *new_count = count; + *new_mag = get_mag_from_mag_arr(mag); + + int update = 0; + // check if br_count changes + if (abs_nb_coeff == level) { + update = 1; + *new_count -= 1; + assert(*new_count >= 0); + } + const int row = coeff_idx >> txb_info->bwl; + const int col = coeff_idx - (row << txb_info->bwl); + const int nb_row = nb_coeff_idx >> txb_info->bwl; + const int nb_col = nb_coeff_idx - (nb_row << txb_info->bwl); + + // check if mag changes + if (nb_row >= row && nb_col >= col) { + if (abs_nb_coeff == mag[0]) { + assert(mag[1] > 0); + if (mag[1] == 1) { + // the nb is the only qc with max mag + *new_mag -= 1; + assert(*new_mag >= 0); + update = 1; + } + } + } + return update; +} + +static int try_neighbor_level_down_br(int coeff_idx, int nb_coeff_idx, + const TxbCache *txb_cache, + const TxbProbs *txb_probs, + const TxbInfo *txb_info) { + const tran_low_t qc = txb_info->qcoeff[coeff_idx]; + const tran_low_t abs_qc = abs(qc); + const int level = NUM_BASE_LEVELS + 1; + if (abs_qc < level) return 0; + + const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; + const tran_low_t abs_nb_coeff = abs(nb_coeff); + const int count = txb_cache->br_count_arr[coeff_idx]; + const int *mag = txb_cache->br_mag_arr[coeff_idx]; + int new_count; + int new_mag; + const int update = + neighbor_level_down_update(&new_count, &new_mag, count, mag, coeff_idx, + abs_nb_coeff, nb_coeff_idx, level, txb_info); + if (update) { + const int row = coeff_idx >> txb_info->bwl; + const int col = coeff_idx - (row << txb_info->bwl); + const int ctx = txb_cache->br_ctx_arr[coeff_idx][0]; + const int org_cost = get_br_cost(abs_qc, ctx, txb_probs->coeff_lps); + + const int new_ctx = get_br_ctx_from_count_mag(row, col, new_count, new_mag); + const int new_cost = get_br_cost(abs_qc, new_ctx, txb_probs->coeff_lps); + const int cost_diff = -org_cost + new_cost; + return cost_diff; + } else { + return 0; + } +} + +static int try_neighbor_level_down_base(int coeff_idx, int nb_coeff_idx, + const TxbCache *txb_cache, + const TxbProbs *txb_probs, + const TxbInfo *txb_info) { + const tran_low_t qc = txb_info->qcoeff[coeff_idx]; + const tran_low_t abs_qc = abs(qc); + + int cost_diff = 0; + for (int base_idx = 0; base_idx < NUM_BASE_LEVELS; ++base_idx) { + const int level = base_idx + 1; + if (abs_qc < level) continue; + + const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; + const tran_low_t abs_nb_coeff = abs(nb_coeff); + + const int count = txb_cache->base_count_arr[base_idx][coeff_idx]; + const int *mag = txb_cache->base_mag_arr[coeff_idx]; + int new_count; + int new_mag; + const int update = + neighbor_level_down_update(&new_count, &new_mag, count, mag, coeff_idx, + abs_nb_coeff, nb_coeff_idx, level, txb_info); + if (update) { + const int row = coeff_idx >> txb_info->bwl; + const int col = coeff_idx - (row << txb_info->bwl); + const int ctx = 
txb_cache->base_ctx_arr[base_idx][coeff_idx][0]; + const int org_cost = + get_base_cost(abs_qc, ctx, txb_probs->coeff_base, base_idx); + + const int new_ctx = + get_base_ctx_from_count_mag(row, col, new_count, new_mag, level); + const int new_cost = + get_base_cost(abs_qc, new_ctx, txb_probs->coeff_base, base_idx); + cost_diff += -org_cost + new_cost; + } + } + return cost_diff; +} + +static int try_neighbor_level_down_nz(int coeff_idx, int nb_coeff_idx, + const TxbCache *txb_cache, + const TxbProbs *txb_probs, + TxbInfo *txb_info) { + // assume eob doesn't change + const tran_low_t qc = txb_info->qcoeff[coeff_idx]; + const tran_low_t abs_qc = abs(qc); + const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; + const tran_low_t abs_nb_coeff = abs(nb_coeff); + if (abs_nb_coeff != 1) return 0; + const int16_t *iscan = txb_info->scan_order->iscan; + const int scan_idx = iscan[coeff_idx]; + if (scan_idx == txb_info->seg_eob) return 0; + const int nb_scan_idx = iscan[nb_coeff_idx]; + if (nb_scan_idx < scan_idx) { + const int count = txb_cache->nz_count_arr[coeff_idx]; + assert(count > 0); + txb_info->qcoeff[nb_coeff_idx] = get_lower_coeff(nb_coeff); + const int new_ctx = get_nz_map_ctx_from_count( + count - 1, txb_info->qcoeff, coeff_idx, txb_info->bwl, iscan); + txb_info->qcoeff[nb_coeff_idx] = nb_coeff; + const int ctx = txb_cache->nz_ctx_arr[coeff_idx][0]; + const int is_nz = abs_qc > 0; + const int org_cost = av1_cost_bit(txb_probs->nz_map[ctx], is_nz); + const int new_cost = av1_cost_bit(txb_probs->nz_map[new_ctx], is_nz); + const int cost_diff = new_cost - org_cost; + return cost_diff; + } else { + return 0; + } +} + +static int try_self_level_down(tran_low_t *low_coeff, int coeff_idx, + const TxbCache *txb_cache, + const TxbProbs *txb_probs, TxbInfo *txb_info) { + const tran_low_t qc = txb_info->qcoeff[coeff_idx]; + if (qc == 0) { + *low_coeff = 0; + return 0; + } + const tran_low_t abs_qc = abs(qc); + *low_coeff = get_lower_coeff(qc); + int cost_diff; + if (*low_coeff == 0) { + const int scan_idx = txb_info->scan_order->iscan[coeff_idx]; + const aom_prob level_prob = + get_level_prob(abs_qc, coeff_idx, txb_cache, txb_probs); + const aom_prob low_level_prob = + get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_probs); + if (scan_idx < txb_info->seg_eob) { + // When level-0, we code the binary of abs_qc > level + // but when level-k k > 0 we code the binary of abs_qc == level + // That's why wee need this special treatment for level-0 map + // TODO(angiebird): make leve-0 consistent to other levels + cost_diff = -av1_cost_bit(level_prob, 1) + + av1_cost_bit(low_level_prob, 0) - + av1_cost_bit(low_level_prob, 1); + } else { + cost_diff = -av1_cost_bit(level_prob, 1); + } + + if (scan_idx < txb_info->seg_eob) { + const int eob_ctx = + get_eob_ctx(txb_info->qcoeff, coeff_idx, txb_info->bwl); + cost_diff -= av1_cost_bit(txb_probs->eob_flag[eob_ctx], + scan_idx == (txb_info->eob - 1)); + } + + const int sign_cost = get_sign_bit_cost( + qc, coeff_idx, txb_probs->dc_sign_prob, txb_info->txb_ctx->dc_sign_ctx); + cost_diff -= sign_cost; + } else if (abs_qc < 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { + const aom_prob level_prob = + get_level_prob(abs_qc, coeff_idx, txb_cache, txb_probs); + const aom_prob low_level_prob = + get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_probs); + cost_diff = -av1_cost_bit(level_prob, 1) + av1_cost_bit(low_level_prob, 1) - + av1_cost_bit(low_level_prob, 0); + } else if (abs_qc == 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { + const aom_prob 
low_level_prob = + get_level_prob(abs(*low_coeff), coeff_idx, txb_cache, txb_probs); + cost_diff = -get_golomb_cost(abs_qc) + av1_cost_bit(low_level_prob, 1) - + av1_cost_bit(low_level_prob, 0); + } else { + assert(abs_qc > 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE); + const tran_low_t abs_low_coeff = abs(*low_coeff); + cost_diff = -get_golomb_cost(abs_qc) + get_golomb_cost(abs_low_coeff); + } + return cost_diff; +} + +#define COST_MAP_SIZE 5 +#define COST_MAP_OFFSET 2 + +static INLINE int check_nz_neighbor(tran_low_t qc) { return abs(qc) == 1; } + +static INLINE int check_base_neighbor(tran_low_t qc) { + return abs(qc) <= 1 + NUM_BASE_LEVELS; +} + +static INLINE int check_br_neighbor(tran_low_t qc) { + return abs(qc) > BR_MAG_OFFSET; +} + +// TODO(angiebird): add static to this function once it's called +int try_level_down(int coeff_idx, const TxbCache *txb_cache, + const TxbProbs *txb_probs, TxbInfo *txb_info, + int (*cost_map)[COST_MAP_SIZE]) { + if (cost_map) { + for (int i = 0; i < COST_MAP_SIZE; ++i) av1_zero(cost_map[i]); + } + + tran_low_t qc = txb_info->qcoeff[coeff_idx]; + tran_low_t low_coeff; + if (qc == 0) return 0; + int accu_cost_diff = 0; + + const int16_t *iscan = txb_info->scan_order->iscan; + const int eob = txb_info->eob; + const int scan_idx = iscan[coeff_idx]; + if (scan_idx < eob) { + const int cost_diff = try_self_level_down(&low_coeff, coeff_idx, txb_cache, + txb_probs, txb_info); + if (cost_map) + cost_map[0 + COST_MAP_OFFSET][0 + COST_MAP_OFFSET] = cost_diff; + accu_cost_diff += cost_diff; + } + + const int row = coeff_idx >> txb_info->bwl; + const int col = coeff_idx - (row << txb_info->bwl); + if (check_nz_neighbor(qc)) { + for (int i = 0; i < SIG_REF_OFFSET_NUM; ++i) { + const int nb_row = row - sig_ref_offset[i][0]; + const int nb_col = col - sig_ref_offset[i][1]; + const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + const int nb_scan_idx = iscan[nb_coeff_idx]; + if (nb_scan_idx < eob && nb_row >= 0 && nb_col >= 0 && + nb_row < txb_info->stride && nb_col < txb_info->stride) { + const int cost_diff = try_neighbor_level_down_nz( + nb_coeff_idx, coeff_idx, txb_cache, txb_probs, txb_info); + if (cost_map) + cost_map[nb_row - row + COST_MAP_OFFSET] + [nb_col - col + COST_MAP_OFFSET] += cost_diff; + accu_cost_diff += cost_diff; + } + } + } + + if (check_base_neighbor(qc)) { + for (int i = 0; i < BASE_CONTEXT_POSITION_NUM; ++i) { + const int nb_row = row - base_ref_offset[i][0]; + const int nb_col = col - base_ref_offset[i][1]; + const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + const int nb_scan_idx = iscan[nb_coeff_idx]; + if (nb_scan_idx < eob && nb_row >= 0 && nb_col >= 0 && + nb_row < txb_info->stride && nb_col < txb_info->stride) { + const int cost_diff = try_neighbor_level_down_base( + nb_coeff_idx, coeff_idx, txb_cache, txb_probs, txb_info); + if (cost_map) + cost_map[nb_row - row + COST_MAP_OFFSET] + [nb_col - col + COST_MAP_OFFSET] += cost_diff; + accu_cost_diff += cost_diff; + } + } + } + + if (check_br_neighbor(qc)) { + for (int i = 0; i < BR_CONTEXT_POSITION_NUM; ++i) { + const int nb_row = row - br_ref_offset[i][0]; + const int nb_col = col - br_ref_offset[i][1]; + const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + const int nb_scan_idx = iscan[nb_coeff_idx]; + if (nb_scan_idx < eob && nb_row >= 0 && nb_col >= 0 && + nb_row < txb_info->stride && nb_col < txb_info->stride) { + const int cost_diff = try_neighbor_level_down_br( + nb_coeff_idx, coeff_idx, txb_cache, txb_probs, txb_info); + if (cost_map) + cost_map[nb_row - 
row + COST_MAP_OFFSET] + [nb_col - col + COST_MAP_OFFSET] += cost_diff; + accu_cost_diff += cost_diff; + } + } + } + + return accu_cost_diff; +} + +static int get_low_coeff_cost(int coeff_idx, const TxbCache *txb_cache, + const TxbProbs *txb_probs, + const TxbInfo *txb_info) { + const tran_low_t qc = txb_info->qcoeff[coeff_idx]; + const int abs_qc = abs(qc); + assert(abs_qc <= 1); + int cost = 0; + const int scan_idx = txb_info->scan_order->iscan[coeff_idx]; + if (scan_idx < txb_info->seg_eob) { + const aom_prob level_prob = + get_level_prob(0, coeff_idx, txb_cache, txb_probs); + cost += av1_cost_bit(level_prob, qc != 0); + } + + if (qc != 0) { + const int base_idx = 0; + const int ctx = txb_cache->base_ctx_arr[base_idx][coeff_idx][0]; + cost += get_base_cost(abs_qc, ctx, txb_probs->coeff_base, base_idx); + if (scan_idx < txb_info->seg_eob) { + const int eob_ctx = + get_eob_ctx(txb_info->qcoeff, coeff_idx, txb_info->bwl); + cost += av1_cost_bit(txb_probs->eob_flag[eob_ctx], + scan_idx == (txb_info->eob - 1)); + } + cost += get_sign_bit_cost(qc, coeff_idx, txb_probs->dc_sign_prob, + txb_info->txb_ctx->dc_sign_ctx); + } + return cost; +} + +static INLINE void set_eob(TxbInfo *txb_info, int eob) { + txb_info->eob = eob; + txb_info->seg_eob = AOMMIN(eob, tx_size_2d[txb_info->tx_size] - 1); +} + +// TODO(angiebird): add static to this function once it's called +int try_change_eob(int *new_eob, int coeff_idx, const TxbCache *txb_cache, + const TxbProbs *txb_probs, TxbInfo *txb_info) { + assert(txb_info->eob > 0); + const tran_low_t qc = txb_info->qcoeff[coeff_idx]; + const int abs_qc = abs(qc); + if (abs_qc != 1) { + *new_eob = -1; + return 0; + } + const int16_t *iscan = txb_info->scan_order->iscan; + const int16_t *scan = txb_info->scan_order->scan; + const int scan_idx = iscan[coeff_idx]; + *new_eob = 0; + int cost_diff = 0; + cost_diff -= get_low_coeff_cost(coeff_idx, txb_cache, txb_probs, txb_info); + // int coeff_cost = + // get_coeff_cost(qc, scan_idx, txb_info, txb_probs); + // if (-cost_diff != coeff_cost) { + // printf("-cost_diff %d coeff_cost %d\n", -cost_diff, coeff_cost); + // get_low_coeff_cost(coeff_idx, txb_cache, txb_probs, txb_info); + // get_coeff_cost(qc, scan_idx, txb_info, txb_probs); + // } + for (int si = scan_idx - 1; si >= 0; --si) { + const int ci = scan[si]; + if (txb_info->qcoeff[ci] != 0) { + *new_eob = si + 1; + break; + } else { + cost_diff -= get_low_coeff_cost(ci, txb_cache, txb_probs, txb_info); + } + } + + const int org_eob = txb_info->eob; + set_eob(txb_info, *new_eob); + cost_diff += try_level_down(coeff_idx, txb_cache, txb_probs, txb_info, NULL); + set_eob(txb_info, org_eob); + + if (*new_eob > 0) { + // Note that get_eob_ctx does NOT actually account for qcoeff, so we don't + // need to lower down the qcoeff here + const int eob_ctx = + get_eob_ctx(txb_info->qcoeff, scan[*new_eob - 1], txb_info->bwl); + cost_diff -= av1_cost_bit(txb_probs->eob_flag[eob_ctx], 0); + cost_diff += av1_cost_bit(txb_probs->eob_flag[eob_ctx], 1); + } else { + const int txb_skip_ctx = txb_info->txb_ctx->txb_skip_ctx; + cost_diff -= av1_cost_bit(txb_probs->txb_skip[txb_skip_ctx], 0); + cost_diff += av1_cost_bit(txb_probs->txb_skip[txb_skip_ctx], 1); + } + return cost_diff; +} + +static INLINE tran_low_t qcoeff_to_dqcoeff(tran_low_t qc, int dqv, int shift) { + int sgn = qc < 0 ? 
-1 : 1; + return sgn * ((abs(qc) * dqv) >> shift); +} + +// TODO(angiebird): add static to this function it's called +void update_level_down(int coeff_idx, TxbCache *txb_cache, TxbInfo *txb_info) { + const tran_low_t qc = txb_info->qcoeff[coeff_idx]; + const int abs_qc = abs(qc); + if (qc == 0) return; + const tran_low_t low_coeff = get_lower_coeff(qc); + txb_info->qcoeff[coeff_idx] = low_coeff; + const int dqv = txb_info->dequant[coeff_idx != 0]; + txb_info->dqcoeff[coeff_idx] = + qcoeff_to_dqcoeff(low_coeff, dqv, txb_info->shift); + + const int row = coeff_idx >> txb_info->bwl; + const int col = coeff_idx - (row << txb_info->bwl); + const int eob = txb_info->eob; + const int16_t *iscan = txb_info->scan_order->iscan; + for (int i = 0; i < SIG_REF_OFFSET_NUM; ++i) { + const int nb_row = row - sig_ref_offset[i][0]; + const int nb_col = col - sig_ref_offset[i][1]; + const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + const int nb_scan_idx = iscan[nb_coeff_idx]; + if (nb_scan_idx < eob && nb_row >= 0 && nb_col >= 0 && + nb_row < txb_info->stride && nb_col < txb_info->stride) { + const int scan_idx = iscan[coeff_idx]; + if (scan_idx < nb_scan_idx) { + const int level = 1; + if (abs_qc == level) { + txb_cache->nz_count_arr[nb_coeff_idx] -= 1; + assert(txb_cache->nz_count_arr[nb_coeff_idx] >= 0); + } + const int count = txb_cache->nz_count_arr[nb_coeff_idx]; + txb_cache->nz_ctx_arr[nb_coeff_idx][0] = get_nz_map_ctx_from_count( + count, txb_info->qcoeff, nb_coeff_idx, txb_info->bwl, iscan); + // int ref_ctx = get_nz_map_ctx2(txb_info->qcoeff, nb_coeff_idx, + // txb_info->bwl, iscan); + // if (ref_ctx != txb_cache->nz_ctx_arr[nb_coeff_idx][0]) + // printf("nz ctx %d ref_ctx %d\n", + // txb_cache->nz_ctx_arr[nb_coeff_idx][0], ref_ctx); + } + } + } + + for (int i = 0; i < BASE_CONTEXT_POSITION_NUM; ++i) { + const int nb_row = row - base_ref_offset[i][0]; + const int nb_col = col - base_ref_offset[i][1]; + const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; + if (!has_base(nb_coeff, 0)) continue; + const int nb_scan_idx = iscan[nb_coeff_idx]; + if (nb_scan_idx < eob && nb_row >= 0 && nb_col >= 0 && + nb_row < txb_info->stride && nb_col < txb_info->stride) { + if (row >= nb_row && col >= nb_col) + update_mag_arr(txb_cache->base_mag_arr[nb_coeff_idx], abs_qc); + const int mag = + get_mag_from_mag_arr(txb_cache->base_mag_arr[nb_coeff_idx]); + for (int base_idx = 0; base_idx < NUM_BASE_LEVELS; ++base_idx) { + if (!has_base(nb_coeff, base_idx)) continue; + const int level = base_idx + 1; + if (abs_qc == level) { + txb_cache->base_count_arr[base_idx][nb_coeff_idx] -= 1; + assert(txb_cache->base_count_arr[base_idx][nb_coeff_idx] >= 0); + } + const int count = txb_cache->base_count_arr[base_idx][nb_coeff_idx]; + txb_cache->base_ctx_arr[base_idx][nb_coeff_idx][0] = + get_base_ctx_from_count_mag(nb_row, nb_col, count, mag, level); + // int ref_ctx = get_base_ctx(txb_info->qcoeff, nb_coeff_idx, + // txb_info->bwl, level); + // if (ref_ctx != txb_cache->base_ctx_arr[base_idx][nb_coeff_idx][0]) { + // printf("base ctx %d ref_ctx %d\n", + // txb_cache->base_ctx_arr[base_idx][nb_coeff_idx][0], ref_ctx); + // } + } + } + } + + for (int i = 0; i < BR_CONTEXT_POSITION_NUM; ++i) { + const int nb_row = row - br_ref_offset[i][0]; + const int nb_col = col - br_ref_offset[i][1]; + const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + const int nb_scan_idx = iscan[nb_coeff_idx]; + const tran_low_t nb_coeff = 
txb_info->qcoeff[nb_coeff_idx]; + if (!has_br(nb_coeff)) continue; + if (nb_scan_idx < eob && nb_row >= 0 && nb_col >= 0 && + nb_row < txb_info->stride && nb_col < txb_info->stride) { + const int level = 1 + NUM_BASE_LEVELS; + if (abs_qc == level) { + txb_cache->br_count_arr[nb_coeff_idx] -= 1; + assert(txb_cache->br_count_arr[nb_coeff_idx] >= 0); + } + if (row >= nb_row && col >= nb_col) + update_mag_arr(txb_cache->br_mag_arr[nb_coeff_idx], abs_qc); + const int count = txb_cache->br_count_arr[nb_coeff_idx]; + const int mag = get_mag_from_mag_arr(txb_cache->br_mag_arr[nb_coeff_idx]); + txb_cache->br_ctx_arr[nb_coeff_idx][0] = + get_br_ctx_from_count_mag(nb_row, nb_col, count, mag); + // int ref_ctx = get_level_ctx(txb_info->qcoeff, nb_coeff_idx, + // txb_info->bwl); + // if (ref_ctx != txb_cache->br_ctx_arr[nb_coeff_idx][0]) { + // printf("base ctx %d ref_ctx %d\n", + // txb_cache->br_ctx_arr[nb_coeff_idx][0], ref_ctx); + // } + } + } +} + +static int get_coeff_cost(tran_low_t qc, int scan_idx, TxbInfo *txb_info, + const TxbProbs *txb_probs) { + const TXB_CTX *txb_ctx = txb_info->txb_ctx; + const int is_nz = (qc != 0); + const tran_low_t abs_qc = abs(qc); + int cost = 0; + const int16_t *scan = txb_info->scan_order->scan; + const int16_t *iscan = txb_info->scan_order->iscan; + + if (scan_idx < txb_info->seg_eob) { + int coeff_ctx = + get_nz_map_ctx2(txb_info->qcoeff, scan[scan_idx], txb_info->bwl, iscan); + cost += av1_cost_bit(txb_probs->nz_map[coeff_ctx], is_nz); + } + + if (is_nz) { + cost += get_sign_bit_cost(qc, scan_idx, txb_probs->dc_sign_prob, + txb_ctx->dc_sign_ctx); + + int ctx_ls[NUM_BASE_LEVELS] = { 0 }; + get_base_ctx_set(txb_info->qcoeff, scan[scan_idx], txb_info->bwl, ctx_ls); + + int i; + for (i = 0; i < NUM_BASE_LEVELS; ++i) { + cost += get_base_cost(abs_qc, ctx_ls[i], txb_probs->coeff_base, i); + } + + if (abs_qc > NUM_BASE_LEVELS) { + int ctx = get_br_ctx(txb_info->qcoeff, scan[scan_idx], txb_info->bwl); + cost += get_br_cost(abs_qc, ctx, txb_probs->coeff_lps); + cost += get_golomb_cost(abs_qc); + } + + if (scan_idx < txb_info->seg_eob) { + int eob_ctx = + get_eob_ctx(txb_info->qcoeff, scan[scan_idx], txb_info->bwl); + cost += av1_cost_bit(txb_probs->eob_flag[eob_ctx], + scan_idx == (txb_info->eob - 1)); + } + } + return cost; +} + +#if TEST_OPTIMIZE_TXB +#define ALL_REF_OFFSET_NUM 17 +static int all_ref_offset[ALL_REF_OFFSET_NUM][2] = { + { 0, 0 }, { -2, -1 }, { -2, 0 }, { -2, 1 }, { -1, -2 }, { -1, -1 }, + { -1, 0 }, { -1, 1 }, { 0, -2 }, { 0, -1 }, { 1, -2 }, { 1, -1 }, + { 1, 0 }, { 2, 0 }, { 0, 1 }, { 0, 2 }, { 1, 1 }, +}; + +static int try_level_down_ref(int coeff_idx, const TxbProbs *txb_probs, + TxbInfo *txb_info, + int (*cost_map)[COST_MAP_SIZE]) { + if (cost_map) { + for (int i = 0; i < COST_MAP_SIZE; ++i) av1_zero(cost_map[i]); + } + tran_low_t qc = txb_info->qcoeff[coeff_idx]; + if (qc == 0) return 0; + int row = coeff_idx >> txb_info->bwl; + int col = coeff_idx - (row << txb_info->bwl); + int org_cost = 0; + for (int i = 0; i < ALL_REF_OFFSET_NUM; ++i) { + int nb_row = row - all_ref_offset[i][0]; + int nb_col = col - all_ref_offset[i][1]; + int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + int nb_scan_idx = txb_info->scan_order->iscan[nb_coeff_idx]; + if (nb_scan_idx < txb_info->eob && nb_row >= 0 && nb_col >= 0 && + nb_row < txb_info->stride && nb_col < txb_info->stride) { + tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; + int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_probs); + if (cost_map) + cost_map[nb_row - row + 
COST_MAP_OFFSET] + [nb_col - col + COST_MAP_OFFSET] -= cost; + org_cost += cost; + } + } + txb_info->qcoeff[coeff_idx] = get_lower_coeff(qc); + int new_cost = 0; + for (int i = 0; i < ALL_REF_OFFSET_NUM; ++i) { + int nb_row = row - all_ref_offset[i][0]; + int nb_col = col - all_ref_offset[i][1]; + int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + int nb_scan_idx = txb_info->scan_order->iscan[nb_coeff_idx]; + if (nb_scan_idx < txb_info->eob && nb_row >= 0 && nb_col >= 0 && + nb_row < txb_info->stride && nb_col < txb_info->stride) { + tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; + int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_probs); + if (cost_map) + cost_map[nb_row - row + COST_MAP_OFFSET] + [nb_col - col + COST_MAP_OFFSET] += cost; + new_cost += cost; + } + } + txb_info->qcoeff[coeff_idx] = qc; + return new_cost - org_cost; +} +static void test_level_down(int coeff_idx, const TxbCache *txb_cache, + const TxbProbs *txb_probs, TxbInfo *txb_info) { + int cost_map[COST_MAP_SIZE][COST_MAP_SIZE]; + int ref_cost_map[COST_MAP_SIZE][COST_MAP_SIZE]; + const int cost_diff = + try_level_down(coeff_idx, txb_cache, txb_probs, txb_info, cost_map); + const int cost_diff_ref = + try_level_down_ref(coeff_idx, txb_probs, txb_info, ref_cost_map); + if (cost_diff != cost_diff_ref) { + printf("qc %d cost_diff %d cost_diff_ref %d\n", txb_info->qcoeff[coeff_idx], + cost_diff, cost_diff_ref); + for (int r = 0; r < COST_MAP_SIZE; ++r) { + for (int c = 0; c < COST_MAP_SIZE; ++c) { + printf("%d:%d ", cost_map[r][c], ref_cost_map[r][c]); + } + printf("\n"); + } + } +} +#endif + +// TODO(angiebird): make this static once it's called +int get_txb_cost(TxbInfo *txb_info, const TxbProbs *txb_probs) { + int cost = 0; + int txb_skip_ctx = txb_info->txb_ctx->txb_skip_ctx; + const int16_t *scan = txb_info->scan_order->scan; + if (txb_info->eob == 0) { + cost = av1_cost_bit(txb_probs->txb_skip[txb_skip_ctx], 1); + return cost; + } + cost = av1_cost_bit(txb_probs->txb_skip[txb_skip_ctx], 0); + for (int c = 0; c < txb_info->eob; ++c) { + tran_low_t qc = txb_info->qcoeff[scan[c]]; + int coeff_cost = get_coeff_cost(qc, c, txb_info, txb_probs); + cost += coeff_cost; + } + return cost; +} + +#if TEST_OPTIMIZE_TXB +void test_try_change_eob(TxbInfo *txb_info, TxbProbs *txb_probs, + TxbCache *txb_cache) { + int eob = txb_info->eob; + const int16_t *scan = txb_info->scan_order->scan; + if (eob > 0) { + int last_si = eob - 1; + int last_ci = scan[last_si]; + int last_coeff = txb_info->qcoeff[last_ci]; + if (abs(last_coeff) == 1) { + int new_eob; + int cost_diff = + try_change_eob(&new_eob, last_ci, txb_cache, txb_probs, txb_info); + int org_eob = txb_info->eob; + int cost = get_txb_cost(txb_info, txb_probs); + + txb_info->qcoeff[last_ci] = get_lower_coeff(last_coeff); + set_eob(txb_info, new_eob); + int new_cost = get_txb_cost(txb_info, txb_probs); + set_eob(txb_info, org_eob); + txb_info->qcoeff[last_ci] = last_coeff; + + int ref_cost_diff = -cost + new_cost; + if (cost_diff != ref_cost_diff) + printf("org_eob %d new_eob %d cost_diff %d ref_cost_diff %d\n", org_eob, + new_eob, cost_diff, ref_cost_diff); + } + } +} +#endif + +static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff, + int shift) { + const int64_t diff = (tcoeff - dqcoeff) * (1 << shift); + const int64_t error = diff * diff; + return error; +} + +typedef struct LevelDownStats { + int update; + tran_low_t low_qc; + tran_low_t low_dqc; + int64_t rd_diff; + int cost_diff; + int64_t dist_diff; + int new_eob; +} 
LevelDownStats; + +void try_level_down_facade(LevelDownStats *stats, int scan_idx, + const TxbCache *txb_cache, const TxbProbs *txb_probs, + TxbInfo *txb_info) { + const int16_t *scan = txb_info->scan_order->scan; + const int coeff_idx = scan[scan_idx]; + const tran_low_t qc = txb_info->qcoeff[coeff_idx]; + stats->new_eob = -1; + stats->update = 0; + if (qc == 0) { + return; + } + + const tran_low_t tqc = txb_info->tcoeff[coeff_idx]; + const int dqv = txb_info->dequant[coeff_idx != 0]; + + const tran_low_t dqc = qcoeff_to_dqcoeff(qc, dqv, txb_info->shift); + const int64_t dqc_dist = get_coeff_dist(tqc, dqc, txb_info->shift); + + stats->low_qc = get_lower_coeff(qc); + stats->low_dqc = qcoeff_to_dqcoeff(stats->low_qc, dqv, txb_info->shift); + const int64_t low_dqc_dist = + get_coeff_dist(tqc, stats->low_dqc, txb_info->shift); + + stats->dist_diff = -dqc_dist + low_dqc_dist; + stats->cost_diff = 0; + stats->new_eob = txb_info->eob; + if (scan_idx == txb_info->eob - 1 && abs(qc) == 1) { + stats->cost_diff = try_change_eob(&stats->new_eob, coeff_idx, txb_cache, + txb_probs, txb_info); + } else { + stats->cost_diff = + try_level_down(coeff_idx, txb_cache, txb_probs, txb_info, NULL); +#if TEST_OPTIMIZE_TXB + test_level_down(coeff_idx, txb_cache, txb_probs, txb_info); +#endif + } + stats->rd_diff = RDCOST(txb_info->rdmult, txb_info->rddiv, stats->cost_diff, + stats->dist_diff); + if (stats->rd_diff < 0) stats->update = 1; + return; +} + +static int optimize_txb(TxbInfo *txb_info, const TxbProbs *txb_probs, + TxbCache *txb_cache, int dry_run) { + int update = 0; + if (txb_info->eob == 0) return update; + int cost_diff = 0; + int64_t dist_diff = 0; + int64_t rd_diff = 0; + const int max_eob = tx_size_2d[txb_info->tx_size]; + +#if TEST_OPTIMIZE_TXB + int64_t sse; + int64_t org_dist = + av1_block_error_c(txb_info->tcoeff, txb_info->dqcoeff, max_eob, &sse) * + (1 << (2 * txb_info->shift)); + int org_cost = get_txb_cost(txb_info, txb_probs); +#endif + + tran_low_t *org_qcoeff = txb_info->qcoeff; + tran_low_t *org_dqcoeff = txb_info->dqcoeff; + + tran_low_t tmp_qcoeff[MAX_TX_SQUARE]; + tran_low_t tmp_dqcoeff[MAX_TX_SQUARE]; + const int org_eob = txb_info->eob; + if (dry_run) { + memcpy(tmp_qcoeff, org_qcoeff, sizeof(org_qcoeff[0]) * max_eob); + memcpy(tmp_dqcoeff, org_dqcoeff, sizeof(org_dqcoeff[0]) * max_eob); + txb_info->qcoeff = tmp_qcoeff; + txb_info->dqcoeff = tmp_dqcoeff; + } + + const int16_t *scan = txb_info->scan_order->scan; + + // forward optimize the nz_map + const int cur_eob = txb_info->eob; + for (int si = 0; si < cur_eob; ++si) { + const int coeff_idx = scan[si]; + tran_low_t qc = txb_info->qcoeff[coeff_idx]; + if (abs(qc) == 1) { + LevelDownStats stats; + try_level_down_facade(&stats, si, txb_cache, txb_probs, txb_info); + if (stats.update) { + update = 1; + cost_diff += stats.cost_diff; + dist_diff += stats.dist_diff; + rd_diff += stats.rd_diff; + update_level_down(coeff_idx, txb_cache, txb_info); + set_eob(txb_info, stats.new_eob); + } + } + } + + // backward optimize the level-k map + for (int si = txb_info->eob - 1; si >= 0; --si) { + LevelDownStats stats; + try_level_down_facade(&stats, si, txb_cache, txb_probs, txb_info); + const int coeff_idx = scan[si]; + if (stats.update) { +#if TEST_OPTIMIZE_TXB +// printf("si %d low_qc %d cost_diff %d dist_diff %ld rd_diff %ld eob %d new_eob +// %d\n", si, stats.low_qc, stats.cost_diff, stats.dist_diff, stats.rd_diff, +// txb_info->eob, stats.new_eob); +#endif + update = 1; + cost_diff += stats.cost_diff; + dist_diff += stats.dist_diff; + 
rd_diff += stats.rd_diff; + update_level_down(coeff_idx, txb_cache, txb_info); + set_eob(txb_info, stats.new_eob); + } + if (si > txb_info->eob) si = txb_info->eob; + } +#if TEST_OPTIMIZE_TXB + int64_t new_dist = + av1_block_error_c(txb_info->tcoeff, txb_info->dqcoeff, max_eob, &sse) * + (1 << (2 * txb_info->shift)); + int new_cost = get_txb_cost(txb_info, txb_probs); + int64_t ref_dist_diff = new_dist - org_dist; + int ref_cost_diff = new_cost - org_cost; + if (cost_diff != ref_cost_diff || dist_diff != ref_dist_diff) + printf( + "overall rd_diff %ld\ncost_diff %d ref_cost_diff%d\ndist_diff %ld " + "ref_dist_diff %ld\neob %d new_eob %d\n\n", + rd_diff, cost_diff, ref_cost_diff, dist_diff, ref_dist_diff, org_eob, + txb_info->eob); +#endif + if (dry_run) { + txb_info->qcoeff = org_qcoeff; + txb_info->dqcoeff = org_dqcoeff; + set_eob(txb_info, org_eob); + } + return update; +} + +// These numbers are empirically obtained. +static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { +#if CONFIG_EC_ADAPT + { 17, 13 }, { 16, 10 }, +#else + { 20, 12 }, { 16, 12 }, +#endif +}; + +int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, TXB_CTX *txb_ctx) { + MACROBLOCKD *const xd = &x->e_mbd; + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const int eob = p->eobs[block]; + tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block); + const int16_t *dequant = pd->dequant; + const int seg_eob = AOMMIN(eob, tx_size_2d[tx_size] - 1); + const aom_prob *nz_map = xd->fc->nz_map[tx_size][plane_type]; + + const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; + const int stride = 1 << bwl; + aom_prob(*coeff_base)[COEFF_BASE_CONTEXTS] = + xd->fc->coeff_base[tx_size][plane_type]; + + const aom_prob *coeff_lps = xd->fc->coeff_lps[tx_size][plane_type]; + + const int is_inter = is_inter_block(mbmi); + const SCAN_ORDER *const scan_order = + get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + + const TxbProbs txb_probs = { xd->fc->dc_sign[plane_type], + nz_map, + coeff_base, + coeff_lps, + xd->fc->eob_flag[tx_size][plane_type], + xd->fc->txb_skip[tx_size] }; + + const int shift = av1_get_tx_scale(tx_size); + const int64_t rdmult = + (x->rdmult * plane_rd_mult[is_inter][plane_type] + 2) >> 2; + const int64_t rddiv = x->rddiv; + + TxbInfo txb_info = { qcoeff, dqcoeff, tcoeff, dequant, shift, + tx_size, bwl, stride, eob, seg_eob, + scan_order, txb_ctx, rdmult, rddiv }; + TxbCache txb_cache; + gen_txb_cache(&txb_cache, &txb_info); + + const int update = optimize_txb(&txb_info, &txb_probs, &txb_cache, 0); + if (update) p->eobs[block] = txb_info.eob; + return txb_info.eob; +} int av1_get_txb_entropy_context(const tran_low_t *qcoeff, const SCAN_ORDER *scan_order, int eob) { const int16_t *scan = scan_order->scan; @@ -394,10 +1495,10 @@ int av1_get_txb_entropy_context(const tran_low_t *qcoeff, return cul_level; } -static void update_txb_context(int plane, int block, int blk_row, int blk_col, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - void *arg) { - TxbParams *const args = arg; +void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + struct 
tokenize_b_args *const args = arg; const AV1_COMP *cpi = args->cpi; const AV1_COMMON *cm = &cpi->common; ThreadData *const td = args->td; @@ -418,10 +1519,10 @@ static void update_txb_context(int plane, int block, int blk_row, int blk_col, av1_set_contexts(xd, pd, plane, tx_size, cul_level, blk_col, blk_row); } -static void update_and_record_txb_context(int plane, int block, int blk_row, - int blk_col, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - TxbParams *const args = arg; +void av1_update_and_record_txb_context(int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct tokenize_b_args *const args = arg; const AV1_COMP *cpi = args->cpi; const AV1_COMMON *cm = &cpi->common; ThreadData *const td = args->td; @@ -529,7 +1630,7 @@ static void update_and_record_txb_context(int plane, int block, int blk_row, } // level is above 1. - ctx = get_level_ctx(tcoeff, scan[c], bwl); + ctx = get_br_ctx(tcoeff, scan[c], bwl); for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) { if (level == (idx + 1 + NUM_BASE_LEVELS)) { ++td->counts->coeff_lps[tx_size][plane_type][ctx][1]; @@ -568,23 +1669,23 @@ void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td, const int ctx = av1_get_skip_context(xd); const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); - struct TxbParams arg = { cpi, td, 0 }; + struct tokenize_b_args arg = { cpi, td, NULL, 0 }; (void)rate; (void)mi_row; (void)mi_col; if (mbmi->skip) { if (!dry_run) td->counts->skip[ctx][1] += skip_inc; - reset_skip_context(xd, bsize); + av1_reset_skip_context(xd, mi_row, mi_col, bsize); return; } if (!dry_run) { td->counts->skip[ctx][0] += skip_inc; av1_foreach_transformed_block(xd, bsize, mi_row, mi_col, - update_and_record_txb_context, &arg); + av1_update_and_record_txb_context, &arg); } else if (dry_run == DRY_RUN_NORMAL) { - av1_foreach_transformed_block(xd, bsize, mi_row, mi_col, update_txb_context, - &arg); + av1_foreach_transformed_block(xd, bsize, mi_row, mi_col, + av1_update_txb_context_b, &arg); } else { printf("DRY_RUN_COSTCOEFFS is not supported yet\n"); assert(0); @@ -749,8 +1850,7 @@ int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, av1_invalid_rd_stats(&this_rd_stats); av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - if (x->plane[plane].eobs[block] && !xd->lossless[mbmi->segment_id]) - av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx); + av1_optimize_b(cm, x, plane, block, plane_bsize, tx_size, a, l); av1_dist_block(cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size, &this_rd_stats.dist, &this_rd_stats.sse, OUTPUT_HAS_PREDICTED_PIXELS); @@ -771,8 +1871,7 @@ int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, // copy the best result in the above tx_type search for loop av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - if (x->plane[plane].eobs[block] && !xd->lossless[mbmi->segment_id]) - av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx); + av1_optimize_b(cm, x, plane, block, plane_bsize, tx_size, a, l); if (!is_inter_block(mbmi)) { // intra mode needs decoded result such that the next transform block // can use it for prediction. 
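
The core decision in the new optimize_txb()/try_level_down_facade() path above is the sign test on stats->rd_diff = RDCOST(txb_info->rdmult, txb_info->rddiv, cost_diff, dist_diff): a candidate change (lowering a coefficient's level by one, or shortening the eob when the last coefficient has magnitude 1) is kept only if it makes the combined rate-distortion cost smaller. A minimal sketch of that accept rule follows, assuming the usual rate-times-rdmult / distortion-times-2^rddiv scaling for RDCOST; rd_cost_delta() and accept_level_down() are illustrative helper names, not part of this patch.

#include <stdint.h>

/* Assumed RDCOST scaling (illustrative, not the libaom macro): the rate
 * delta is weighted by rdmult and rounded down by 8 bits, the distortion
 * delta is scaled by 2^rddiv so both terms live on the same scale. */
static int64_t rd_cost_delta(int64_t rdmult, int64_t rddiv, int rate_diff,
                             int64_t dist_diff) {
  return ((128 + (int64_t)rate_diff * rdmult) >> 8) +
         dist_diff * ((int64_t)1 << rddiv);
}

/* Mirrors "if (stats->rd_diff < 0) stats->update = 1;" in
 * try_level_down_facade(): the lowered coefficient (or the shortened eob)
 * is kept only when the combined rate-distortion delta is negative. */
static int accept_level_down(int64_t rdmult, int64_t rddiv, int rate_diff,
                             int64_t dist_diff) {
  return rd_cost_delta(rdmult, rddiv, rate_diff, dist_diff) < 0;
}

Both the forward pass over the nz_map and the backward pass over the level-k map in optimize_txb() apply this same per-coefficient test, accumulating cost_diff/dist_diff and calling update_level_down()/set_eob() whenever a change is accepted.
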
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h index 552d47b54..836033a54 100644 --- a/third_party/aom/av1/encoder/encodetxb.h +++ b/third_party/aom/av1/encoder/encodetxb.h @@ -22,6 +22,47 @@ #ifdef __cplusplus extern "C" { #endif + +typedef struct TxbInfo { + tran_low_t *qcoeff; + tran_low_t *dqcoeff; + const tran_low_t *tcoeff; + const int16_t *dequant; + int shift; + TX_SIZE tx_size; + int bwl; + int stride; + int eob; + int seg_eob; + const SCAN_ORDER *scan_order; + TXB_CTX *txb_ctx; + int64_t rdmult; + int64_t rddiv; +} TxbInfo; + +typedef struct TxbCache { + int nz_count_arr[MAX_TX_SQUARE]; + int nz_ctx_arr[MAX_TX_SQUARE][2]; + int base_count_arr[NUM_BASE_LEVELS][MAX_TX_SQUARE]; + int base_mag_arr[MAX_TX_SQUARE] + [2]; // [0]: max magnitude [1]: num of max magnitude + int base_ctx_arr[NUM_BASE_LEVELS][MAX_TX_SQUARE][2]; // [1]: not used + + int br_count_arr[MAX_TX_SQUARE]; + int br_mag_arr[MAX_TX_SQUARE] + [2]; // [0]: max magnitude [1]: num of max magnitude + int br_ctx_arr[MAX_TX_SQUARE][2]; // [1]: not used +} TxbCache; + +typedef struct TxbProbs { + const aom_prob *dc_sign_prob; + const aom_prob *nz_map; + aom_prob (*coeff_base)[COEFF_BASE_CONTEXTS]; + const aom_prob *coeff_lps; + const aom_prob *eob_flag; + const aom_prob *txb_skip; +} TxbProbs; + void av1_alloc_txb_buf(AV1_COMP *cpi); void av1_free_txb_buf(AV1_COMP *cpi); int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, @@ -39,6 +80,14 @@ void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td, const int mi_row, const int mi_col); void av1_write_txb_probs(AV1_COMP *cpi, aom_writer *w); +void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg); + +void av1_update_and_record_txb_context(int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg); + #if CONFIG_TXK_SEL int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, @@ -46,6 +95,8 @@ int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l, int use_fast_coef_costing, RD_STATS *rd_stats); #endif +int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, TXB_CTX *txb_ctx); #ifdef __cplusplus } #endif diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c index 34f0b9566..7af5f78b6 100644 --- a/third_party/aom/av1/encoder/ethread.c +++ b/third_party/aom/av1/encoder/ethread.c @@ -93,14 +93,42 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { thread_data->td->pc_tree = NULL; av1_setup_pc_tree(cm, thread_data->td); - // Set up variance tree if needed. 
- if (cpi->sf.partition_search_type == VAR_BASED_PARTITION) - av1_setup_var_tree(cm, thread_data->td); - +#if CONFIG_MOTION_VAR +#if CONFIG_HIGHBITDEPTH + int buf_scaler = 2; +#else + int buf_scaler = 1; +#endif + CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf, + (uint8_t *)aom_memalign( + 16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->above_pred_buf))); + CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf, + (uint8_t *)aom_memalign( + 16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->left_pred_buf))); + CHECK_MEM_ERROR( + cm, thread_data->td->wsrc_buf, + (int32_t *)aom_memalign( + 16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf))); + CHECK_MEM_ERROR( + cm, thread_data->td->mask_buf, + (int32_t *)aom_memalign( + 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf))); +#endif // Allocate frame counters in thread data. CHECK_MEM_ERROR(cm, thread_data->td->counts, aom_calloc(1, sizeof(*thread_data->td->counts))); +#if CONFIG_PALETTE + // Allocate buffers used by palette coding mode. + if (cpi->common.allow_screen_content_tools) { + CHECK_MEM_ERROR( + cm, thread_data->td->palette_buffer, + aom_memalign(16, sizeof(*thread_data->td->palette_buffer))); + } +#endif // CONFIG_PALETTE + // Create threads if (!winterface->reset(worker)) aom_internal_error(&cm->error, AOM_CODEC_ERROR, @@ -127,6 +155,12 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { if (thread_data->td != &cpi->td) { thread_data->td->mb = cpi->td.mb; thread_data->td->rd_counts = cpi->td.rd_counts; +#if CONFIG_MOTION_VAR + thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf; + thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf; + thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf; + thread_data->td->mb.mask_buf = thread_data->td->mask_buf; +#endif } if (thread_data->td->counts != &cpi->common.counts) { memcpy(thread_data->td->counts, &cpi->common.counts, @@ -134,12 +168,8 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { } #if CONFIG_PALETTE - // Allocate buffers used by palette coding mode. - if (cpi->common.allow_screen_content_tools && i < num_workers - 1) { - MACROBLOCK *x = &thread_data->td->mb; - CHECK_MEM_ERROR(cm, x->palette_buffer, - aom_memalign(16, sizeof(*x->palette_buffer))); - } + if (cpi->common.allow_screen_content_tools && i < num_workers - 1) + thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer; #endif // CONFIG_PALETTE } @@ -171,6 +201,9 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { if (i < cpi->num_workers - 1) { av1_accumulate_frame_counts(&cm->counts, thread_data->td->counts); accumulate_rd_opt(&cpi->td, thread_data->td); +#if CONFIG_VAR_TX + cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count; +#endif } } } diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c index e35a54ef2..7a0abba2d 100644 --- a/third_party/aom/av1/encoder/firstpass.c +++ b/third_party/aom/av1/encoder/firstpass.c @@ -568,16 +568,11 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { od_init_qm(x->daala_enc.state.qm, x->daala_enc.state.qm_inv, x->daala_enc.qm == OD_HVS_QM ? OD_QM8_Q4_HVS : OD_QM8_Q4_FLAT); -#if CONFIG_DAALA_EC +#if !CONFIG_ANS od_ec_enc_init(&x->daala_enc.w.ec, 65025); -#else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." -#endif - -#if CONFIG_DAALA_EC od_ec_enc_reset(&x->daala_enc.w.ec); #else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#error "CONFIG_PVQ currently requires !CONFIG_ANS." 
#endif } #endif @@ -598,6 +593,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { av1_init_mv_probs(cm); #if CONFIG_ADAPT_SCAN av1_init_scan_order(cm); + av1_deliver_eob_threshold(cm, xd); #endif av1_convolve_init(cm); #if CONFIG_PVQ @@ -884,7 +880,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { xd->mi[0]->mbmi.tx_size = TX_4X4; xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME; xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME; - av1_build_inter_predictors_sby(xd, mb_row * mb_scale, + av1_build_inter_predictors_sby(cm, xd, mb_row * mb_scale, mb_col * mb_scale, NULL, bsize); av1_encode_sby_pass1(cm, x, bsize); sum_mvr += mv.row; @@ -997,10 +993,10 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { } #if CONFIG_PVQ -#if CONFIG_DAALA_EC +#if !CONFIG_ANS od_ec_enc_clear(&x->daala_enc.w.ec); #else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#error "CONFIG_PVQ currently requires !CONFIG_ANS." #endif x->pvq_q->last_pos = x->pvq_q->curr_pos; @@ -1235,28 +1231,26 @@ static void setup_rf_level_maxq(AV1_COMP *cpi) { } } -void av1_init_subsampling(AV1_COMP *cpi) { - const AV1_COMMON *const cm = &cpi->common; - RATE_CONTROL *const rc = &cpi->rc; - const int w = cm->width; - const int h = cm->height; - int i; - - for (i = 0; i < FRAME_SCALE_STEPS; ++i) { - // Note: Frames with odd-sized dimensions may result from this scaling. - rc->frame_width[i] = (w * 16) / frame_scale_factor[i]; - rc->frame_height[i] = (h * 16) / frame_scale_factor[i]; - } - - setup_rf_level_maxq(cpi); +void av1_calculate_next_scaled_size(const AV1_COMP *cpi, + int *scaled_frame_width, + int *scaled_frame_height) { + *scaled_frame_width = + cpi->oxcf.width * cpi->resize_next_scale_num / cpi->resize_next_scale_den; + *scaled_frame_height = cpi->oxcf.height * cpi->resize_next_scale_num / + cpi->resize_next_scale_den; } -void av1_calculate_coded_size(AV1_COMP *cpi, int *scaled_frame_width, - int *scaled_frame_height) { - RATE_CONTROL *const rc = &cpi->rc; - *scaled_frame_width = rc->frame_width[rc->frame_size_selector]; - *scaled_frame_height = rc->frame_height[rc->frame_size_selector]; +#if CONFIG_FRAME_SUPERRES +void av1_calculate_superres_size(const AV1_COMP *cpi, int *encoded_width, + int *encoded_height) { + *encoded_width = cpi->oxcf.scaled_frame_width * + cpi->common.superres_scale_numerator / + SUPERRES_SCALE_DENOMINATOR; + *encoded_height = cpi->oxcf.scaled_frame_height * + cpi->common.superres_scale_numerator / + SUPERRES_SCALE_DENOMINATOR; } +#endif // CONFIG_FRAME_SUPERRES void av1_init_second_pass(AV1_COMP *cpi) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; @@ -1316,7 +1310,7 @@ void av1_init_second_pass(AV1_COMP *cpi) { twopass->last_kfgroup_zeromotion_pct = 100; if (oxcf->resize_mode != RESIZE_NONE) { - av1_init_subsampling(cpi); + setup_rf_level_maxq(cpi); } } @@ -2300,7 +2294,8 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (oxcf->resize_mode == RESIZE_DYNAMIC) { // Default to starting GF groups at normal frame size. - cpi->rc.next_frame_size_selector = UNSCALED; + // TODO(afergs): Make a function for this + cpi->resize_next_scale_num = cpi->resize_next_scale_den; } } @@ -2646,7 +2641,8 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (oxcf->resize_mode == RESIZE_DYNAMIC) { // Default to normal-sized frame on keyframes. 
- cpi->rc.next_frame_size_selector = UNSCALED; + // TODO(afergs): Make a function for this + cpi->resize_next_scale_num = cpi->resize_next_scale_den; } } diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h index db459cc22..43104454c 100644 --- a/third_party/aom/av1/encoder/firstpass.h +++ b/third_party/aom/av1/encoder/firstpass.h @@ -177,10 +177,17 @@ void av1_twopass_postencode_update(struct AV1_COMP *cpi); // Post encode update of the rate control parameters for 2-pass void av1_twopass_postencode_update(struct AV1_COMP *cpi); -void av1_init_subsampling(struct AV1_COMP *cpi); - -void av1_calculate_coded_size(struct AV1_COMP *cpi, int *scaled_frame_width, - int *scaled_frame_height); +void av1_calculate_next_scaled_size(const struct AV1_COMP *cpi, + int *scaled_frame_width, + int *scaled_frame_height); + +#if CONFIG_FRAME_SUPERRES +// This is the size after superress scaling, which could be 1:1. +// Superres scaling happens after regular downscaling. +// TODO(afergs): Limit overall reduction to 1/2 of the original size +void av1_calculate_superres_size(const struct AV1_COMP *cpi, int *encoded_width, + int *encoded_height); +#endif // CONFIG_FRAME_SUPERRES #if CONFIG_EXT_REFS static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) { diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c index 2a6204939..74cbc8ae7 100644 --- a/third_party/aom/av1/encoder/global_motion.c +++ b/third_party/aom/av1/encoder/global_motion.c @@ -124,14 +124,15 @@ static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) { wm->wmtype = wmtype; } -double refine_integerized_param(WarpedMotionParams *wm, - TransformationType wmtype, +int64_t refine_integerized_param(WarpedMotionParams *wm, + TransformationType wmtype, #if CONFIG_HIGHBITDEPTH - int use_hbd, int bd, + int use_hbd, int bd, #endif // CONFIG_HIGHBITDEPTH - uint8_t *ref, int r_width, int r_height, - int r_stride, uint8_t *dst, int d_width, - int d_height, int d_stride, int n_refinements) { + uint8_t *ref, int r_width, int r_height, + int r_stride, uint8_t *dst, int d_width, + int d_height, int d_stride, + int n_refinements) { static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6, 8, 8, 8 }; @@ -139,22 +140,21 @@ double refine_integerized_param(WarpedMotionParams *wm, int i = 0, p; int n_params = max_trans_model_params[wmtype]; int32_t *param_mat = wm->wmmat; - double step_error; + int64_t step_error, best_error; int32_t step; int32_t *param; int32_t curr_param; int32_t best_param; - double best_error; force_wmtype(wm, wmtype); - best_error = av1_warp_erroradv(wm, + best_error = av1_warp_error(wm, #if CONFIG_HIGHBITDEPTH - use_hbd, bd, + use_hbd, bd, #endif // CONFIG_HIGHBITDEPTH - ref, r_width, r_height, r_stride, - dst + border * d_stride + border, border, - border, d_width - 2 * border, - d_height - 2 * border, d_stride, 0, 0, 16, 16); + ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, + d_stride, 0, 0, 16, 16); step = 1 << (n_refinements + 1); for (i = 0; i < n_refinements; i++, step >>= 1) { for (p = 0; p < n_params; ++p) { @@ -167,7 +167,7 @@ double refine_integerized_param(WarpedMotionParams *wm, best_param = curr_param; // look to the left *param = add_param_offset(p, curr_param, -step); - step_error = av1_warp_erroradv( + step_error = av1_warp_error( wm, #if CONFIG_HIGHBITDEPTH use_hbd, bd, @@ -183,7 +183,7 @@ double 
refine_integerized_param(WarpedMotionParams *wm, // look to the right *param = add_param_offset(p, curr_param, step); - step_error = av1_warp_erroradv( + step_error = av1_warp_error( wm, #if CONFIG_HIGHBITDEPTH use_hbd, bd, @@ -202,7 +202,7 @@ double refine_integerized_param(WarpedMotionParams *wm, // for the biggest step size while (step_dir) { *param = add_param_offset(p, best_param, step * step_dir); - step_error = av1_warp_erroradv( + step_error = av1_warp_error( wm, #if CONFIG_HIGHBITDEPTH use_hbd, bd, diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h index 8fc757f38..38509df6a 100644 --- a/third_party/aom/av1/encoder/global_motion.h +++ b/third_party/aom/av1/encoder/global_motion.h @@ -26,14 +26,17 @@ void convert_model_to_params(const double *params, WarpedMotionParams *model); int is_enough_erroradvantage(double erroradv, int params_cost); -double refine_integerized_param(WarpedMotionParams *wm, - TransformationType wmtype, +// Returns the av1_warp_error between "dst" and the result of applying the +// motion params that result from fine-tuning "wm" to "ref". Note that "wm" is +// modified in place. +int64_t refine_integerized_param(WarpedMotionParams *wm, + TransformationType wmtype, #if CONFIG_HIGHBITDEPTH - int use_hbd, int bd, + int use_hbd, int bd, #endif // CONFIG_HIGHBITDEPTH - uint8_t *ref, int r_width, int r_height, - int r_stride, uint8_t *dst, int d_width, - int d_height, int d_stride, int n_refinements); + uint8_t *ref, int r_width, int r_height, + int r_stride, uint8_t *dst, int d_width, + int d_height, int d_stride, int n_refinements); /* Computes "num_motions" candidate global motion parameters between two frames. diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c index 4fd563163..c57deed84 100644 --- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c +++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c @@ -16,7 +16,7 @@ #include "av1/common/idct.h" #include "av1/encoder/hybrid_fwd_txfm.h" -#if CONFIG_CB4X4 +#if CONFIG_CHROMA_2X2 static void fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TX_TYPE tx_type, int lossless) { tran_high_t a1 = src_diff[0]; @@ -132,8 +132,38 @@ static void fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, } #endif // CONFIG_TX64X64 +#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +static void fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void)fwd_txfm_opt; + av1_fht16x4(src_diff, coeff, diff_stride, tx_type); +} + +static void fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void)fwd_txfm_opt; + av1_fht4x16(src_diff, coeff, diff_stride, tx_type); +} + +static void fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void)fwd_txfm_opt; + av1_fht32x8(src_diff, coeff, diff_stride, tx_type); +} + +static void fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TX_TYPE tx_type, + FWD_TXFM_OPT fwd_txfm_opt) { + (void)fwd_txfm_opt; + av1_fht8x32(src_diff, coeff, diff_stride, tx_type); +} +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT + #if CONFIG_HIGHBITDEPTH -#if CONFIG_CB4X4 +#if CONFIG_CHROMA_2X2 static void highbd_fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TX_TYPE tx_type, int lossless, const 
int bd) { @@ -425,11 +455,25 @@ void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, case TX_4X4: fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless); break; -#if CONFIG_CB4X4 +#if CONFIG_CHROMA_2X2 case TX_2X2: fwd_txfm_2x2(src_diff, coeff, diff_stride, tx_type, lossless); break; #endif +#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT + case TX_4X16: + fwd_txfm_4x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; + case TX_16X4: + fwd_txfm_16x4(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; + case TX_8X32: + fwd_txfm_8x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; + case TX_32X8: + fwd_txfm_32x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + break; +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT default: assert(0); break; } } @@ -488,7 +532,7 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, case TX_4X4: highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless, bd); break; -#if CONFIG_CB4X4 +#if CONFIG_CHROMA_2X2 case TX_2X2: highbd_fwd_txfm_2x2(src_diff, coeff, diff_stride, tx_type, lossless, bd); break; diff --git a/third_party/aom/av1/encoder/mathutils.h b/third_party/aom/av1/encoder/mathutils.h new file mode 100644 index 000000000..23243dd9e --- /dev/null +++ b/third_party/aom/av1/encoder/mathutils.h @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <memory.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> + +static const double TINY_NEAR_ZERO = 1.0E-16; + +// Solves Ax = b, where x and b are column vectors of size nx1 and A is nxn +static INLINE int linsolve(int n, double *A, int stride, double *b, double *x) { + int i, j, k; + double c; + // Forward elimination + for (k = 0; k < n - 1; k++) { + // Bring the largest magitude to the diagonal position + for (i = n - 1; i > k; i--) { + if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) { + for (j = 0; j < n; j++) { + c = A[i * stride + j]; + A[i * stride + j] = A[(i - 1) * stride + j]; + A[(i - 1) * stride + j] = c; + } + c = b[i]; + b[i] = b[i - 1]; + b[i - 1] = c; + } + } + for (i = k; i < n - 1; i++) { + if (fabs(A[k * stride + k]) < TINY_NEAR_ZERO) return 0; + c = A[(i + 1) * stride + k] / A[k * stride + k]; + for (j = 0; j < n; j++) A[(i + 1) * stride + j] -= c * A[k * stride + j]; + b[i + 1] -= c * b[k]; + } + } + // Backward substitution + for (i = n - 1; i >= 0; i--) { + if (fabs(A[i * stride + i]) < TINY_NEAR_ZERO) return 0; + c = 0; + for (j = i + 1; j <= n - 1; j++) c += A[i * stride + j] * x[j]; + x[i] = (b[i] - c) / A[i * stride + i]; + } + + return 1; +} + +//////////////////////////////////////////////////////////////////////////////// +// Least-squares +// Solves for n-dim x in a least squares sense to minimize |Ax - b|^2 +// The solution is simply x = (A'A)^-1 A'b or simply the solution for +// the system: A'A x = A'b +static INLINE int least_squares(int n, double *A, int rows, int stride, + double *b, double *scratch, double *x) { + int i, j, k; + double *scratch_ = NULL; + double *AtA, *Atb; + if (!scratch) { + scratch_ = (double *)aom_malloc(sizeof(*scratch) * n * (n + 1)); + scratch = scratch_; + } + AtA = scratch; + Atb = scratch + n * n; + + for (i = 0; i < n; ++i) { + for (j = i; j < n; ++j) { + AtA[i * n + j] = 0.0; + for (k = 0; k < rows; ++k) + AtA[i * n + j] += A[k * stride + i] * A[k * stride + j]; + AtA[j * n + i] = AtA[i * n + j]; + } + Atb[i] = 0; + for (k = 0; k < rows; ++k) Atb[i] += A[k * stride + i] * b[k]; + } + int ret = linsolve(n, AtA, n, Atb, x); + if (scratch_) aom_free(scratch_); + return ret; +} + +// Matrix multiply +static INLINE void multiply_mat(const double *m1, const double *m2, double *res, + const int m1_rows, const int inner_dim, + const int m2_cols) { + double sum; + + int row, col, inner; + for (row = 0; row < m1_rows; ++row) { + for (col = 0; col < m2_cols; ++col) { + sum = 0; + for (inner = 0; inner < inner_dim; ++inner) + sum += m1[row * inner_dim + inner] * m2[inner * m2_cols + col]; + *(res++) = sum; + } + } +} + +// +// The functions below are needed only for homography computation +// Remove if the homography models are not used. +// +/////////////////////////////////////////////////////////////////////////////// +// svdcmp +// Adopted from Numerical Recipes in C + +static INLINE double sign(double a, double b) { + return ((b) >= 0 ? fabs(a) : -fabs(a)); +} + +static INLINE double pythag(double a, double b) { + double ct; + const double absa = fabs(a); + const double absb = fabs(b); + + if (absa > absb) { + ct = absb / absa; + return absa * sqrt(1.0 + ct * ct); + } else { + ct = absa / absb; + return (absb == 0) ? 
0 : absb * sqrt(1.0 + ct * ct); + } +} + +static INLINE int svdcmp(double **u, int m, int n, double w[], double **v) { + const int max_its = 30; + int flag, i, its, j, jj, k, l, nm; + double anorm, c, f, g, h, s, scale, x, y, z; + double *rv1 = (double *)aom_malloc(sizeof(*rv1) * (n + 1)); + g = scale = anorm = 0.0; + for (i = 0; i < n; i++) { + l = i + 1; + rv1[i] = scale * g; + g = s = scale = 0.0; + if (i < m) { + for (k = i; k < m; k++) scale += fabs(u[k][i]); + if (scale != 0.) { + for (k = i; k < m; k++) { + u[k][i] /= scale; + s += u[k][i] * u[k][i]; + } + f = u[i][i]; + g = -sign(sqrt(s), f); + h = f * g - s; + u[i][i] = f - g; + for (j = l; j < n; j++) { + for (s = 0.0, k = i; k < m; k++) s += u[k][i] * u[k][j]; + f = s / h; + for (k = i; k < m; k++) u[k][j] += f * u[k][i]; + } + for (k = i; k < m; k++) u[k][i] *= scale; + } + } + w[i] = scale * g; + g = s = scale = 0.0; + if (i < m && i != n - 1) { + for (k = l; k < n; k++) scale += fabs(u[i][k]); + if (scale != 0.) { + for (k = l; k < n; k++) { + u[i][k] /= scale; + s += u[i][k] * u[i][k]; + } + f = u[i][l]; + g = -sign(sqrt(s), f); + h = f * g - s; + u[i][l] = f - g; + for (k = l; k < n; k++) rv1[k] = u[i][k] / h; + for (j = l; j < m; j++) { + for (s = 0.0, k = l; k < n; k++) s += u[j][k] * u[i][k]; + for (k = l; k < n; k++) u[j][k] += s * rv1[k]; + } + for (k = l; k < n; k++) u[i][k] *= scale; + } + } + anorm = fmax(anorm, (fabs(w[i]) + fabs(rv1[i]))); + } + + for (i = n - 1; i >= 0; i--) { + if (i < n - 1) { + if (g != 0.) { + for (j = l; j < n; j++) v[j][i] = (u[i][j] / u[i][l]) / g; + for (j = l; j < n; j++) { + for (s = 0.0, k = l; k < n; k++) s += u[i][k] * v[k][j]; + for (k = l; k < n; k++) v[k][j] += s * v[k][i]; + } + } + for (j = l; j < n; j++) v[i][j] = v[j][i] = 0.0; + } + v[i][i] = 1.0; + g = rv1[i]; + l = i; + } + for (i = AOMMIN(m, n) - 1; i >= 0; i--) { + l = i + 1; + g = w[i]; + for (j = l; j < n; j++) u[i][j] = 0.0; + if (g != 0.) 
{ + g = 1.0 / g; + for (j = l; j < n; j++) { + for (s = 0.0, k = l; k < m; k++) s += u[k][i] * u[k][j]; + f = (s / u[i][i]) * g; + for (k = i; k < m; k++) u[k][j] += f * u[k][i]; + } + for (j = i; j < m; j++) u[j][i] *= g; + } else { + for (j = i; j < m; j++) u[j][i] = 0.0; + } + ++u[i][i]; + } + for (k = n - 1; k >= 0; k--) { + for (its = 0; its < max_its; its++) { + flag = 1; + for (l = k; l >= 0; l--) { + nm = l - 1; + if ((double)(fabs(rv1[l]) + anorm) == anorm || nm < 0) { + flag = 0; + break; + } + if ((double)(fabs(w[nm]) + anorm) == anorm) break; + } + if (flag) { + c = 0.0; + s = 1.0; + for (i = l; i <= k; i++) { + f = s * rv1[i]; + rv1[i] = c * rv1[i]; + if ((double)(fabs(f) + anorm) == anorm) break; + g = w[i]; + h = pythag(f, g); + w[i] = h; + h = 1.0 / h; + c = g * h; + s = -f * h; + for (j = 0; j < m; j++) { + y = u[j][nm]; + z = u[j][i]; + u[j][nm] = y * c + z * s; + u[j][i] = z * c - y * s; + } + } + } + z = w[k]; + if (l == k) { + if (z < 0.0) { + w[k] = -z; + for (j = 0; j < n; j++) v[j][k] = -v[j][k]; + } + break; + } + if (its == max_its - 1) { + aom_free(rv1); + return 1; + } + assert(k > 0); + x = w[l]; + nm = k - 1; + y = w[nm]; + g = rv1[nm]; + h = rv1[k]; + f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y); + g = pythag(f, 1.0); + f = ((x - z) * (x + z) + h * ((y / (f + sign(g, f))) - h)) / x; + c = s = 1.0; + for (j = l; j <= nm; j++) { + i = j + 1; + g = rv1[i]; + y = w[i]; + h = s * g; + g = c * g; + z = pythag(f, h); + rv1[j] = z; + c = f / z; + s = h / z; + f = x * c + g * s; + g = g * c - x * s; + h = y * s; + y *= c; + for (jj = 0; jj < n; jj++) { + x = v[jj][j]; + z = v[jj][i]; + v[jj][j] = x * c + z * s; + v[jj][i] = z * c - x * s; + } + z = pythag(f, h); + w[j] = z; + if (z != 0.) { + z = 1.0 / z; + c = f * z; + s = h * z; + } + f = c * g + s * y; + x = c * y - s * g; + for (jj = 0; jj < m; jj++) { + y = u[jj][j]; + z = u[jj][i]; + u[jj][j] = y * c + z * s; + u[jj][i] = z * c - y * s; + } + } + rv1[l] = 0.0; + rv1[k] = f; + w[k] = x; + } + } + aom_free(rv1); + return 0; +} + +static INLINE int SVD(double *U, double *W, double *V, double *matx, int M, + int N) { + // Assumes allocation for U is MxN + double **nrU = (double **)aom_malloc((M) * sizeof(*nrU)); + double **nrV = (double **)aom_malloc((N) * sizeof(*nrV)); + int problem, i; + + problem = !(nrU && nrV); + if (!problem) { + for (i = 0; i < M; i++) { + nrU[i] = &U[i * N]; + } + for (i = 0; i < N; i++) { + nrV[i] = &V[i * N]; + } + } else { + if (nrU) aom_free(nrU); + if (nrV) aom_free(nrV); + return 1; + } + + /* copy from given matx into nrU */ + for (i = 0; i < M; i++) { + memcpy(&(nrU[i][0]), matx + N * i, N * sizeof(*matx)); + } + + /* HERE IT IS: do SVD */ + if (svdcmp(nrU, M, N, W, nrV)) { + aom_free(nrU); + aom_free(nrV); + return 1; + } + + /* aom_free Numerical Recipes arrays */ + aom_free(nrU); + aom_free(nrV); + + return 0; +} diff --git a/third_party/aom/av1/encoder/mbgraph.c b/third_party/aom/av1/encoder/mbgraph.c index 1296027dc..3f5daebcc 100644 --- a/third_party/aom/av1/encoder/mbgraph.c +++ b/third_party/aom/av1/encoder/mbgraph.c @@ -52,11 +52,14 @@ static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv, { int distortion; unsigned int sse; - cpi->find_fractional_mv_step(x, ref_mv, cpi->common.allow_high_precision_mv, - x->errorperbit, &v_fn_ptr, 0, - mv_sf->subpel_iters_per_step, - cond_cost_list(cpi, cost_list), NULL, NULL, - &distortion, &sse, NULL, 0, 0, 0); + cpi->find_fractional_mv_step( + x, ref_mv, cpi->common.allow_high_precision_mv, 
x->errorperbit, + &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, +#if CONFIG_EXT_INTER + NULL, 0, 0, +#endif + 0, 0, 0); } #if CONFIG_EXT_INTER @@ -71,7 +74,8 @@ static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv, xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME; #endif // CONFIG_EXT_INTER - av1_build_inter_predictors_sby(xd, mb_row, mb_col, NULL, BLOCK_16X16); + av1_build_inter_predictors_sby(&cpi->common, xd, mb_row, mb_col, NULL, + BLOCK_16X16); /* restore UMV window */ x->mv_limits = tmp_mv_limits; diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c index d069eefb0..52080ca0d 100644 --- a/third_party/aom/av1/encoder/mcomp.c +++ b/third_party/aom/av1/encoder/mcomp.c @@ -110,7 +110,7 @@ static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref, int sad_per_bit) { const MV diff = { (mv->row - ref->row) * 8, (mv->col - ref->col) * 8 }; return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->mvsadcost) * sad_per_bit, + (unsigned)mv_cost(&diff, x->nmvjointcost, x->mvcost) * sad_per_bit, AV1_PROB_COST_SHIFT); } @@ -176,6 +176,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { } /* checks if (r, c) has better score than previous best */ +#if CONFIG_EXT_INTER #define CHECK_BETTER(v, r, c) \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ MV this_mv = { r, c }; \ @@ -183,6 +184,10 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { if (second_pred == NULL) \ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ src_address, src_stride, &sse); \ + else if (mask) \ + thismse = vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ + src_address, src_stride, second_pred, mask, \ + mask_stride, invert_mask, &sse); \ else \ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ src_address, src_stride, &sse, second_pred); \ @@ -197,6 +202,29 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { } else { \ v = INT_MAX; \ } +#else +#define CHECK_BETTER(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + MV this_mv = { r, c }; \ + v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ + if (second_pred == NULL) \ + thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ + src_address, src_stride, &sse); \ + else \ + thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ + src_address, src_stride, &sse, second_pred); \ + v += thismse; \ + if (v < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } +#endif // CONFIG_EXT_INTER #define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c) @@ -206,6 +234,26 @@ static INLINE const uint8_t *upre(const uint8_t *buf, int stride, int r, } /* checks if (r, c) has better score than previous best */ +#if CONFIG_EXT_INTER +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + MV this_mv = { r, c }; \ + thismse = upsampled_pref_error( \ + xd, vfp, src_address, src_stride, upre(y, y_stride, r, c), y_stride, \ + second_pred, mask, mask_stride, invert_mask, w, h, &sse); \ + v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ + v += thismse; \ + if (v < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = 
INT_MAX; \ + } +#else #define CHECK_BETTER1(v, r, c) \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ MV this_mv = { r, c }; \ @@ -224,6 +272,7 @@ static INLINE const uint8_t *upre(const uint8_t *buf, int stride, int r, } else { \ v = INT_MAX; \ } +#endif // CONFIG_EXT_INTER #define FIRST_LEVEL_CHECKS \ { \ @@ -327,20 +376,36 @@ static unsigned int setup_center_error( const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv, int error_per_bit, const aom_variance_fn_ptr_t *vfp, const uint8_t *const src, const int src_stride, const uint8_t *const y, - int y_stride, const uint8_t *second_pred, int w, int h, int offset, - int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) { + int y_stride, const uint8_t *second_pred, +#if CONFIG_EXT_INTER + const uint8_t *mask, int mask_stride, int invert_mask, +#endif + int w, int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1, + int *distortion) { unsigned int besterr; #if CONFIG_HIGHBITDEPTH if (second_pred != NULL) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]); - aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, - y_stride); +#if CONFIG_EXT_INTER + if (mask) + aom_highbd_comp_mask_pred(comp_pred16, second_pred, w, h, y + offset, + y_stride, mask, mask_stride, invert_mask); + else +#endif + aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, + y_stride); besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); - aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); +#if CONFIG_EXT_INTER + if (mask) + aom_comp_mask_pred(comp_pred, second_pred, w, h, y + offset, y_stride, + mask, mask_stride, invert_mask); + else +#endif + aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } } else { @@ -352,7 +417,13 @@ static unsigned int setup_center_error( (void)xd; if (second_pred != NULL) { DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); - aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); +#if CONFIG_EXT_INTER + if (mask) + aom_comp_mask_pred(comp_pred, second_pred, w, h, y + offset, y_stride, + mask, mask_stride, invert_mask); + else +#endif + aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } else { besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1); @@ -391,12 +462,19 @@ int av1_find_best_sub_pixel_tree_pruned_evenmore( MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1, const uint8_t *second_pred, int w, int h, - int use_upsampled_ref) { + unsigned int *sse1, const uint8_t *second_pred, +#if CONFIG_EXT_INTER + const uint8_t *mask, int mask_stride, int invert_mask, +#endif + int w, int h, int use_upsampled_ref) { SETUP_SUBPEL_SEARCH; - besterr = setup_center_error( - xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, - y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); + besterr = + setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address, + src_stride, y, y_stride, second_pred, +#if CONFIG_EXT_INTER + mask, mask_stride, invert_mask, +#endif + w, h, offset, mvjcost, mvcost, sse1, distortion); (void)halfiters; 
(void)quarteriters; (void)eighthiters; @@ -457,14 +535,21 @@ int av1_find_best_sub_pixel_tree_pruned_more( MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1, const uint8_t *second_pred, int w, int h, - int use_upsampled_ref) { + unsigned int *sse1, const uint8_t *second_pred, +#if CONFIG_EXT_INTER + const uint8_t *mask, int mask_stride, int invert_mask, +#endif + int w, int h, int use_upsampled_ref) { SETUP_SUBPEL_SEARCH; (void)use_upsampled_ref; - besterr = setup_center_error( - xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, - y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); + besterr = + setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address, + src_stride, y, y_stride, second_pred, +#if CONFIG_EXT_INTER + mask, mask_stride, invert_mask, +#endif + w, h, offset, mvjcost, mvcost, sse1, distortion); if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) { @@ -519,14 +604,21 @@ int av1_find_best_sub_pixel_tree_pruned( MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1, const uint8_t *second_pred, int w, int h, - int use_upsampled_ref) { + unsigned int *sse1, const uint8_t *second_pred, +#if CONFIG_EXT_INTER + const uint8_t *mask, int mask_stride, int invert_mask, +#endif + int w, int h, int use_upsampled_ref) { SETUP_SUBPEL_SEARCH; (void)use_upsampled_ref; - besterr = setup_center_error( - xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, - y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); + besterr = + setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address, + src_stride, y, y_stride, second_pred, +#if CONFIG_EXT_INTER + mask, mask_stride, invert_mask, +#endif + w, h, offset, mvjcost, mvcost, sse1, distortion); if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && cost_list[4] != INT_MAX) { @@ -612,17 +704,29 @@ static int upsampled_pref_error(const MACROBLOCKD *xd, const aom_variance_fn_ptr_t *vfp, const uint8_t *const src, const int src_stride, const uint8_t *const y, int y_stride, - const uint8_t *second_pred, int w, int h, - unsigned int *sse) { + const uint8_t *second_pred, +#if CONFIG_EXT_INTER + const uint8_t *mask, int mask_stride, + int invert_mask, +#endif + int w, int h, unsigned int *sse) { unsigned int besterr; #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); - if (second_pred != NULL) - aom_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y, - y_stride); - else + if (second_pred != NULL) { +#if CONFIG_EXT_INTER + if (mask) + aom_highbd_comp_mask_upsampled_pred(pred16, second_pred, w, h, y, + y_stride, mask, mask_stride, + invert_mask); + else +#endif + aom_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y, + y_stride); + } else { aom_highbd_upsampled_pred(pred16, w, h, y, y_stride); + } besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse); } else { @@ -631,10 +735,17 @@ static int upsampled_pref_error(const 
MACROBLOCKD *xd, DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); (void)xd; #endif // CONFIG_HIGHBITDEPTH - if (second_pred != NULL) - aom_comp_avg_upsampled_pred(pred, second_pred, w, h, y, y_stride); - else + if (second_pred != NULL) { +#if CONFIG_EXT_INTER + if (mask) + aom_comp_mask_upsampled_pred(pred, second_pred, w, h, y, y_stride, mask, + mask_stride, invert_mask); + else +#endif + aom_comp_avg_upsampled_pred(pred, second_pred, w, h, y, y_stride); + } else { aom_upsampled_pred(pred, w, h, y, y_stride); + } besterr = vfp->vf(pred, w, src, src_stride, sse); #if CONFIG_HIGHBITDEPTH @@ -647,23 +758,32 @@ static unsigned int upsampled_setup_center_error( const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv, int error_per_bit, const aom_variance_fn_ptr_t *vfp, const uint8_t *const src, const int src_stride, const uint8_t *const y, - int y_stride, const uint8_t *second_pred, int w, int h, int offset, - int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) { - unsigned int besterr = upsampled_pref_error( - xd, vfp, src, src_stride, y + offset, y_stride, second_pred, w, h, sse1); + int y_stride, const uint8_t *second_pred, +#if CONFIG_EXT_INTER + const uint8_t *mask, int mask_stride, int invert_mask, +#endif + int w, int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1, + int *distortion) { + unsigned int besterr = upsampled_pref_error(xd, vfp, src, src_stride, + y + offset, y_stride, second_pred, +#if CONFIG_EXT_INTER + mask, mask_stride, invert_mask, +#endif + w, h, sse1); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); return besterr; } -int av1_find_best_sub_pixel_tree(MACROBLOCK *x, const MV *ref_mv, int allow_hp, - int error_per_bit, - const aom_variance_fn_ptr_t *vfp, - int forced_stop, int iters_per_step, - int *cost_list, int *mvjcost, int *mvcost[2], - int *distortion, unsigned int *sse1, - const uint8_t *second_pred, int w, int h, - int use_upsampled_ref) { +int av1_find_best_sub_pixel_tree( + MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, + int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, + unsigned int *sse1, const uint8_t *second_pred, +#if CONFIG_EXT_INTER + const uint8_t *mask, int mask_stride, int invert_mask, +#endif + int w, int h, int use_upsampled_ref) { const uint8_t *const src_address = x->plane[0].src.buf; const int src_stride = x->plane[0].src.stride; const MACROBLOCKD *xd = &x->e_mbd; @@ -700,12 +820,19 @@ int av1_find_best_sub_pixel_tree(MACROBLOCK *x, const MV *ref_mv, int allow_hp, if (use_upsampled_ref) besterr = upsampled_setup_center_error( xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, - y_stride, second_pred, w, h, (offset * 8), mvjcost, mvcost, sse1, - distortion); + y_stride, second_pred, +#if CONFIG_EXT_INTER + mask, mask_stride, invert_mask, +#endif + w, h, (offset * 8), mvjcost, mvcost, sse1, distortion); else - besterr = setup_center_error( - xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y, - y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); + besterr = + setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address, + src_stride, y, y_stride, second_pred, +#if CONFIG_EXT_INTER + mask, mask_stride, invert_mask, +#endif + w, h, offset, mvjcost, mvcost, sse1, distortion); (void)cost_list; // to silence compiler warning @@ -721,14 +848,23 @@ int av1_find_best_sub_pixel_tree(MACROBLOCK *x, const MV 
*ref_mv, int allow_hp, const uint8_t *const pre_address = y + tr * y_stride + tc; thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, - pre_address, y_stride, second_pred, w, - h, &sse); + pre_address, y_stride, second_pred, +#if CONFIG_EXT_INTER + mask, mask_stride, invert_mask, +#endif + w, h, &sse); } else { const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); if (second_pred == NULL) thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, src_stride, &sse); +#if CONFIG_EXT_INTER + else if (mask) + thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, second_pred, mask, + mask_stride, invert_mask, &sse); +#endif else thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), src_address, src_stride, &sse, second_pred); @@ -760,15 +896,24 @@ int av1_find_best_sub_pixel_tree(MACROBLOCK *x, const MV *ref_mv, int allow_hp, if (use_upsampled_ref) { const uint8_t *const pre_address = y + tr * y_stride + tc; - thismse = - upsampled_pref_error(xd, vfp, src_address, src_stride, pre_address, - y_stride, second_pred, w, h, &sse); + thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, + pre_address, y_stride, second_pred, +#if CONFIG_EXT_INTER + mask, mask_stride, invert_mask, +#endif + w, h, &sse); } else { const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); if (second_pred == NULL) thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, src_stride, &sse); +#if CONFIG_EXT_INTER + else if (mask) + thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, second_pred, mask, + mask_stride, invert_mask, &sse); +#endif else thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), src_address, src_stride, &sse, second_pred); @@ -822,6 +967,102 @@ int av1_find_best_sub_pixel_tree(MACROBLOCK *x, const MV *ref_mv, int allow_hp, #undef PRE #undef CHECK_BETTER +#if CONFIG_WARPED_MOTION +unsigned int av1_compute_motion_cost(const AV1_COMP *cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + const MV *this_mv) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + MB_MODE_INFO *mbmi = &mi->mbmi; + const uint8_t *const src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + uint8_t *const dst = xd->plane[0].dst.buf; + const int dst_stride = xd->plane[0].dst.stride; + const aom_variance_fn_ptr_t *vfp = &cpi->fn_ptr[bsize]; + const MV ref_mv = x->mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv; + unsigned int mse; + unsigned int sse; + + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize); + mse = vfp->vf(dst, dst_stride, src, src_stride, &sse); + mse += + mv_err_cost(this_mv, &ref_mv, x->nmvjointcost, x->mvcost, x->errorperbit); + return mse; +} + +// Refine MV in a small range +unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int *pts, int *pts_inref) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + MB_MODE_INFO *mbmi = &mi->mbmi; + const MV neighbors[8] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, + { 0, -2 }, { 2, 0 }, { 0, 2 }, { -2, 0 } }; + const MV ref_mv = x->mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv; + int16_t br = mbmi->mv[0].as_mv.row; + int16_t bc = mbmi->mv[0].as_mv.col; + int16_t *tr = &mbmi->mv[0].as_mv.row; + int16_t *tc = &mbmi->mv[0].as_mv.col; + WarpedMotionParams best_wm_params = 
mbmi->wm_params[0]; + unsigned int bestmse; + int minc, maxc, minr, maxr; + const int start = cm->allow_high_precision_mv ? 0 : 4; + int ite; + + av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, + &ref_mv); + + // Calculate the center position's error + assert(bc >= minc && bc <= maxc && br >= minr && br <= maxr); + bestmse = av1_compute_motion_cost(cpi, x, bsize, mi_row, mi_col, + &mbmi->mv[0].as_mv); + + // MV search + for (ite = 0; ite < 2; ++ite) { + int best_idx = -1; + int idx; + + for (idx = start; idx < start + 4; ++idx) { + unsigned int thismse; + + *tr = br + neighbors[idx].row; + *tc = bc + neighbors[idx].col; + + if (*tc >= minc && *tc <= maxc && *tr >= minr && *tr <= maxr) { + MV this_mv = { *tr, *tc }; + if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, *tr, + *tc, &mbmi->wm_params[0], mi_row, mi_col)) { + thismse = + av1_compute_motion_cost(cpi, x, bsize, mi_row, mi_col, &this_mv); + + if (thismse < bestmse) { + best_idx = idx; + best_wm_params = mbmi->wm_params[0]; + bestmse = thismse; + } + } + } + } + + if (best_idx == -1) break; + + if (best_idx >= 0) { + br += neighbors[best_idx].row; + bc += neighbors[best_idx].col; + } + } + + *tr = br; + *tc = bc; + mbmi->wm_params[0] = best_wm_params; + + return bestmse; +} +#endif // CONFIG_WARPED_MOTION + static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col, int range) { return ((row - range) >= mv_limits->row_min) & @@ -1232,6 +1473,27 @@ int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv, : 0); } +#if CONFIG_EXT_INTER +int av1_get_mvpred_mask_var(const MACROBLOCK *x, const MV *best_mv, + const MV *center_mv, const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + int invert_mask, const aom_variance_fn_ptr_t *vfp, + int use_mvcost) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const MV mv = { best_mv->row * 8, best_mv->col * 8 }; + unsigned int unused; + + return vfp->msvf(what->buf, what->stride, 0, 0, + get_buf_from_mv(in_what, best_mv), in_what->stride, + second_pred, mask, mask_stride, invert_mask, &unused) + + (use_mvcost ? 
mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost, + x->errorperbit) + : 0); +} +#endif + int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param, int sad_per_bit, int do_init_search, int *cost_list, const aom_variance_fn_ptr_t *vfp, int use_mvcost, @@ -1685,17 +1947,12 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, DECLARE_ALIGNED(16, int16_t, src_hbuf[MAX_SB_SQUARE]); DECLARE_ALIGNED(16, int16_t, src_vbuf[MAX_SB_SQUARE]); int idx; - const int bw = 4 << b_width_log2_lookup[bsize]; - const int bh = 4 << b_height_log2_lookup[bsize]; - const int search_width = bw << 1; - const int search_height = bh << 1; const int src_stride = x->plane[0].src.stride; const int ref_stride = xd->plane[0].pre[0].stride; uint8_t const *ref_buf, *src_buf; MV *tmp_mv = &xd->mi[0]->mbmi.mv[0].as_mv; unsigned int best_sad, tmp_sad, sad_arr[4]; MV this_mv; - const int norm_factor = 3 + (bw >> 5); const YV12_BUFFER_CONFIG *scaled_ref_frame = av1_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]); @@ -1724,6 +1981,12 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, } #endif + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int search_width = bw << 1; + const int search_height = bh << 1; + const int norm_factor = 3 + (bw >> 5); + // Set up prediction 1-D reference set ref_buf = xd->plane[0].pre[0].buf - (bw >> 1); for (idx = 0; idx < search_width; idx += 16) { @@ -2195,9 +2458,13 @@ int av1_refining_search_sad(MACROBLOCK *x, MV *ref_mv, int error_per_bit, } // This function is called when we do joint motion search in comp_inter_inter -// mode. +// mode, or when searching for one component of an ext-inter compound mode. int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, const aom_variance_fn_ptr_t *fn_ptr, +#if CONFIG_EXT_INTER + const uint8_t *mask, int mask_stride, + int invert_mask, +#endif const MV *center_mv, const uint8_t *second_pred) { const MV neighbors[8] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 }, { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; @@ -2211,10 +2478,18 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, clamp_mv(best_mv, x->mv_limits.col_min, x->mv_limits.col_max, x->mv_limits.row_min, x->mv_limits.row_max); - best_sad = - fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv), - in_what->stride, second_pred) + - mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit); +#if CONFIG_EXT_INTER + if (mask) + best_sad = fn_ptr->msdf(what->buf, what->stride, + get_buf_from_mv(in_what, best_mv), in_what->stride, + second_pred, mask, mask_stride, invert_mask) + + mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit); + else +#endif + best_sad = + fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv), + in_what->stride, second_pred) + + mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit); for (i = 0; i < search_range; ++i) { int best_site = -1; @@ -2224,9 +2499,17 @@ int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, best_mv->col + neighbors[j].col }; if (is_mv_in(&x->mv_limits, &mv)) { - unsigned int sad = - fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), - in_what->stride, second_pred); + unsigned int sad; +#if CONFIG_EXT_INTER + if (mask) + sad = fn_ptr->msdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride, + second_pred, mask, mask_stride, invert_mask); + else +#endif + sad = 
fn_ptr->sdaf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride, + second_pred); if (sad < best_sad) { sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); if (sad < best_sad) { @@ -2337,612 +2620,20 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, return var; } -#if CONFIG_EXT_INTER -/* returns subpixel variance error function */ -#define DIST(r, c) \ - vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, src_stride, \ - mask, mask_stride, &sse) - -/* checks if (r, c) has better score than previous best */ - -#define MVC(r, c) \ - (mvcost \ - ? ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + mvcost[0][((r)-rr)] + \ - mvcost[1][((c)-rc)]) * \ - error_per_bit + \ - 4096) >> \ - 13 \ - : 0) - -#define CHECK_BETTER(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - thismse = (DIST(r, c)); \ - if ((v = MVC(r, c) + thismse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ - } - -#undef CHECK_BETTER0 -#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c) - -#undef CHECK_BETTER1 -#define CHECK_BETTER1(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - thismse = upsampled_masked_pref_error(xd, mask, mask_stride, vfp, z, \ - src_stride, upre(y, y_stride, r, c), \ - y_stride, w, h, &sse); \ - if ((v = MVC(r, c) + thismse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ - } - -int av1_find_best_masked_sub_pixel_tree( - const MACROBLOCK *x, const uint8_t *mask, int mask_stride, MV *bestmv, - const MV *ref_mv, int allow_hp, int error_per_bit, - const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, - int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, - int is_second) { - const uint8_t *const z = x->plane[0].src.buf; - const int src_stride = x->plane[0].src.stride; - const MACROBLOCKD *xd = &x->e_mbd; - unsigned int besterr = INT_MAX; - unsigned int sse; - int thismse; - unsigned int whichdir; - unsigned int halfiters = iters_per_step; - unsigned int quarteriters = iters_per_step; - unsigned int eighthiters = iters_per_step; - - const int y_stride = xd->plane[0].pre[is_second].stride; - const int offset = bestmv->row * y_stride + bestmv->col; - const uint8_t *const y = xd->plane[0].pre[is_second].buf; - - int rr = ref_mv->row; - int rc = ref_mv->col; - int br = bestmv->row * 8; - int bc = bestmv->col * 8; - int hstep = 4; - int tr = br; - int tc = bc; - int minc, maxc, minr, maxr; - - av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, - ref_mv); - - // central mv - bestmv->row *= 8; - bestmv->col *= 8; - - // calculate central point error - besterr = - vfp->mvf(y + offset, y_stride, z, src_stride, mask, mask_stride, sse1); - *distortion = besterr; - besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - - // 1/2 pel - FIRST_LEVEL_CHECKS; - if (halfiters > 1) { - SECOND_LEVEL_CHECKS; - } - tr = br; - tc = bc; - - // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only - if (forced_stop != 2) { - hstep >>= 1; - FIRST_LEVEL_CHECKS; - if (quarteriters > 1) { - SECOND_LEVEL_CHECKS; - } - tr = br; - tc = bc; - } - - if (allow_hp && forced_stop == 0) { - hstep >>= 1; - FIRST_LEVEL_CHECKS; - if (eighthiters > 1) { - SECOND_LEVEL_CHECKS; - } - tr = br; - tc = bc; - } - // These lines insure static analysis doesn't warn that - // tr and tc aren't used after 
the above point. - (void)tr; - (void)tc; - - bestmv->row = br; - bestmv->col = bc; - - return besterr; -} - -static unsigned int setup_masked_center_error( - const uint8_t *mask, int mask_stride, const MV *bestmv, const MV *ref_mv, - int error_per_bit, const aom_variance_fn_ptr_t *vfp, - const uint8_t *const src, const int src_stride, const uint8_t *const y, - int y_stride, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1, - int *distortion) { - unsigned int besterr; - besterr = - vfp->mvf(y + offset, y_stride, src, src_stride, mask, mask_stride, sse1); - *distortion = besterr; - besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - return besterr; -} - -static int upsampled_masked_pref_error(const MACROBLOCKD *xd, - const uint8_t *mask, int mask_stride, - const aom_variance_fn_ptr_t *vfp, - const uint8_t *const src, - const int src_stride, - const uint8_t *const y, int y_stride, - int w, int h, unsigned int *sse) { - unsigned int besterr; -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); - aom_highbd_upsampled_pred(pred16, w, h, y, y_stride); - - besterr = vfp->mvf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, mask, - mask_stride, sse); - } else { - DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); -#else - DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); - (void)xd; -#endif // CONFIG_HIGHBITDEPTH - aom_upsampled_pred(pred, w, h, y, y_stride); - - besterr = vfp->mvf(pred, w, src, src_stride, mask, mask_stride, sse); -#if CONFIG_HIGHBITDEPTH - } -#endif - return besterr; -} - -static unsigned int upsampled_setup_masked_center_error( - const MACROBLOCKD *xd, const uint8_t *mask, int mask_stride, - const MV *bestmv, const MV *ref_mv, int error_per_bit, - const aom_variance_fn_ptr_t *vfp, const uint8_t *const src, - const int src_stride, const uint8_t *const y, int y_stride, int w, int h, - int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1, - int *distortion) { - unsigned int besterr = - upsampled_masked_pref_error(xd, mask, mask_stride, vfp, src, src_stride, - y + offset, y_stride, w, h, sse1); - *distortion = besterr; - besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - return besterr; -} - -int av1_find_best_masked_sub_pixel_tree_up( - const AV1_COMP *cpi, MACROBLOCK *x, const uint8_t *mask, int mask_stride, - int mi_row, int mi_col, MV *bestmv, const MV *ref_mv, int allow_hp, - int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, - int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1, int is_second, int use_upsampled_ref) { - const uint8_t *const z = x->plane[0].src.buf; - const uint8_t *const src_address = z; - const int src_stride = x->plane[0].src.stride; - MACROBLOCKD *xd = &x->e_mbd; - struct macroblockd_plane *const pd = &xd->plane[0]; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - unsigned int besterr = INT_MAX; - unsigned int sse; - unsigned int thismse; - - int rr = ref_mv->row; - int rc = ref_mv->col; - int br = bestmv->row * 8; - int bc = bestmv->col * 8; - int hstep = 4; - int iter; - int round = 3 - forced_stop; - int tr = br; - int tc = bc; - const MV *search_step = search_step_table; - int idx, best_idx = -1; - unsigned int cost_array[5]; - int kr, kc; - const int w = block_size_wide[mbmi->sb_type]; - const int h = block_size_high[mbmi->sb_type]; - int offset; - int y_stride; - const uint8_t *y; - - const struct buf_2d backup_pred = pd->pre[is_second]; - int minc, maxc, 
minr, maxr; - - av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, - ref_mv); - - if (use_upsampled_ref) { - int ref = xd->mi[0]->mbmi.ref_frame[is_second]; - const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref); - setup_pred_plane(&pd->pre[is_second], mbmi->sb_type, - upsampled_ref->y_buffer, upsampled_ref->y_crop_width, - upsampled_ref->y_crop_height, upsampled_ref->y_stride, - (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x, - pd->subsampling_y); - } - y = pd->pre[is_second].buf; - y_stride = pd->pre[is_second].stride; - offset = bestmv->row * y_stride + bestmv->col; - - if (!allow_hp) - if (round == 3) round = 2; - - bestmv->row *= 8; - bestmv->col *= 8; - - // use_upsampled_ref can be 0 or 1 - if (use_upsampled_ref) - besterr = upsampled_setup_masked_center_error( - xd, mask, mask_stride, bestmv, ref_mv, error_per_bit, vfp, z, - src_stride, y, y_stride, w, h, (offset * 8), mvjcost, mvcost, sse1, - distortion); - else - besterr = setup_masked_center_error( - mask, mask_stride, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, - y_stride, offset, mvjcost, mvcost, sse1, distortion); - - for (iter = 0; iter < round; ++iter) { - // Check vertical and horizontal sub-pixel positions. - for (idx = 0; idx < 4; ++idx) { - tr = br + search_step[idx].row; - tc = bc + search_step[idx].col; - if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { - MV this_mv = { tr, tc }; - - if (use_upsampled_ref) { - const uint8_t *const pre_address = y + tr * y_stride + tc; - - thismse = upsampled_masked_pref_error( - xd, mask, mask_stride, vfp, src_address, src_stride, pre_address, - y_stride, w, h, &sse); - } else { - const uint8_t *const pre_address = - y + (tr >> 3) * y_stride + (tc >> 3); - thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, mask, mask_stride, &sse); - } - - cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, - mvcost, error_per_bit); - - if (cost_array[idx] < besterr) { - best_idx = idx; - besterr = cost_array[idx]; - *distortion = thismse; - *sse1 = sse; - } - } else { - cost_array[idx] = INT_MAX; - } - } - - // Check diagonal sub-pixel position - kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep); - kr = (cost_array[2] <= cost_array[3] ? 
-hstep : hstep); - - tc = bc + kc; - tr = br + kr; - if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { - MV this_mv = { tr, tc }; - - if (use_upsampled_ref) { - const uint8_t *const pre_address = y + tr * y_stride + tc; - - thismse = upsampled_masked_pref_error( - xd, mask, mask_stride, vfp, src_address, src_stride, pre_address, - y_stride, w, h, &sse); - } else { - const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); - - thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, mask, mask_stride, &sse); - } - - cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (cost_array[4] < besterr) { - best_idx = 4; - besterr = cost_array[4]; - *distortion = thismse; - *sse1 = sse; - } - } else { - cost_array[idx] = INT_MAX; - } - - if (best_idx < 4 && best_idx >= 0) { - br += search_step[best_idx].row; - bc += search_step[best_idx].col; - } else if (best_idx == 4) { - br = tr; - bc = tc; - } - - if (iters_per_step > 1 && best_idx != -1) { - if (use_upsampled_ref) { - SECOND_LEVEL_CHECKS_BEST(1); - } else { - SECOND_LEVEL_CHECKS_BEST(0); - } - } - - tr = br; - tc = bc; - - search_step += 4; - hstep >>= 1; - best_idx = -1; - } - - // These lines insure static analysis doesn't warn that - // tr and tc aren't used after the above point. - (void)tr; - (void)tc; - - bestmv->row = br; - bestmv->col = bc; - - if (use_upsampled_ref) { - pd->pre[is_second] = backup_pred; - } - - return besterr; -} - -#undef DIST -#undef MVC -#undef CHECK_BETTER - -static int get_masked_mvpred_var(const MACROBLOCK *x, const uint8_t *mask, - int mask_stride, const MV *best_mv, - const MV *center_mv, - const aom_variance_fn_ptr_t *vfp, - int use_mvcost, int is_second) { - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[is_second]; - const MV mv = { best_mv->row * 8, best_mv->col * 8 }; - unsigned int unused; - - return vfp->mvf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv), - in_what->stride, mask, mask_stride, &unused) + - (use_mvcost ? 
mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost, - x->errorperbit) - : 0); -} - -int masked_refining_search_sad(const MACROBLOCK *x, const uint8_t *mask, - int mask_stride, MV *ref_mv, int error_per_bit, - int search_range, - const aom_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, int is_second) { - const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[is_second]; - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = - fn_ptr->msdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride, mask, mask_stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); - int i, j; - - for (i = 0; i < search_range; i++) { - int best_site = -1; - - for (j = 0; j < 4; j++) { - const MV mv = { ref_mv->row + neighbors[j].row, - ref_mv->col + neighbors[j].col }; - if (is_mv_in(&x->mv_limits, &mv)) { - unsigned int sad = - fn_ptr->msdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), - in_what->stride, mask, mask_stride); - if (sad < best_sad) { - sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); - if (sad < best_sad) { - best_sad = sad; - best_site = j; - } - } - } - } - - if (best_site == -1) { - break; - } else { - ref_mv->row += neighbors[best_site].row; - ref_mv->col += neighbors[best_site].col; - } - } - return best_sad; -} - -int masked_diamond_search_sad(const MACROBLOCK *x, - const search_site_config *cfg, - const uint8_t *mask, int mask_stride, MV *ref_mv, - MV *best_mv, int search_param, int sad_per_bit, - int *num00, const aom_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, int is_second) { - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[is_second]; - // search_param determines the length of the initial step and hence the number - // of iterations - // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = - // (MAX_FIRST_STEP/4) pel... etc. 
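The comment above ties search_param to the starting radius of the diamond pattern: each increment halves the first step. A minimal standalone sketch of that mapping follows; MAX_FIRST_STEP is an assumed value chosen only for illustration, and the real step offsets come from the precomputed search_site_config tables rather than a helper like this.

#include <stdio.h>

#define MAX_FIRST_STEP 32 /* assumed for the sketch, not libaom's constant */

/* Initial diamond radius in full pels implied by a given search_param. */
static int first_step_for_param(int search_param) {
  return MAX_FIRST_STEP >> search_param;
}

int main(void) {
  for (int sp = 0; sp <= 4; ++sp)
    printf("search_param %d -> first step %d pels\n", sp,
           first_step_for_param(sp));
  return 0;
}

A larger search_param converges faster but explores a smaller neighbourhood, which is why the further refinement passes in these drivers re-run the search with step_param + n.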
- const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step]; - const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param; - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - const uint8_t *best_address, *in_what_ref; - int best_sad = INT_MAX; - int best_site = 0; - int last_site = 0; - int i, j, step; - - clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, - x->mv_limits.row_min, x->mv_limits.row_max); - in_what_ref = get_buf_from_mv(in_what, ref_mv); - best_address = in_what_ref; - *num00 = 0; - *best_mv = *ref_mv; - - // Check the starting position - best_sad = fn_ptr->msdf(what->buf, what->stride, best_address, - in_what->stride, mask, mask_stride) + - mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit); - - i = 1; - - for (step = 0; step < tot_steps; step++) { - for (j = 0; j < cfg->searches_per_step; j++) { - const MV mv = { best_mv->row + ss[i].mv.row, - best_mv->col + ss[i].mv.col }; - if (is_mv_in(&x->mv_limits, &mv)) { - int sad = - fn_ptr->msdf(what->buf, what->stride, best_address + ss[i].offset, - in_what->stride, mask, mask_stride); - if (sad < best_sad) { - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - best_site = i; - } - } - } - - i++; - } - - if (best_site != last_site) { - best_mv->row += ss[best_site].mv.row; - best_mv->col += ss[best_site].mv.col; - best_address += ss[best_site].offset; - last_site = best_site; -#if defined(NEW_DIAMOND_SEARCH) - while (1) { - const MV this_mv = { best_mv->row + ss[best_site].mv.row, - best_mv->col + ss[best_site].mv.col }; - if (is_mv_in(&x->mv_limits, &this_mv)) { - int sad = fn_ptr->msdf(what->buf, what->stride, - best_address + ss[best_site].offset, - in_what->stride, mask, mask_stride); - if (sad < best_sad) { - sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - best_mv->row += ss[best_site].mv.row; - best_mv->col += ss[best_site].mv.col; - best_address += ss[best_site].offset; - continue; - } - } - } - break; - } -#endif - } else if (best_address == in_what_ref) { - (*num00)++; - } - } - return best_sad; -} - -int av1_masked_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x, - const uint8_t *mask, int mask_stride, - MV *mvp_full, int step_param, int sadpb, - int further_steps, int do_refine, - const aom_variance_fn_ptr_t *fn_ptr, - const MV *ref_mv, MV *dst_mv, int is_second) { - MV temp_mv; - int thissme, n, num00 = 0; - int bestsme = masked_diamond_search_sad(x, &cpi->ss_cfg, mask, mask_stride, - mvp_full, &temp_mv, step_param, sadpb, - &n, fn_ptr, ref_mv, is_second); - if (bestsme < INT_MAX) - bestsme = get_masked_mvpred_var(x, mask, mask_stride, &temp_mv, ref_mv, - fn_ptr, 1, is_second); - *dst_mv = temp_mv; - - // If there won't be more n-step search, check to see if refining search is - // needed. - if (n > further_steps) do_refine = 0; - - while (n < further_steps) { - ++n; - - if (num00) { - num00--; - } else { - thissme = masked_diamond_search_sad( - x, &cpi->ss_cfg, mask, mask_stride, mvp_full, &temp_mv, - step_param + n, sadpb, &num00, fn_ptr, ref_mv, is_second); - if (thissme < INT_MAX) - thissme = get_masked_mvpred_var(x, mask, mask_stride, &temp_mv, ref_mv, - fn_ptr, 1, is_second); - - // check to see if refining search is needed. 
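Both the refining and diamond loops above use the same two-stage comparison: the raw (masked) SAD is tested first, and the motion-vector rate term is only added for candidates that can still beat the current best. A minimal sketch of that pattern, with hypothetical names:

/* Two-stage SAD test: cheap reject on raw SAD, full cost only for survivors. */
static int maybe_update_best(unsigned int raw_sad, unsigned int mv_rate_cost,
                             unsigned int *best_sad) {
  if (raw_sad >= *best_sad) return 0; /* reject without pricing the MV */
  raw_sad += mv_rate_cost;            /* add the rate term for contenders */
  if (raw_sad >= *best_sad) return 0;
  *best_sad = raw_sad;
  return 1;                           /* caller records this as the new best */
}

Skipping the mvsad_err_cost() lookup for candidates that already lose on raw SAD keeps the inner search loops cheap.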
- if (num00 > further_steps - n) do_refine = 0; - - if (thissme < bestsme) { - bestsme = thissme; - *dst_mv = temp_mv; - } - } - } - - // final 1-away diamond refining search - if (do_refine) { - const int search_range = 8; - MV best_mv = *dst_mv; - thissme = - masked_refining_search_sad(x, mask, mask_stride, &best_mv, sadpb, - search_range, fn_ptr, ref_mv, is_second); - if (thissme < INT_MAX) - thissme = get_masked_mvpred_var(x, mask, mask_stride, &best_mv, ref_mv, - fn_ptr, 1, is_second); - if (thissme < bestsme) { - bestsme = thissme; - *dst_mv = best_mv; - } - } - return bestsme; -} -#endif // CONFIG_EXT_INTER - #if CONFIG_MOTION_VAR /* returns subpixel variance error function */ #define DIST(r, c) \ vfp->osvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, mask, &sse) /* checks if (r, c) has better score than previous best */ -#define MVC(r, c) \ - (mvcost \ - ? ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + mvcost[0][((r)-rr)] + \ - mvcost[1][((c)-rc)]) * \ - error_per_bit + \ - 4096) >> \ - 13 \ - : 0) +#define MVC(r, c) \ + (unsigned int)(mvcost \ + ? ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + \ + mvcost[0][((r)-rr)] + (int64_t)mvcost[1][((c)-rc)]) * \ + error_per_bit + \ + 4096) >> \ + 13 \ + : 0) #define CHECK_BETTER(v, r, c) \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ @@ -3452,15 +3143,21 @@ int av1_obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x, (void)thismse; \ (void)cost_list; // Return the maximum MV. -int av1_return_max_sub_pixel_mv(MACROBLOCK *x, const MV *ref_mv, int allow_hp, - int error_per_bit, - const aom_variance_fn_ptr_t *vfp, - int forced_stop, int iters_per_step, - int *cost_list, int *mvjcost, int *mvcost[2], - int *distortion, unsigned int *sse1, - const uint8_t *second_pred, int w, int h, - int use_upsampled_ref) { +int av1_return_max_sub_pixel_mv( + MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, + int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, + unsigned int *sse1, const uint8_t *second_pred, +#if CONFIG_EXT_INTER + const uint8_t *mask, int mask_stride, int invert_mask, +#endif + int w, int h, int use_upsampled_ref) { COMMON_MV_TEST; +#if CONFIG_EXT_INTER + (void)mask; + (void)mask_stride; + (void)invert_mask; +#endif (void)minr; (void)minc; bestmv->row = maxr; @@ -3472,17 +3169,23 @@ int av1_return_max_sub_pixel_mv(MACROBLOCK *x, const MV *ref_mv, int allow_hp, return besterr; } // Return the minimum MV. 
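The reworked MVC macro above widens the rate-times-error_per_bit product before the ((... + 4096) >> 13) step; 4096 is half of 1 << 13, so the shift performs a round-to-nearest conversion of the MV rate into SAD/variance units. A standalone version of the same arithmetic, with illustrative inputs rather than real encoder state:

#include <stdint.h>

/* Round-to-nearest conversion of an MV rate (joint + row + col table costs)
   into the distortion domain, mirroring ((rate * error_per_bit + 4096) >> 13). */
static unsigned int mv_rate_to_sad_units(int joint_cost, int row_cost,
                                         int col_cost, int error_per_bit) {
  const int64_t rate = (int64_t)joint_cost + row_cost + col_cost;
  /* The 64-bit product is the point of the patched macro: with 32-bit
     arithmetic a large rate times a large error_per_bit can overflow. */
  return (unsigned int)((rate * error_per_bit + 4096) >> 13);
}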
-int av1_return_min_sub_pixel_mv(MACROBLOCK *x, const MV *ref_mv, int allow_hp, - int error_per_bit, - const aom_variance_fn_ptr_t *vfp, - int forced_stop, int iters_per_step, - int *cost_list, int *mvjcost, int *mvcost[2], - int *distortion, unsigned int *sse1, - const uint8_t *second_pred, int w, int h, - int use_upsampled_ref) { +int av1_return_min_sub_pixel_mv( + MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit, + const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, + int *cost_list, int *mvjcost, int *mvcost[2], int *distortion, + unsigned int *sse1, const uint8_t *second_pred, +#if CONFIG_EXT_INTER + const uint8_t *mask, int mask_stride, int invert_mask, +#endif + int w, int h, int use_upsampled_ref) { COMMON_MV_TEST; (void)maxr; (void)maxc; +#if CONFIG_EXT_INTER + (void)mask; + (void)mask_stride; + (void)invert_mask; +#endif bestmv->row = minr; bestmv->col = minc; besterr = 0; diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h index 8465860ad..7e8b4b29d 100644 --- a/third_party/aom/av1/encoder/mcomp.h +++ b/third_party/aom/av1/encoder/mcomp.h @@ -58,6 +58,13 @@ int av1_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv, int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv, const MV *center_mv, const uint8_t *second_pred, const aom_variance_fn_ptr_t *vfp, int use_mvcost); +#if CONFIG_EXT_INTER +int av1_get_mvpred_mask_var(const MACROBLOCK *x, const MV *best_mv, + const MV *center_mv, const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + int invert_mask, const aom_variance_fn_ptr_t *vfp, + int use_mvcost); +#endif struct AV1_COMP; struct SPEED_FEATURES; @@ -91,8 +98,11 @@ typedef int(fractional_mv_step_fp)( const aom_variance_fn_ptr_t *vfp, int forced_stop, // 0 - full, 1 - qtr only, 2 - half only int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], - int *distortion, unsigned int *sse1, const uint8_t *second_pred, int w, - int h, int use_upsampled_ref); + int *distortion, unsigned int *sse1, const uint8_t *second_pred, +#if CONFIG_EXT_INTER + const uint8_t *mask, int mask_stride, int invert_mask, +#endif + int w, int h, int use_upsampled_ref); extern fractional_mv_step_fp av1_find_best_sub_pixel_tree; extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned; @@ -113,6 +123,10 @@ typedef int (*av1_diamond_search_fn_t)( int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range, const aom_variance_fn_ptr_t *fn_ptr, +#if CONFIG_EXT_INTER + const uint8_t *mask, int mask_stride, + int invert_mask, +#endif const MV *center_mv, const uint8_t *second_pred); struct AV1_COMP; @@ -122,27 +136,6 @@ int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x, int error_per_bit, int *cost_list, const MV *ref_mv, int var_max, int rd); -#if CONFIG_EXT_INTER -int av1_find_best_masked_sub_pixel_tree( - const MACROBLOCK *x, const uint8_t *mask, int mask_stride, MV *bestmv, - const MV *ref_mv, int allow_hp, int error_per_bit, - const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, - int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, - int is_second); -int av1_find_best_masked_sub_pixel_tree_up( - const struct AV1_COMP *cpi, MACROBLOCK *x, const uint8_t *mask, - int mask_stride, int mi_row, int mi_col, MV *bestmv, const MV *ref_mv, - int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp, - int forced_stop, int iters_per_step, int *mvjcost, int *mvcost[2], - int *distortion, unsigned int *sse1, int is_second, int 
use_upsampled_ref); -int av1_masked_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x, - const uint8_t *mask, int mask_stride, - MV *mvp_full, int step_param, int sadpb, - int further_steps, int do_refine, - const aom_variance_fn_ptr_t *fn_ptr, - const MV *ref_mv, MV *dst_mv, int is_second); -#endif // CONFIG_EXT_INTER - #if CONFIG_MOTION_VAR int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full, int step_param, int sadpb, @@ -160,4 +153,14 @@ int av1_find_best_obmc_sub_pixel_tree_up( } // extern "C" #endif +#if CONFIG_WARPED_MOTION +unsigned int av1_compute_motion_cost(const struct AV1_COMP *cpi, + MACROBLOCK *const x, BLOCK_SIZE bsize, + int mi_row, int mi_col, const MV *this_mv); +unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi, + MACROBLOCK *const x, BLOCK_SIZE bsize, + int mi_row, int mi_col, int *pts, + int *pts_inref); +#endif // CONFIG_WARPED_MOTION + #endif // AV1_ENCODER_MCOMP_H_ diff --git a/third_party/aom/av1/encoder/palette.c b/third_party/aom/av1/encoder/palette.c index 355141de5..235964dde 100644 --- a/third_party/aom/av1/encoder/palette.c +++ b/third_party/aom/av1/encoder/palette.c @@ -167,31 +167,58 @@ int av1_count_colors(const uint8_t *src, int stride, int rows, int cols) { } #if CONFIG_PALETTE_DELTA_ENCODING -int av1_get_palette_delta_bits_y(const PALETTE_MODE_INFO *const pmi, - int bit_depth, int *min_bits) { - const int n = pmi->palette_size[0]; - int max_d = 0, i; - *min_bits = bit_depth - 3; - for (i = 1; i < n; ++i) { - const int delta = pmi->palette_colors[i] - pmi->palette_colors[i - 1]; - assert(delta > 0); - if (delta > max_d) max_d = delta; +static int delta_encode_cost(const int *colors, int num, int bit_depth, + int min_val) { + if (num <= 0) return 0; + int bits_cost = bit_depth; + if (num == 1) return bits_cost; + bits_cost += 2; + int max_delta = 0; + int deltas[PALETTE_MAX_SIZE]; + const int min_bits = bit_depth - 3; + for (int i = 1; i < num; ++i) { + const int delta = colors[i] - colors[i - 1]; + deltas[i - 1] = delta; + assert(delta >= min_val); + if (delta > max_delta) max_delta = delta; + } + int bits_per_delta = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits); + assert(bits_per_delta <= bit_depth); + int range = (1 << bit_depth) - colors[0] - min_val; + for (int i = 0; i < num - 1; ++i) { + bits_cost += bits_per_delta; + range -= deltas[i]; + bits_per_delta = AOMMIN(bits_per_delta, av1_ceil_log2(range)); } - return AOMMAX(av1_ceil_log2(max_d), *min_bits); + return bits_cost; } -int av1_get_palette_delta_bits_u(const PALETTE_MODE_INFO *const pmi, - int bit_depth, int *min_bits) { - const int n = pmi->palette_size[1]; - int max_d = 0, i; - *min_bits = bit_depth - 3; - for (i = 1; i < n; ++i) { - const int delta = pmi->palette_colors[PALETTE_MAX_SIZE + i] - - pmi->palette_colors[PALETTE_MAX_SIZE + i - 1]; - assert(delta >= 0); - if (delta > max_d) max_d = delta; +int av1_index_color_cache(const uint16_t *color_cache, int n_cache, + const uint16_t *colors, int n_colors, + uint8_t *cache_color_found, int *out_cache_colors) { + if (n_cache <= 0) { + for (int i = 0; i < n_colors; ++i) out_cache_colors[i] = colors[i]; + return n_colors; } - return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits); + memset(cache_color_found, 0, n_cache * sizeof(*cache_color_found)); + int n_in_cache = 0; + int in_cache_flags[PALETTE_MAX_SIZE]; + memset(in_cache_flags, 0, sizeof(in_cache_flags)); + for (int i = 0; i < n_cache && n_in_cache < n_colors; ++i) { + for (int j = 0; j < n_colors; ++j) { + if (colors[j] 
== color_cache[i]) { + in_cache_flags[j] = 1; + cache_color_found[i] = 1; + ++n_in_cache; + break; + } + } + } + int j = 0; + for (int i = 0; i < n_colors; ++i) + if (!in_cache_flags[i]) out_cache_colors[j++] = colors[i]; + assert(j == n_colors - n_in_cache); + return j; } int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi, @@ -199,10 +226,10 @@ int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi, int *min_bits) { const int n = pmi->palette_size[1]; const int max_val = 1 << bit_depth; - int max_d = 0, i; + int max_d = 0; *min_bits = bit_depth - 4; *zero_count = 0; - for (i = 1; i < n; ++i) { + for (int i = 1; i < n; ++i) { const int delta = pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] - pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1]; const int v = abs(delta); @@ -215,26 +242,42 @@ int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi, #endif // CONFIG_PALETTE_DELTA_ENCODING int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, +#if CONFIG_PALETTE_DELTA_ENCODING + uint16_t *color_cache, int n_cache, +#endif // CONFIG_PALETTE_DELTA_ENCODING int bit_depth) { const int n = pmi->palette_size[0]; #if CONFIG_PALETTE_DELTA_ENCODING - int min_bits = 0; - const int bits = av1_get_palette_delta_bits_y(pmi, bit_depth, &min_bits); - return av1_cost_bit(128, 0) * (2 + bit_depth + bits * (n - 1)); + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = + av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n, + cache_color_found, out_cache_colors); + const int total_bits = + n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 1); + return total_bits * av1_cost_bit(128, 0); #else return bit_depth * n * av1_cost_bit(128, 0); #endif // CONFIG_PALETTE_DELTA_ENCODING } int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, +#if CONFIG_PALETTE_DELTA_ENCODING + uint16_t *color_cache, int n_cache, +#endif // CONFIG_PALETTE_DELTA_ENCODING int bit_depth) { const int n = pmi->palette_size[1]; #if CONFIG_PALETTE_DELTA_ENCODING - int cost = 0; + int total_bits = 0; // U channel palette color cost. - int min_bits_u = 0; - const int bits_u = av1_get_palette_delta_bits_u(pmi, bit_depth, &min_bits_u); - cost += av1_cost_bit(128, 0) * (2 + bit_depth + bits_u * (n - 1)); + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = av1_index_color_cache( + color_cache, n_cache, pmi->palette_colors + PALETTE_MAX_SIZE, n, + cache_color_found, out_cache_colors); + total_bits += + n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 0); + // V channel palette color cost. 
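With the color cache, av1_palette_color_cost_y above prices a palette as n_cache flag bits (one per cached color, saying whether it is reused) plus delta coding of whatever is left, which is what n_cache + delta_encode_cost(...) adds up before scaling by av1_cost_bit(128, 0). A toy version of the cache-partitioning step, simplified and not the library routine:

#include <stdint.h>

/* Toy cache partitioning: mark which palette colors already sit in the cache
   and collect the rest for delta coding. Returns the out-of-cache count.
   Array sizes and names are illustrative only. */
static int partition_against_cache(const uint16_t *cache, int n_cache,
                                   const uint16_t *palette, int n_colors,
                                   uint16_t *out_of_cache) {
  int found[64] = { 0 }; /* assumes n_colors <= 64 for the sketch */
  for (int i = 0; i < n_cache; ++i) {
    for (int j = 0; j < n_colors; ++j) {
      if (!found[j] && palette[j] == cache[i]) {
        found[j] = 1;
        break; /* one cache entry accounts for at most one palette color */
      }
    }
  }
  int n_out = 0;
  for (int j = 0; j < n_colors; ++j)
    if (!found[j]) out_of_cache[n_out++] = palette[j];
  return n_out;
}

For chroma the same split is applied to the U colors, while the V colors are priced either with signed deltas or as raw values, whichever of bits_using_delta and bits_using_raw is smaller.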
int zero_count = 0, min_bits_v = 0; const int bits_v = @@ -242,8 +285,8 @@ int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, const int bits_using_delta = 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count; const int bits_using_raw = bit_depth * n; - cost += av1_cost_bit(128, 0) * (1 + AOMMIN(bits_using_delta, bits_using_raw)); - return cost; + total_bits += 1 + AOMMIN(bits_using_delta, bits_using_raw); + return total_bits * av1_cost_bit(128, 0); #else return 2 * bit_depth * n * av1_cost_bit(128, 0); #endif // CONFIG_PALETTE_DELTA_ENCODING diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h index 5403ac5e6..f5a3c1bdd 100644 --- a/third_party/aom/av1/encoder/palette.h +++ b/third_party/aom/av1/encoder/palette.h @@ -45,13 +45,12 @@ int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, #endif // CONFIG_HIGHBITDEPTH #if CONFIG_PALETTE_DELTA_ENCODING -// Return the number of bits used to transmit each luma palette color delta. -int av1_get_palette_delta_bits_y(const PALETTE_MODE_INFO *const pmi, - int bit_depth, int *min_bits); - -// Return the number of bits used to transmit each U palette color delta. -int av1_get_palette_delta_bits_u(const PALETTE_MODE_INFO *const pmi, - int bit_depth, int *min_bits); +// Given a color cache and a set of base colors, find if each cache color is +// present in the base colors, record the binary results in "cache_color_found". +// Record the colors that are not in the color cache in "out_cache_colors". +int av1_index_color_cache(const uint16_t *color_cache, int n_cache, + const uint16_t *colors, int n_colors, + uint8_t *cache_color_found, int *out_cache_colors); // Return the number of bits used to transmit each v palette color delta; // assign zero_count with the number of deltas being 0. @@ -60,10 +59,17 @@ int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi, #endif // CONFIG_PALETTE_DELTA_ENCODING // Return the rate cost for transmitting luma palette color values. -int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, int bit_depth); +int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, +#if CONFIG_PALETTE_DELTA_ENCODING + uint16_t *color_cache, int n_cache, +#endif // CONFIG_PALETTE_DELTA_ENCODING + int bit_depth); // Return the rate cost for transmitting chroma palette color values. int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, +#if CONFIG_PALETTE_DELTA_ENCODING + uint16_t *color_cache, int n_cache, +#endif // CONFIG_PALETTE_DELTA_ENCODING int bit_depth); #ifdef __cplusplus diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c index 21410e0af..4a446d24e 100644 --- a/third_party/aom/av1/encoder/pickrst.c +++ b/third_party/aom/av1/encoder/pickrst.c @@ -31,17 +31,18 @@ #include "av1/encoder/encoder.h" #include "av1/encoder/picklpf.h" #include "av1/encoder/pickrst.h" +#include "av1/encoder/mathutils.h" // When set to RESTORE_WIENER or RESTORE_SGRPROJ only those are allowed. // When set to RESTORE_NONE (0) we allow switchable. 
const RestorationType force_restore_type = RESTORE_NONE; // Number of Wiener iterations -#define NUM_WIENER_ITERS 10 +#define NUM_WIENER_ITERS 5 typedef double (*search_restore_type)(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, int partial_frame, - RestorationInfo *info, + int plane, RestorationInfo *info, RestorationType *rest_level, double *best_tile_cost, YV12_BUFFER_CONFIG *dst_frame); @@ -216,6 +217,62 @@ static int64_t get_pixel_proj_error(uint8_t *src8, int width, int height, return err; } +#define USE_SGRPROJ_REFINEMENT_SEARCH 1 +static int64_t finer_search_pixel_proj_error( + uint8_t *src8, int width, int height, int src_stride, uint8_t *dat8, + int dat_stride, int bit_depth, int32_t *flt1, int flt1_stride, + int32_t *flt2, int flt2_stride, int start_step, int *xqd) { + int64_t err = get_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, bit_depth, flt1, flt1_stride, + flt2, flt2_stride, xqd); + (void)start_step; +#if USE_SGRPROJ_REFINEMENT_SEARCH + int64_t err2; + int tap_min[] = { SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MIN1 }; + int tap_max[] = { SGRPROJ_PRJ_MAX0, SGRPROJ_PRJ_MAX1 }; + for (int s = start_step; s >= 1; s >>= 1) { + for (int p = 0; p < 2; ++p) { + int skip = 0; + do { + if (xqd[p] - s >= tap_min[p]) { + xqd[p] -= s; + err2 = get_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, bit_depth, flt1, flt1_stride, + flt2, flt2_stride, xqd); + if (err2 > err) { + xqd[p] += s; + } else { + err = err2; + skip = 1; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + if (skip) break; + do { + if (xqd[p] + s <= tap_max[p]) { + xqd[p] += s; + err2 = get_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, bit_depth, flt1, flt1_stride, + flt2, flt2_stride, xqd); + if (err2 > err) { + xqd[p] -= s; + } else { + err = err2; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + } + } +#endif // USE_SGRPROJ_REFINEMENT_SEARCH + return err; +} + static void get_proj_subspace(uint8_t *src8, int width, int height, int src_stride, uint8_t *dat8, int dat_stride, int bit_depth, int32_t *flt1, int flt1_stride, @@ -329,12 +386,14 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height, #if CONFIG_HIGHBITDEPTH } #endif + aom_clear_system_state(); get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride, bit_depth, flt1, width, flt2, width, exq); + aom_clear_system_state(); encode_xq(exq, exqd); - err = - get_pixel_proj_error(src8, width, height, src_stride, dat8, dat_stride, - bit_depth, flt1, width, flt2, width, exqd); + err = finer_search_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, bit_depth, flt1, width, + flt2, width, 2, exqd); if (besterr == -1 || err < besterr) { bestep = ep; besterr = err; @@ -362,8 +421,9 @@ static int count_sgrproj_bits(SgrprojInfo *sgrproj_info, } static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, - int partial_frame, RestorationInfo *info, - RestorationType *type, double *best_tile_cost, + int partial_frame, int plane, + RestorationInfo *info, RestorationType *type, + double *best_tile_cost, YV12_BUFFER_CONFIG *dst_frame) { SgrprojInfo *sgrproj_info = info->sgrproj_info; double err, cost_norestore, cost_sgrproj; @@ -374,44 +434,68 @@ static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, RestorationInfo *rsi = &cpi->rst_search[0]; int tile_idx, 
tile_width, tile_height, nhtiles, nvtiles; int h_start, h_end, v_start, v_end; - // Allocate for the src buffer at high precision - const int ntiles = av1_get_rest_ntiles( - cm->width, cm->height, cm->rst_info[0].restoration_tilesize, &tile_width, - &tile_height, &nhtiles, &nvtiles); + int width, height, src_stride, dgd_stride; + uint8_t *dgd_buffer, *src_buffer; + if (plane == AOM_PLANE_Y) { + width = cm->width; + height = cm->height; + src_buffer = src->y_buffer; + src_stride = src->y_stride; + dgd_buffer = dgd->y_buffer; + dgd_stride = dgd->y_stride; + assert(width == dgd->y_crop_width); + assert(height == dgd->y_crop_height); + assert(width == src->y_crop_width); + assert(height == src->y_crop_height); + } else { + width = src->uv_crop_width; + height = src->uv_crop_height; + src_stride = src->uv_stride; + dgd_stride = dgd->uv_stride; + src_buffer = plane == AOM_PLANE_U ? src->u_buffer : src->v_buffer; + dgd_buffer = plane == AOM_PLANE_U ? dgd->u_buffer : dgd->v_buffer; + assert(width == dgd->uv_crop_width); + assert(height == dgd->uv_crop_height); + } + const int ntiles = + av1_get_rest_ntiles(width, height, cm->rst_info[0].restoration_tilesize, + &tile_width, &tile_height, &nhtiles, &nvtiles); SgrprojInfo ref_sgrproj_info; set_default_sgrproj(&ref_sgrproj_info); - rsi->frame_restoration_type = RESTORE_SGRPROJ; + rsi[plane].frame_restoration_type = RESTORE_SGRPROJ; for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { - rsi->restoration_type[tile_idx] = RESTORE_NONE; + rsi[plane].restoration_type[tile_idx] = RESTORE_NONE; } // Compute best Sgrproj filters for each tile for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width, - tile_height, cm->width, cm->height, 0, 0, &h_start, - &h_end, &v_start, &v_end); + tile_height, width, height, 0, 0, &h_start, &h_end, + &v_start, &v_end); err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start, - h_end - h_start, v_start, v_end - v_start, 1); + h_end - h_start, v_start, v_end - v_start, + (1 << plane)); // #bits when a tile is not restored bits = av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 0); cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); best_tile_cost[tile_idx] = DBL_MAX; search_selfguided_restoration( - dgd->y_buffer + v_start * dgd->y_stride + h_start, h_end - h_start, - v_end - v_start, dgd->y_stride, - src->y_buffer + v_start * src->y_stride + h_start, src->y_stride, + dgd_buffer + v_start * dgd_stride + h_start, h_end - h_start, + v_end - v_start, dgd_stride, + src_buffer + v_start * src_stride + h_start, src_stride, #if CONFIG_HIGHBITDEPTH cm->bit_depth, #else 8, #endif // CONFIG_HIGHBITDEPTH - &rsi->sgrproj_info[tile_idx].ep, rsi->sgrproj_info[tile_idx].xqd, - cm->rst_internal.tmpbuf); - rsi->restoration_type[tile_idx] = RESTORE_SGRPROJ; - err = try_restoration_tile(src, cpi, rsi, 1, partial_frame, tile_idx, 0, 0, - dst_frame); - bits = count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info) + &rsi[plane].sgrproj_info[tile_idx].ep, + rsi[plane].sgrproj_info[tile_idx].xqd, cm->rst_internal.tmpbuf); + rsi[plane].restoration_type[tile_idx] = RESTORE_SGRPROJ; + err = try_restoration_tile(src, cpi, rsi, (1 << plane), partial_frame, + tile_idx, 0, 0, dst_frame); + bits = count_sgrproj_bits(&rsi[plane].sgrproj_info[tile_idx], + &ref_sgrproj_info) << AV1_PROB_COST_SHIFT; bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 1); cost_sgrproj = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); @@ -419,35 +503,34 @@ static double search_sgrproj(const 
YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, type[tile_idx] = RESTORE_NONE; } else { type[tile_idx] = RESTORE_SGRPROJ; - memcpy(&sgrproj_info[tile_idx], &rsi->sgrproj_info[tile_idx], + memcpy(&sgrproj_info[tile_idx], &rsi[plane].sgrproj_info[tile_idx], sizeof(sgrproj_info[tile_idx])); - bits = count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info) - << AV1_PROB_COST_SHIFT; memcpy(&ref_sgrproj_info, &sgrproj_info[tile_idx], sizeof(ref_sgrproj_info)); best_tile_cost[tile_idx] = err; } - rsi->restoration_type[tile_idx] = RESTORE_NONE; + rsi[plane].restoration_type[tile_idx] = RESTORE_NONE; } // Cost for Sgrproj filtering set_default_sgrproj(&ref_sgrproj_info); - bits = frame_level_restore_bits[rsi->frame_restoration_type] + bits = frame_level_restore_bits[rsi[plane].frame_restoration_type] << AV1_PROB_COST_SHIFT; for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, type[tile_idx] != RESTORE_NONE); - memcpy(&rsi->sgrproj_info[tile_idx], &sgrproj_info[tile_idx], + memcpy(&rsi[plane].sgrproj_info[tile_idx], &sgrproj_info[tile_idx], sizeof(sgrproj_info[tile_idx])); if (type[tile_idx] == RESTORE_SGRPROJ) { - bits += - count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info) - << AV1_PROB_COST_SHIFT; - memcpy(&ref_sgrproj_info, &rsi->sgrproj_info[tile_idx], + bits += count_sgrproj_bits(&rsi[plane].sgrproj_info[tile_idx], + &ref_sgrproj_info) + << AV1_PROB_COST_SHIFT; + memcpy(&ref_sgrproj_info, &rsi[plane].sgrproj_info[tile_idx], sizeof(ref_sgrproj_info)); } - rsi->restoration_type[tile_idx] = type[tile_idx]; + rsi[plane].restoration_type[tile_idx] = type[tile_idx]; } - err = try_restoration_frame(src, cpi, rsi, 1, partial_frame, dst_frame); + err = try_restoration_frame(src, cpi, rsi, (1 << plane), partial_frame, + dst_frame); cost_sgrproj = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); return cost_sgrproj; @@ -560,46 +643,6 @@ static void compute_stats_highbd(uint8_t *dgd8, uint8_t *src8, int h_start, } #endif // CONFIG_HIGHBITDEPTH -// Solves Ax = b, where x and b are column vectors -static int linsolve(int n, double *A, int stride, double *b, double *x) { - int i, j, k; - double c; - - aom_clear_system_state(); - - // Forward elimination - for (k = 0; k < n - 1; k++) { - // Bring the largest magitude to the diagonal position - for (i = n - 1; i > k; i--) { - if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) { - for (j = 0; j < n; j++) { - c = A[i * stride + j]; - A[i * stride + j] = A[(i - 1) * stride + j]; - A[(i - 1) * stride + j] = c; - } - c = b[i]; - b[i] = b[i - 1]; - b[i - 1] = c; - } - } - for (i = k; i < n - 1; i++) { - if (fabs(A[k * stride + k]) < 1e-10) return 0; - c = A[(i + 1) * stride + k] / A[k * stride + k]; - for (j = 0; j < n; j++) A[(i + 1) * stride + j] -= c * A[k * stride + j]; - b[i + 1] -= c * b[k]; - } - } - // Backward substitution - for (i = n - 1; i >= 0; i--) { - if (fabs(A[i * stride + i]) < 1e-10) return 0; - c = 0; - for (j = i + 1; j <= n - 1; j++) c += A[i * stride + j] * x[j]; - x[i] = (b[i] - c) / A[i * stride + i]; - } - - return 1; -} - static INLINE int wrap_index(int i) { return (i >= WIENER_HALFWIN1 ? 
WIENER_WIN - 1 - i : i); } @@ -696,8 +739,10 @@ static void update_b_sep_sym(double **Mc, double **Hc, double *a, double *b) { static int wiener_decompose_sep_sym(double *M, double *H, double *a, double *b) { - static const double init_filt[WIENER_WIN] = { - 0.035623, -0.127154, 0.211436, 0.760190, 0.211436, -0.127154, 0.035623, + static const int init_filt[WIENER_WIN] = { + WIENER_FILT_TAP0_MIDV, WIENER_FILT_TAP1_MIDV, WIENER_FILT_TAP2_MIDV, + WIENER_FILT_TAP3_MIDV, WIENER_FILT_TAP2_MIDV, WIENER_FILT_TAP1_MIDV, + WIENER_FILT_TAP0_MIDV, }; int i, j, iter; double *Hc[WIENER_WIN2]; @@ -709,8 +754,9 @@ static int wiener_decompose_sep_sym(double *M, double *H, double *a, H + i * WIENER_WIN * WIENER_WIN2 + j * WIENER_WIN; } } - memcpy(a, init_filt, sizeof(*a) * WIENER_WIN); - memcpy(b, init_filt, sizeof(*b) * WIENER_WIN); + for (i = 0; i < WIENER_WIN; i++) { + a[i] = b[i] = (double)init_filt[i] / WIENER_FILT_STEP; + } iter = 1; while (iter < NUM_WIENER_ITERS) { @@ -812,158 +858,117 @@ static int count_wiener_bits(WienerInfo *wiener_info, return bits; } -static double search_wiener_uv(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, - int partial_frame, int plane, - RestorationInfo *info, RestorationType *type, - YV12_BUFFER_CONFIG *dst_frame) { - WienerInfo *wiener_info = info->wiener_info; - AV1_COMMON *const cm = &cpi->common; - RestorationInfo *rsi = cpi->rst_search; - int64_t err; - int bits; - double cost_wiener, cost_norestore, cost_wiener_frame, cost_norestore_frame; - MACROBLOCK *x = &cpi->td.mb; - double M[WIENER_WIN2]; - double H[WIENER_WIN2 * WIENER_WIN2]; - double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN]; - const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show; - const int width = src->uv_crop_width; - const int height = src->uv_crop_height; - const int src_stride = src->uv_stride; - const int dgd_stride = dgd->uv_stride; - double score; - int tile_idx, tile_width, tile_height, nhtiles, nvtiles; - int h_start, h_end, v_start, v_end; - const int ntiles = - av1_get_rest_ntiles(width, height, cm->rst_info[1].restoration_tilesize, - &tile_width, &tile_height, &nhtiles, &nvtiles); - WienerInfo ref_wiener_info; - set_default_wiener(&ref_wiener_info); - assert(width == dgd->uv_crop_width); - assert(height == dgd->uv_crop_height); - - rsi[plane].frame_restoration_type = RESTORE_NONE; - err = sse_restoration_frame(cm, src, cm->frame_to_show, (1 << plane)); - bits = 0; - cost_norestore_frame = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); - - rsi[plane].frame_restoration_type = RESTORE_WIENER; - - for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { - rsi[plane].restoration_type[tile_idx] = RESTORE_NONE; - } - - // Compute best Wiener filters for each tile - for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { - av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width, - tile_height, width, height, 0, 0, &h_start, &h_end, - &v_start, &v_end); - err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start, - h_end - h_start, v_start, v_end - v_start, - 1 << plane); - // #bits when a tile is not restored - bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0); - cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); - // best_tile_cost[tile_idx] = DBL_MAX; - - av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width, - tile_height, width, height, WIENER_HALFWIN, - WIENER_HALFWIN, &h_start, &h_end, &v_start, - &v_end); - if (plane == AOM_PLANE_U) { -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - compute_stats_highbd(dgd->u_buffer, src->u_buffer, h_start, 
h_end, - v_start, v_end, dgd_stride, src_stride, M, H); - else -#endif // CONFIG_HIGHBITDEPTH - compute_stats(dgd->u_buffer, src->u_buffer, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M, H); - } else if (plane == AOM_PLANE_V) { -#if CONFIG_HIGHBITDEPTH - if (cm->use_highbitdepth) - compute_stats_highbd(dgd->v_buffer, src->v_buffer, h_start, h_end, - v_start, v_end, dgd_stride, src_stride, M, H); - else -#endif // CONFIG_HIGHBITDEPTH - compute_stats(dgd->v_buffer, src->v_buffer, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M, H); - } else { - assert(0); - } - - type[tile_idx] = RESTORE_WIENER; - - if (!wiener_decompose_sep_sym(M, H, vfilterd, hfilterd)) { - type[tile_idx] = RESTORE_NONE; - continue; - } - quantize_sym_filter(vfilterd, rsi[plane].wiener_info[tile_idx].vfilter); - quantize_sym_filter(hfilterd, rsi[plane].wiener_info[tile_idx].hfilter); - - // Filter score computes the value of the function x'*A*x - x'*b for the - // learned filter and compares it against identity filer. If there is no - // reduction in the function, the filter is reverted back to identity - score = compute_score(M, H, rsi[plane].wiener_info[tile_idx].vfilter, - rsi[plane].wiener_info[tile_idx].hfilter); - if (score > 0.0) { - type[tile_idx] = RESTORE_NONE; - continue; - } - - rsi[plane].restoration_type[tile_idx] = RESTORE_WIENER; - err = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, - tile_idx, 0, 0, dst_frame); - bits = - count_wiener_bits(&rsi[plane].wiener_info[tile_idx], &ref_wiener_info) - << AV1_PROB_COST_SHIFT; - // bits = WIENER_FILT_BITS << AV1_PROB_COST_SHIFT; - bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1); - cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); - if (cost_wiener >= cost_norestore) { - type[tile_idx] = RESTORE_NONE; - } else { - type[tile_idx] = RESTORE_WIENER; - memcpy(&wiener_info[tile_idx], &rsi[plane].wiener_info[tile_idx], - sizeof(wiener_info[tile_idx])); - memcpy(&ref_wiener_info, &rsi[plane].wiener_info[tile_idx], - sizeof(ref_wiener_info)); +#define USE_WIENER_REFINEMENT_SEARCH 1 +static int64_t finer_tile_search_wiener(const YV12_BUFFER_CONFIG *src, + AV1_COMP *cpi, RestorationInfo *rsi, + int start_step, int plane, int tile_idx, + int partial_frame, + YV12_BUFFER_CONFIG *dst_frame) { + int64_t err = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, + tile_idx, 0, 0, dst_frame); + (void)start_step; +#if USE_WIENER_REFINEMENT_SEARCH + int64_t err2; + int tap_min[] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV, + WIENER_FILT_TAP2_MINV }; + int tap_max[] = { WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV, + WIENER_FILT_TAP2_MAXV }; + // printf("err pre = %"PRId64"\n", err); + for (int s = start_step; s >= 1; s >>= 1) { + for (int p = 0; p < WIENER_HALFWIN; ++p) { + int skip = 0; + do { + if (rsi[plane].wiener_info[tile_idx].hfilter[p] - s >= tap_min[p]) { + rsi[plane].wiener_info[tile_idx].hfilter[p] -= s; + rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] -= s; + rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] += 2 * s; + err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, + tile_idx, 0, 0, dst_frame); + if (err2 > err) { + rsi[plane].wiener_info[tile_idx].hfilter[p] += s; + rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] += s; + rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] -= 2 * s; + } else { + err = err2; + skip = 1; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + 
} while (1); + if (skip) break; + do { + if (rsi[plane].wiener_info[tile_idx].hfilter[p] + s <= tap_max[p]) { + rsi[plane].wiener_info[tile_idx].hfilter[p] += s; + rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] += s; + rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] -= 2 * s; + err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, + tile_idx, 0, 0, dst_frame); + if (err2 > err) { + rsi[plane].wiener_info[tile_idx].hfilter[p] -= s; + rsi[plane].wiener_info[tile_idx].hfilter[WIENER_WIN - p - 1] -= s; + rsi[plane].wiener_info[tile_idx].hfilter[WIENER_HALFWIN] += 2 * s; + } else { + err = err2; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); } - rsi[plane].restoration_type[tile_idx] = RESTORE_NONE; - } - // Cost for Wiener filtering - set_default_wiener(&ref_wiener_info); - bits = 0; - for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { - bits += - av1_cost_bit(RESTORE_NONE_WIENER_PROB, type[tile_idx] != RESTORE_NONE); - memcpy(&rsi[plane].wiener_info[tile_idx], &wiener_info[tile_idx], - sizeof(wiener_info[tile_idx])); - if (type[tile_idx] == RESTORE_WIENER) { - bits += - count_wiener_bits(&rsi[plane].wiener_info[tile_idx], &ref_wiener_info) - << AV1_PROB_COST_SHIFT; - memcpy(&ref_wiener_info, &rsi[plane].wiener_info[tile_idx], - sizeof(ref_wiener_info)); + for (int p = 0; p < WIENER_HALFWIN; ++p) { + int skip = 0; + do { + if (rsi[plane].wiener_info[tile_idx].vfilter[p] - s >= tap_min[p]) { + rsi[plane].wiener_info[tile_idx].vfilter[p] -= s; + rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] -= s; + rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] += 2 * s; + err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, + tile_idx, 0, 0, dst_frame); + if (err2 > err) { + rsi[plane].wiener_info[tile_idx].vfilter[p] += s; + rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] += s; + rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] -= 2 * s; + } else { + err = err2; + skip = 1; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + if (skip) break; + do { + if (rsi[plane].wiener_info[tile_idx].vfilter[p] + s <= tap_max[p]) { + rsi[plane].wiener_info[tile_idx].vfilter[p] += s; + rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] += s; + rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] -= 2 * s; + err2 = try_restoration_tile(src, cpi, rsi, 1 << plane, partial_frame, + tile_idx, 0, 0, dst_frame); + if (err2 > err) { + rsi[plane].wiener_info[tile_idx].vfilter[p] -= s; + rsi[plane].wiener_info[tile_idx].vfilter[WIENER_WIN - p - 1] -= s; + rsi[plane].wiener_info[tile_idx].vfilter[WIENER_HALFWIN] += 2 * s; + } else { + err = err2; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); } - rsi[plane].restoration_type[tile_idx] = type[tile_idx]; - } - err = try_restoration_frame(src, cpi, rsi, 1 << plane, partial_frame, - dst_frame); - cost_wiener_frame = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); - - if (cost_wiener_frame < cost_norestore_frame) { - info->frame_restoration_type = RESTORE_WIENER; - } else { - info->frame_restoration_type = RESTORE_NONE; } - - return info->frame_restoration_type == RESTORE_WIENER ? 
cost_wiener_frame - : cost_norestore_frame; +// printf("err post = %"PRId64"\n", err); +#endif // USE_WIENER_REFINEMENT_SEARCH + return err; } static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, - int partial_frame, RestorationInfo *info, + int partial_frame, int plane, RestorationInfo *info, RestorationType *type, double *best_tile_cost, YV12_BUFFER_CONFIG *dst_frame) { WienerInfo *wiener_info = info->wiener_info; @@ -977,38 +982,52 @@ static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, double H[WIENER_WIN2 * WIENER_WIN2]; double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN]; const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show; - const int width = cm->width; - const int height = cm->height; - const int src_stride = src->y_stride; - const int dgd_stride = dgd->y_stride; + int width, height, src_stride, dgd_stride; + uint8_t *dgd_buffer, *src_buffer; + if (plane == AOM_PLANE_Y) { + width = cm->width; + height = cm->height; + src_buffer = src->y_buffer; + src_stride = src->y_stride; + dgd_buffer = dgd->y_buffer; + dgd_stride = dgd->y_stride; + assert(width == dgd->y_crop_width); + assert(height == dgd->y_crop_height); + assert(width == src->y_crop_width); + assert(height == src->y_crop_height); + } else { + width = src->uv_crop_width; + height = src->uv_crop_height; + src_stride = src->uv_stride; + dgd_stride = dgd->uv_stride; + src_buffer = plane == AOM_PLANE_U ? src->u_buffer : src->v_buffer; + dgd_buffer = plane == AOM_PLANE_U ? dgd->u_buffer : dgd->v_buffer; + assert(width == dgd->uv_crop_width); + assert(height == dgd->uv_crop_height); + } double score; int tile_idx, tile_width, tile_height, nhtiles, nvtiles; int h_start, h_end, v_start, v_end; - const int ntiles = - av1_get_rest_ntiles(width, height, cm->rst_info[0].restoration_tilesize, - &tile_width, &tile_height, &nhtiles, &nvtiles); + const int ntiles = av1_get_rest_ntiles( + width, height, cm->rst_info[plane].restoration_tilesize, &tile_width, + &tile_height, &nhtiles, &nvtiles); WienerInfo ref_wiener_info; set_default_wiener(&ref_wiener_info); - assert(width == dgd->y_crop_width); - assert(height == dgd->y_crop_height); - assert(width == src->y_crop_width); - assert(height == src->y_crop_height); - - rsi->frame_restoration_type = RESTORE_WIENER; + rsi[plane].frame_restoration_type = RESTORE_WIENER; for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { - rsi->restoration_type[tile_idx] = RESTORE_NONE; + rsi[plane].restoration_type[tile_idx] = RESTORE_NONE; } // Construct a (WIENER_HALFWIN)-pixel border around the frame #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - extend_frame_highbd(CONVERT_TO_SHORTPTR(dgd->y_buffer), width, height, + extend_frame_highbd(CONVERT_TO_SHORTPTR(dgd_buffer), width, height, dgd_stride); else #endif - extend_frame(dgd->y_buffer, width, height, dgd_stride); + extend_frame(dgd_buffer, width, height, dgd_stride); // Compute best Wiener filters for each tile for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { @@ -1016,7 +1035,8 @@ static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, tile_height, width, height, 0, 0, &h_start, &h_end, &v_start, &v_end); err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start, - h_end - h_start, v_start, v_end - v_start, 1); + h_end - h_start, v_start, v_end - v_start, + (1 << plane)); // #bits when a tile is not restored bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0); cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); @@ -1027,12 +1047,12 @@ static double search_wiener(const 
YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, &v_start, &v_end); #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) - compute_stats_highbd(dgd->y_buffer, src->y_buffer, h_start, h_end, - v_start, v_end, dgd_stride, src_stride, M, H); + compute_stats_highbd(dgd_buffer, src_buffer, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H); else #endif // CONFIG_HIGHBITDEPTH - compute_stats(dgd->y_buffer, src->y_buffer, h_start, h_end, v_start, - v_end, dgd_stride, src_stride, M, H); + compute_stats(dgd_buffer, src_buffer, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); type[tile_idx] = RESTORE_WIENER; @@ -1040,108 +1060,129 @@ static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, type[tile_idx] = RESTORE_NONE; continue; } - quantize_sym_filter(vfilterd, rsi->wiener_info[tile_idx].vfilter); - quantize_sym_filter(hfilterd, rsi->wiener_info[tile_idx].hfilter); + quantize_sym_filter(vfilterd, rsi[plane].wiener_info[tile_idx].vfilter); + quantize_sym_filter(hfilterd, rsi[plane].wiener_info[tile_idx].hfilter); // Filter score computes the value of the function x'*A*x - x'*b for the // learned filter and compares it against identity filer. If there is no // reduction in the function, the filter is reverted back to identity - score = compute_score(M, H, rsi->wiener_info[tile_idx].vfilter, - rsi->wiener_info[tile_idx].hfilter); + score = compute_score(M, H, rsi[plane].wiener_info[tile_idx].vfilter, + rsi[plane].wiener_info[tile_idx].hfilter); if (score > 0.0) { type[tile_idx] = RESTORE_NONE; continue; } + aom_clear_system_state(); - rsi->restoration_type[tile_idx] = RESTORE_WIENER; - err = try_restoration_tile(src, cpi, rsi, 1, partial_frame, tile_idx, 0, 0, - dst_frame); - bits = count_wiener_bits(&rsi->wiener_info[tile_idx], &ref_wiener_info) - << AV1_PROB_COST_SHIFT; + rsi[plane].restoration_type[tile_idx] = RESTORE_WIENER; + err = finer_tile_search_wiener(src, cpi, rsi, 4, plane, tile_idx, + partial_frame, dst_frame); + bits = + count_wiener_bits(&rsi[plane].wiener_info[tile_idx], &ref_wiener_info) + << AV1_PROB_COST_SHIFT; bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1); cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); if (cost_wiener >= cost_norestore) { type[tile_idx] = RESTORE_NONE; } else { type[tile_idx] = RESTORE_WIENER; - memcpy(&wiener_info[tile_idx], &rsi->wiener_info[tile_idx], + memcpy(&wiener_info[tile_idx], &rsi[plane].wiener_info[tile_idx], sizeof(wiener_info[tile_idx])); - memcpy(&ref_wiener_info, &rsi->wiener_info[tile_idx], + memcpy(&ref_wiener_info, &rsi[plane].wiener_info[tile_idx], sizeof(ref_wiener_info)); - bits = count_wiener_bits(&wiener_info[tile_idx], &ref_wiener_info) - << AV1_PROB_COST_SHIFT; best_tile_cost[tile_idx] = err; } - rsi->restoration_type[tile_idx] = RESTORE_NONE; + rsi[plane].restoration_type[tile_idx] = RESTORE_NONE; } // Cost for Wiener filtering set_default_wiener(&ref_wiener_info); - bits = frame_level_restore_bits[rsi->frame_restoration_type] + bits = frame_level_restore_bits[rsi[plane].frame_restoration_type] << AV1_PROB_COST_SHIFT; for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, type[tile_idx] != RESTORE_NONE); - memcpy(&rsi->wiener_info[tile_idx], &wiener_info[tile_idx], + memcpy(&rsi[plane].wiener_info[tile_idx], &wiener_info[tile_idx], sizeof(wiener_info[tile_idx])); if (type[tile_idx] == RESTORE_WIENER) { - bits += count_wiener_bits(&rsi->wiener_info[tile_idx], &ref_wiener_info) - << AV1_PROB_COST_SHIFT; - memcpy(&ref_wiener_info, 
&rsi->wiener_info[tile_idx], + bits += + count_wiener_bits(&rsi[plane].wiener_info[tile_idx], &ref_wiener_info) + << AV1_PROB_COST_SHIFT; + memcpy(&ref_wiener_info, &rsi[plane].wiener_info[tile_idx], sizeof(ref_wiener_info)); } - rsi->restoration_type[tile_idx] = type[tile_idx]; + rsi[plane].restoration_type[tile_idx] = type[tile_idx]; } - err = try_restoration_frame(src, cpi, rsi, 1, partial_frame, dst_frame); + err = try_restoration_frame(src, cpi, rsi, 1 << plane, partial_frame, + dst_frame); cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); return cost_wiener; } static double search_norestore(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, - int partial_frame, RestorationInfo *info, - RestorationType *type, double *best_tile_cost, + int partial_frame, int plane, + RestorationInfo *info, RestorationType *type, + double *best_tile_cost, YV12_BUFFER_CONFIG *dst_frame) { - double err, cost_norestore; + int64_t err; + double cost_norestore; int bits; MACROBLOCK *x = &cpi->td.mb; AV1_COMMON *const cm = &cpi->common; int tile_idx, tile_width, tile_height, nhtiles, nvtiles; int h_start, h_end, v_start, v_end; + int width, height; + if (plane == AOM_PLANE_Y) { + width = cm->width; + height = cm->height; + } else { + width = src->uv_crop_width; + height = src->uv_crop_height; + } const int ntiles = av1_get_rest_ntiles( - cm->width, cm->height, cm->rst_info[0].restoration_tilesize, &tile_width, + width, height, cm->rst_info[plane].restoration_tilesize, &tile_width, &tile_height, &nhtiles, &nvtiles); (void)info; (void)dst_frame; (void)partial_frame; + info->frame_restoration_type = RESTORE_NONE; for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width, - tile_height, cm->width, cm->height, 0, 0, &h_start, - &h_end, &v_start, &v_end); + tile_height, width, height, 0, 0, &h_start, &h_end, + &v_start, &v_end); err = sse_restoration_tile(src, cm->frame_to_show, cm, h_start, - h_end - h_start, v_start, v_end - v_start, 1); + h_end - h_start, v_start, v_end - v_start, + 1 << plane); type[tile_idx] = RESTORE_NONE; best_tile_cost[tile_idx] = err; } // RD cost associated with no restoration - err = sse_restoration_tile(src, cm->frame_to_show, cm, 0, cm->width, 0, - cm->height, 1); + err = sse_restoration_frame(cm, src, cm->frame_to_show, (1 << plane)); bits = frame_level_restore_bits[RESTORE_NONE] << AV1_PROB_COST_SHIFT; cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); return cost_norestore; } static double search_switchable_restoration( - AV1_COMP *cpi, int partial_frame, RestorationInfo *rsi, + AV1_COMP *cpi, int partial_frame, int plane, RestorationInfo *rsi, double *tile_cost[RESTORE_SWITCHABLE_TYPES]) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *x = &cpi->td.mb; double cost_switchable = 0; int bits, tile_idx; RestorationType r; - const int ntiles = av1_get_rest_ntiles(cm->width, cm->height, - cm->rst_info[0].restoration_tilesize, - NULL, NULL, NULL, NULL); + int width, height; + if (plane == AOM_PLANE_Y) { + width = cm->width; + height = cm->height; + } else { + width = ROUND_POWER_OF_TWO(cm->width, cm->subsampling_x); + height = ROUND_POWER_OF_TWO(cm->height, cm->subsampling_y); + } + const int ntiles = av1_get_rest_ntiles( + width, height, cm->rst_info[plane].restoration_tilesize, NULL, NULL, NULL, + NULL); SgrprojInfo ref_sgrproj_info; set_default_sgrproj(&ref_sgrproj_info); WienerInfo ref_wiener_info; @@ -1203,57 +1244,60 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP 
*cpi, double best_cost_restore; RestorationType r, best_restore; - const int ntiles = av1_get_rest_ntiles(cm->width, cm->height, - cm->rst_info[0].restoration_tilesize, - NULL, NULL, NULL, NULL); + const int ntiles_y = av1_get_rest_ntiles(cm->width, cm->height, + cm->rst_info[0].restoration_tilesize, + NULL, NULL, NULL, NULL); + const int ntiles_uv = av1_get_rest_ntiles( + ROUND_POWER_OF_TWO(cm->width, cm->subsampling_x), + ROUND_POWER_OF_TWO(cm->height, cm->subsampling_y), + cm->rst_info[1].restoration_tilesize, NULL, NULL, NULL, NULL); + // Assume ntiles_uv is never larger than ntiles_y and so the same arrays work. for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++) { - tile_cost[r] = (double *)aom_malloc(sizeof(*tile_cost[0]) * ntiles); + tile_cost[r] = (double *)aom_malloc(sizeof(*tile_cost[0]) * ntiles_y); restore_types[r] = - (RestorationType *)aom_malloc(sizeof(*restore_types[0]) * ntiles); - } - - for (r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) { - if (force_restore_type != 0) - if (r != RESTORE_NONE && r != force_restore_type) continue; - cost_restore[r] = search_restore_fun[r]( - src, cpi, method == LPF_PICK_FROM_SUBIMAGE, &cm->rst_info[0], - restore_types[r], tile_cost[r], &cpi->trial_frame_rst); + (RestorationType *)aom_malloc(sizeof(*restore_types[0]) * ntiles_y); } - cost_restore[RESTORE_SWITCHABLE] = search_switchable_restoration( - cpi, method == LPF_PICK_FROM_SUBIMAGE, &cm->rst_info[0], tile_cost); - best_cost_restore = DBL_MAX; - best_restore = 0; - for (r = 0; r < RESTORE_TYPES; ++r) { + for (int plane = AOM_PLANE_Y; plane <= AOM_PLANE_V; ++plane) { + for (r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) { + cost_restore[r] = DBL_MAX; + if (force_restore_type != 0) + if (r != RESTORE_NONE && r != force_restore_type) continue; + cost_restore[r] = + search_restore_fun[r](src, cpi, method == LPF_PICK_FROM_SUBIMAGE, + plane, &cm->rst_info[plane], restore_types[r], + tile_cost[r], &cpi->trial_frame_rst); + } + if (plane == AOM_PLANE_Y) + cost_restore[RESTORE_SWITCHABLE] = + search_switchable_restoration(cpi, method == LPF_PICK_FROM_SUBIMAGE, + plane, &cm->rst_info[plane], tile_cost); + else + cost_restore[RESTORE_SWITCHABLE] = DBL_MAX; + best_cost_restore = DBL_MAX; + best_restore = 0; + for (r = 0; r < RESTORE_TYPES; ++r) { + if (force_restore_type != 0) + if (r != RESTORE_NONE && r != force_restore_type) continue; + if (cost_restore[r] < best_cost_restore) { + best_restore = r; + best_cost_restore = cost_restore[r]; + } + } + cm->rst_info[plane].frame_restoration_type = best_restore; if (force_restore_type != 0) - if (r != RESTORE_NONE && r != force_restore_type) continue; - if (cost_restore[r] < best_cost_restore) { - best_restore = r; - best_cost_restore = cost_restore[r]; + assert(best_restore == force_restore_type || + best_restore == RESTORE_NONE); + if (best_restore != RESTORE_SWITCHABLE) { + const int nt = (plane == AOM_PLANE_Y ? 
ntiles_y : ntiles_uv); + memcpy(cm->rst_info[plane].restoration_type, restore_types[best_restore], + nt * sizeof(restore_types[best_restore][0])); } } - cm->rst_info[0].frame_restoration_type = best_restore; - if (force_restore_type != 0) - assert(best_restore == force_restore_type || best_restore == RESTORE_NONE); - if (best_restore != RESTORE_SWITCHABLE) { - memcpy(cm->rst_info[0].restoration_type, restore_types[best_restore], - ntiles * sizeof(restore_types[best_restore][0])); - } - - // Color components - search_wiener_uv(src, cpi, method == LPF_PICK_FROM_SUBIMAGE, AOM_PLANE_U, - &cm->rst_info[AOM_PLANE_U], - cm->rst_info[AOM_PLANE_U].restoration_type, - &cpi->trial_frame_rst); - search_wiener_uv(src, cpi, method == LPF_PICK_FROM_SUBIMAGE, AOM_PLANE_V, - &cm->rst_info[AOM_PLANE_V], - cm->rst_info[AOM_PLANE_V].restoration_type, - &cpi->trial_frame_rst); /* - printf("Frame %d/%d restore types: %d %d %d\n", - cm->current_video_frame, cm->show_frame, - cm->rst_info[0].frame_restoration_type, + printf("Frame %d/%d restore types: %d %d %d\n", cm->current_video_frame, + cm->show_frame, cm->rst_info[0].frame_restoration_type, cm->rst_info[1].frame_restoration_type, cm->rst_info[2].frame_restoration_type); printf("Frame %d/%d frame_restore_type %d : %f %f %f %f\n", diff --git a/third_party/aom/av1/encoder/pvq_encoder.c b/third_party/aom/av1/encoder/pvq_encoder.c index ab63f1b7d..9d5133012 100644 --- a/third_party/aom/av1/encoder/pvq_encoder.c +++ b/third_party/aom/av1/encoder/pvq_encoder.c @@ -247,23 +247,23 @@ static double od_pvq_rate(int qg, int icgr, int theta, int ts, aom_writer w; od_pvq_codeword_ctx cd; int tell; -#if CONFIG_DAALA_EC +#if !CONFIG_ANS od_ec_enc_init(&w.ec, 1000); #else -# error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +# error "CONFIG_PVQ currently requires !CONFIG_ANS." #endif OD_COPY(&cd, &adapt->pvq.pvq_codeword_ctx, 1); -#if CONFIG_DAALA_EC +#if !CONFIG_ANS tell = od_ec_enc_tell_frac(&w.ec); #else -# error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +# error "CONFIG_PVQ currently requires !CONFIG_ANS." #endif aom_encode_pvq_codeword(&w, &cd, y0, n - (theta != -1), k); -#if CONFIG_DAALA_EC +#if !CONFIG_ANS rate = (od_ec_enc_tell_frac(&w.ec)-tell)/8.; od_ec_enc_clear(&w.ec); #else -# error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +# error "CONFIG_PVQ currently requires !CONFIG_ANS." #endif } if (qg > 0 && theta >= 0) { @@ -847,22 +847,22 @@ PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc, int tell2; od_rollback_buffer dc_buf; - dc_rate = -OD_LOG2((double)(skip_cdf[3] - skip_cdf[2])/ - (double)(skip_cdf[2] - skip_cdf[1])); + dc_rate = -OD_LOG2((double)(OD_ICDF(skip_cdf[3]) - OD_ICDF(skip_cdf[2]))/ + (double)(OD_ICDF(skip_cdf[2]) - OD_ICDF(skip_cdf[1]))); dc_rate += 1; -#if CONFIG_DAALA_EC +#if !CONFIG_ANS tell2 = od_ec_enc_tell_frac(&enc->w.ec); #else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#error "CONFIG_PVQ currently requires !CONFIG_ANS." #endif od_encode_checkpoint(enc, &dc_buf); generic_encode(&enc->w, &enc->state.adapt->model_dc[pli], n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2); -#if CONFIG_DAALA_EC +#if !CONFIG_ANS tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2; #else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#error "CONFIG_PVQ currently requires !CONFIG_ANS." 
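A minimal, generic sketch of what the dc_rate and skip_rate expressions in these pvq_encoder.c hunks compute: a symbol's cost in bits is -log2 of a probability read off the skip CDF, with the table reads now wrapped in OD_ICDF (presumably to account for a change in how the CDF is stored). The helper below is illustrative only and is not the library's macros:

#include <math.h>

/* Cost in bits of symbol s given cumulative frequencies cum[0..n],
 * where cum[0] == 0 and cum[n] is the total count. */
static double symbol_rate_bits(const unsigned *cum, int s, int n) {
  const double p = (double)(cum[s + 1] - cum[s]) / (double)cum[n];
  return -log2(p); /* e.g. p == 0.25 -> 2.0 bits */
}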
#endif dc_rate += tell2/8.0; od_encode_rollback(enc, &dc_buf); @@ -871,10 +871,10 @@ PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc, enc->pvq_norm_lambda); } } -#if CONFIG_DAALA_EC +#if !CONFIG_ANS tell = od_ec_enc_tell_frac(&enc->w.ec); #else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#error "CONFIG_PVQ currently requires !CONFIG_ANS." #endif /* Code as if we're not skipping. */ aom_write_symbol(&enc->w, 2 + (out[0] != 0), skip_cdf, 4); @@ -921,22 +921,22 @@ PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc, } if (encode_flip) cfl_encoded = 1; } -#if CONFIG_DAALA_EC +#if !CONFIG_ANS tell = od_ec_enc_tell_frac(&enc->w.ec) - tell; #else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#error "CONFIG_PVQ currently requires !CONFIG_ANS." #endif /* Account for the rate of skipping the AC, based on the same DC decision we made when trying to not skip AC. */ { double skip_rate; if (out[0] != 0) { - skip_rate = -OD_LOG2((skip_cdf[1] - skip_cdf[0])/ - (double)skip_cdf[3]); + skip_rate = -OD_LOG2((OD_ICDF(skip_cdf[1]) - OD_ICDF(skip_cdf[0]))/ + (double)OD_ICDF(skip_cdf[3])); } else { - skip_rate = -OD_LOG2(skip_cdf[0]/ - (double)skip_cdf[3]); + skip_rate = -OD_LOG2(OD_ICDF(skip_cdf[0])/ + (double)OD_ICDF(skip_cdf[3])); } tell -= (int)floor(.5+8*skip_rate); } @@ -951,22 +951,22 @@ PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc, int tell2; od_rollback_buffer dc_buf; - dc_rate = -OD_LOG2((double)(skip_cdf[1] - skip_cdf[0])/ - (double)skip_cdf[0]); + dc_rate = -OD_LOG2((double)(OD_ICDF(skip_cdf[1]) - OD_ICDF(skip_cdf[0]))/ + (double)OD_ICDF(skip_cdf[0])); dc_rate += 1; -#if CONFIG_DAALA_EC +#if !CONFIG_ANS tell2 = od_ec_enc_tell_frac(&enc->w.ec); #else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#error "CONFIG_PVQ currently requires !CONFIG_ANS." #endif od_encode_checkpoint(enc, &dc_buf); generic_encode(&enc->w, &enc->state.adapt->model_dc[pli], n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2); -#if CONFIG_DAALA_EC +#if !CONFIG_ANS tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2; #else -#error "CONFIG_PVQ currently requires CONFIG_DAALA_EC." +#error "CONFIG_PVQ currently requires !CONFIG_ANS." #endif dc_rate += tell2/8.0; od_encode_rollback(enc, &dc_buf); diff --git a/third_party/aom/av1/encoder/ransac.c b/third_party/aom/av1/encoder/ransac.c index 5d5dd7572..bbd2d179c 100644 --- a/third_party/aom/av1/encoder/ransac.c +++ b/third_party/aom/av1/encoder/ransac.c @@ -8,7 +8,6 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#define _POSIX_C_SOURCE 200112L // rand_r() #include <memory.h> #include <math.h> #include <time.h> @@ -17,6 +16,7 @@ #include <assert.h> #include "av1/encoder/ransac.h" +#include "av1/encoder/mathutils.h" #define MAX_MINPTS 4 #define MAX_DEGENERATE_ITER 10 @@ -133,309 +133,6 @@ static void project_points_double_homography(double *mat, double *points, } } -/////////////////////////////////////////////////////////////////////////////// -// svdcmp -// Adopted from Numerical Recipes in C - -static const double TINY_NEAR_ZERO = 1.0E-12; - -static INLINE double sign(double a, double b) { - return ((b) >= 0 ? fabs(a) : -fabs(a)); -} - -static INLINE double pythag(double a, double b) { - double ct; - const double absa = fabs(a); - const double absb = fabs(b); - - if (absa > absb) { - ct = absb / absa; - return absa * sqrt(1.0 + ct * ct); - } else { - ct = absa / absb; - return (absb == 0) ? 
0 : absb * sqrt(1.0 + ct * ct); - } -} - -static void multiply_mat(const double *m1, const double *m2, double *res, - const int m1_rows, const int inner_dim, - const int m2_cols) { - double sum; - - int row, col, inner; - for (row = 0; row < m1_rows; ++row) { - for (col = 0; col < m2_cols; ++col) { - sum = 0; - for (inner = 0; inner < inner_dim; ++inner) - sum += m1[row * inner_dim + inner] * m2[inner * m2_cols + col]; - *(res++) = sum; - } - } -} - -static int svdcmp(double **u, int m, int n, double w[], double **v) { - const int max_its = 30; - int flag, i, its, j, jj, k, l, nm; - double anorm, c, f, g, h, s, scale, x, y, z; - double *rv1 = (double *)aom_malloc(sizeof(*rv1) * (n + 1)); - g = scale = anorm = 0.0; - for (i = 0; i < n; i++) { - l = i + 1; - rv1[i] = scale * g; - g = s = scale = 0.0; - if (i < m) { - for (k = i; k < m; k++) scale += fabs(u[k][i]); - if (scale != 0.) { - for (k = i; k < m; k++) { - u[k][i] /= scale; - s += u[k][i] * u[k][i]; - } - f = u[i][i]; - g = -sign(sqrt(s), f); - h = f * g - s; - u[i][i] = f - g; - for (j = l; j < n; j++) { - for (s = 0.0, k = i; k < m; k++) s += u[k][i] * u[k][j]; - f = s / h; - for (k = i; k < m; k++) u[k][j] += f * u[k][i]; - } - for (k = i; k < m; k++) u[k][i] *= scale; - } - } - w[i] = scale * g; - g = s = scale = 0.0; - if (i < m && i != n - 1) { - for (k = l; k < n; k++) scale += fabs(u[i][k]); - if (scale != 0.) { - for (k = l; k < n; k++) { - u[i][k] /= scale; - s += u[i][k] * u[i][k]; - } - f = u[i][l]; - g = -sign(sqrt(s), f); - h = f * g - s; - u[i][l] = f - g; - for (k = l; k < n; k++) rv1[k] = u[i][k] / h; - for (j = l; j < m; j++) { - for (s = 0.0, k = l; k < n; k++) s += u[j][k] * u[i][k]; - for (k = l; k < n; k++) u[j][k] += s * rv1[k]; - } - for (k = l; k < n; k++) u[i][k] *= scale; - } - } - anorm = fmax(anorm, (fabs(w[i]) + fabs(rv1[i]))); - } - - for (i = n - 1; i >= 0; i--) { - if (i < n - 1) { - if (g != 0.) { - for (j = l; j < n; j++) v[j][i] = (u[i][j] / u[i][l]) / g; - for (j = l; j < n; j++) { - for (s = 0.0, k = l; k < n; k++) s += u[i][k] * v[k][j]; - for (k = l; k < n; k++) v[k][j] += s * v[k][i]; - } - } - for (j = l; j < n; j++) v[i][j] = v[j][i] = 0.0; - } - v[i][i] = 1.0; - g = rv1[i]; - l = i; - } - for (i = AOMMIN(m, n) - 1; i >= 0; i--) { - l = i + 1; - g = w[i]; - for (j = l; j < n; j++) u[i][j] = 0.0; - if (g != 0.) 
{ - g = 1.0 / g; - for (j = l; j < n; j++) { - for (s = 0.0, k = l; k < m; k++) s += u[k][i] * u[k][j]; - f = (s / u[i][i]) * g; - for (k = i; k < m; k++) u[k][j] += f * u[k][i]; - } - for (j = i; j < m; j++) u[j][i] *= g; - } else { - for (j = i; j < m; j++) u[j][i] = 0.0; - } - ++u[i][i]; - } - for (k = n - 1; k >= 0; k--) { - for (its = 0; its < max_its; its++) { - flag = 1; - for (l = k; l >= 0; l--) { - nm = l - 1; - if ((double)(fabs(rv1[l]) + anorm) == anorm || nm < 0) { - flag = 0; - break; - } - if ((double)(fabs(w[nm]) + anorm) == anorm) break; - } - if (flag) { - c = 0.0; - s = 1.0; - for (i = l; i <= k; i++) { - f = s * rv1[i]; - rv1[i] = c * rv1[i]; - if ((double)(fabs(f) + anorm) == anorm) break; - g = w[i]; - h = pythag(f, g); - w[i] = h; - h = 1.0 / h; - c = g * h; - s = -f * h; - for (j = 0; j < m; j++) { - y = u[j][nm]; - z = u[j][i]; - u[j][nm] = y * c + z * s; - u[j][i] = z * c - y * s; - } - } - } - z = w[k]; - if (l == k) { - if (z < 0.0) { - w[k] = -z; - for (j = 0; j < n; j++) v[j][k] = -v[j][k]; - } - break; - } - if (its == max_its - 1) { - aom_free(rv1); - return 1; - } - assert(k > 0); - x = w[l]; - nm = k - 1; - y = w[nm]; - g = rv1[nm]; - h = rv1[k]; - f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y); - g = pythag(f, 1.0); - f = ((x - z) * (x + z) + h * ((y / (f + sign(g, f))) - h)) / x; - c = s = 1.0; - for (j = l; j <= nm; j++) { - i = j + 1; - g = rv1[i]; - y = w[i]; - h = s * g; - g = c * g; - z = pythag(f, h); - rv1[j] = z; - c = f / z; - s = h / z; - f = x * c + g * s; - g = g * c - x * s; - h = y * s; - y *= c; - for (jj = 0; jj < n; jj++) { - x = v[jj][j]; - z = v[jj][i]; - v[jj][j] = x * c + z * s; - v[jj][i] = z * c - x * s; - } - z = pythag(f, h); - w[j] = z; - if (z != 0.) { - z = 1.0 / z; - c = f * z; - s = h * z; - } - f = c * g + s * y; - x = c * y - s * g; - for (jj = 0; jj < m; jj++) { - y = u[jj][j]; - z = u[jj][i]; - u[jj][j] = y * c + z * s; - u[jj][i] = z * c - y * s; - } - } - rv1[l] = 0.0; - rv1[k] = f; - w[k] = x; - } - } - aom_free(rv1); - return 0; -} - -static int SVD(double *U, double *W, double *V, double *matx, int M, int N) { - // Assumes allocation for U is MxN - double **nrU = (double **)aom_malloc((M) * sizeof(*nrU)); - double **nrV = (double **)aom_malloc((N) * sizeof(*nrV)); - int problem, i; - - problem = !(nrU && nrV); - if (!problem) { - for (i = 0; i < M; i++) { - nrU[i] = &U[i * N]; - } - for (i = 0; i < N; i++) { - nrV[i] = &V[i * N]; - } - } else { - if (nrU) aom_free(nrU); - if (nrV) aom_free(nrV); - return 1; - } - - /* copy from given matx into nrU */ - for (i = 0; i < M; i++) { - memcpy(&(nrU[i][0]), matx + N * i, N * sizeof(*matx)); - } - - /* HERE IT IS: do SVD */ - if (svdcmp(nrU, M, N, W, nrV)) { - aom_free(nrU); - aom_free(nrV); - return 1; - } - - /* aom_free Numerical Recipes arrays */ - aom_free(nrU); - aom_free(nrV); - - return 0; -} - -int pseudo_inverse(double *inv, double *matx, const int M, const int N) { - double ans; - int i, j, k; - double *const U = (double *)aom_malloc(M * N * sizeof(*matx)); - double *const W = (double *)aom_malloc(N * sizeof(*matx)); - double *const V = (double *)aom_malloc(N * N * sizeof(*matx)); - - if (!(U && W && V)) { - return 1; - } - if (SVD(U, W, V, matx, M, N)) { - aom_free(U); - aom_free(W); - aom_free(V); - return 1; - } - for (i = 0; i < N; i++) { - if (fabs(W[i]) < TINY_NEAR_ZERO) { - aom_free(U); - aom_free(W); - aom_free(V); - return 1; - } - } - - for (i = 0; i < N; i++) { - for (j = 0; j < M; j++) { - ans = 0; - for (k = 0; k < N; k++) { - ans += V[k 
+ N * i] * U[k + N * j] / W[k]; - } - inv[j + M * i] = ans; - } - } - aom_free(U); - aom_free(W); - aom_free(V); - return 0; -} - static void normalize_homography(double *pts, int n, double *T) { double *p = pts; double mean[2] = { 0, 0 }; @@ -597,7 +294,7 @@ static int find_translation(int np, double *pts1, double *pts2, double *mat) { static int find_rotzoom(int np, double *pts1, double *pts2, double *mat) { const int np2 = np * 2; - double *a = (double *)aom_malloc(sizeof(*a) * np2 * 9); + double *a = (double *)aom_malloc(sizeof(*a) * (np2 * 5 + 20)); double *b = a + np2 * 4; double *temp = b + np2; int i; @@ -625,11 +322,10 @@ static int find_rotzoom(int np, double *pts1, double *pts2, double *mat) { b[2 * i] = dx; b[2 * i + 1] = dy; } - if (pseudo_inverse(temp, a, np2, 4)) { + if (!least_squares(4, a, np2, 4, b, temp, mat)) { aom_free(a); return 1; } - multiply_mat(temp, b, mat, 4, np2, 1); denormalize_rotzoom_reorder(mat, T1, T2); aom_free(a); return 0; @@ -637,7 +333,7 @@ static int find_rotzoom(int np, double *pts1, double *pts2, double *mat) { static int find_affine(int np, double *pts1, double *pts2, double *mat) { const int np2 = np * 2; - double *a = (double *)aom_malloc(sizeof(*a) * np2 * 13); + double *a = (double *)aom_malloc(sizeof(*a) * (np2 * 7 + 42)); double *b = a + np2 * 6; double *temp = b + np2; int i; @@ -669,11 +365,10 @@ static int find_affine(int np, double *pts1, double *pts2, double *mat) { b[2 * i] = dx; b[2 * i + 1] = dy; } - if (pseudo_inverse(temp, a, np2, 6)) { + if (!least_squares(6, a, np2, 6, b, temp, mat)) { aom_free(a); return 1; } - multiply_mat(temp, b, mat, 6, np2, 1); denormalize_affine_reorder(mat, T1, T2); aom_free(a); return 0; @@ -890,16 +585,22 @@ static int find_homography(int np, double *pts1, double *pts2, double *mat) { return 0; } +// Generate a random number in the range [0, 32768). +static unsigned int lcg_rand16(unsigned int *state) { + *state = (unsigned int)(*state * 1103515245ULL + 12345); + return *state / 65536 % 32768; +} + static int get_rand_indices(int npoints, int minpts, int *indices, unsigned int *seed) { int i, j; - int ptr = rand_r(seed) % npoints; + int ptr = lcg_rand16(seed) % npoints; if (minpts > npoints) return 0; indices[0] = ptr; ptr = (ptr == npoints - 1 ? 0 : ptr + 1); i = 1; while (i < minpts) { - int index = rand_r(seed) % npoints; + int index = lcg_rand16(seed) % npoints; while (index) { ptr = (ptr == npoints - 1 ? 
0 : ptr + 1); for (j = 0; j < i; ++j) { @@ -986,6 +687,9 @@ static int ransac(const int *matched_points, int npoints, double *cnp1, *cnp2; + for (i = 0; i < num_desired_motions; ++i) { + num_inliers_by_motion[i] = 0; + } if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) { return 1; } @@ -1072,7 +776,7 @@ static int ransac(const int *matched_points, int npoints, if (current_motion.num_inliers >= worst_kept_motion->num_inliers && current_motion.num_inliers > 1) { int temp; - double fracinliers, pNoOutliers, mean_distance; + double fracinliers, pNoOutliers, mean_distance, dtemp; mean_distance = sum_distance / ((double)current_motion.num_inliers); current_motion.variance = sum_distance_squared / ((double)current_motion.num_inliers - 1.0) - @@ -1092,7 +796,10 @@ static int ransac(const int *matched_points, int npoints, pNoOutliers = 1 - pow(fracinliers, minpts); pNoOutliers = fmax(EPS, pNoOutliers); pNoOutliers = fmin(1 - EPS, pNoOutliers); - temp = (int)(log(1.0 - PROBABILITY_REQUIRED) / log(pNoOutliers)); + dtemp = log(1.0 - PROBABILITY_REQUIRED) / log(pNoOutliers); + temp = (dtemp > (double)INT32_MAX) + ? INT32_MAX + : dtemp < (double)INT32_MIN ? INT32_MIN : (int)dtemp; if (temp > 0 && temp < N) { N = AOMMAX(temp, MIN_TRIALS); diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c index 1f2ea3606..4552c674e 100644 --- a/third_party/aom/av1/encoder/ratectrl.c +++ b/third_party/aom/av1/encoder/ratectrl.c @@ -93,6 +93,11 @@ static int gf_low = 400; static int kf_high = 5000; static int kf_low = 400; +double av1_resize_rate_factor(const AV1_COMP *cpi) { + return (double)(cpi->resize_scale_den * cpi->resize_scale_den) / + (cpi->resize_scale_num * cpi->resize_scale_num); +} + // Functions to compute the active minq lookup table entries based on a // formulaic approach to facilitate easier adjustment of the Q tables. // The formulae were derived from computing a 3rd order polynomial best @@ -384,7 +389,7 @@ static double get_rate_correction_factor(const AV1_COMP *cpi) { else rcf = rc->rate_correction_factors[INTER_NORMAL]; } - rcf *= rcf_mult[rc->frame_size_selector]; + rcf *= av1_resize_rate_factor(cpi); return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR); } @@ -392,7 +397,7 @@ static void set_rate_correction_factor(AV1_COMP *cpi, double factor) { RATE_CONTROL *const rc = &cpi->rc; // Normalize RCF to account for the size-dependent scaling factor. - factor /= rcf_mult[cpi->rc.frame_size_selector]; + factor /= av1_resize_rate_factor(cpi); factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR); @@ -1076,7 +1081,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int *bottom_index, } // Modify active_best_quality for downscaled normal frames. - if (rc->frame_size_selector != UNSCALED && !frame_is_kf_gf_arf(cpi)) { + if (!av1_resize_unscaled(cpi) && !frame_is_kf_gf_arf(cpi)) { int qdelta = av1_compute_qdelta_by_rate( rc, cm->frame_type, active_best_quality, 2.0, cm->bit_depth); active_best_quality = @@ -1158,11 +1163,10 @@ void av1_rc_set_frame_target(AV1_COMP *cpi, int target) { rc->this_frame_target = target; - // Modify frame size target when down-scaling. - if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && - rc->frame_size_selector != UNSCALED) - rc->this_frame_target = (int)(rc->this_frame_target * - rate_thresh_mult[rc->frame_size_selector]); + // Modify frame size target when down-scaled. 
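As a quick sanity check on the av1_resize_rate_factor() replacement introduced above, here is the same arithmetic as a stand-alone sketch with hypothetical scale values (not part of the patch). The function returns how many times fewer pixels the scaled frame contains, which is what the rate-correction and frame-target code in these hunks multiplies by:

/* Same formula as av1_resize_rate_factor(), taking num/den as the
 * scaled-to-source linear ratio. */
static double resize_rate_factor(int num, int den) {
  return (double)(den * den) / (num * num);
}
/* Hypothetical example: half width and half height (num = 1, den = 2)
 * gives 4.0, i.e. a quarter of the pixels to encode. */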
+ if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && !av1_resize_unscaled(cpi)) + rc->this_frame_target = + (int)(rc->this_frame_target * av1_resize_rate_factor(cpi)); // Target rate per SB64 (including partial SB64s. rc->sb64_target_rate = (int)((int64_t)rc->this_frame_target * 64 * 64) / @@ -1225,7 +1229,6 @@ static void update_golden_frame_stats(AV1_COMP *cpi) { void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { const AV1_COMMON *const cm = &cpi->common; - const AV1EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; const int qindex = cm->base_qindex; @@ -1317,13 +1320,6 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { rc->frames_since_key++; rc->frames_to_key--; } - - // Trigger the resizing of the next frame if it is scaled. - if (oxcf->pass != 0) { - cpi->resize_pending = - rc->next_frame_size_selector != rc->frame_size_selector; - rc->frame_size_selector = rc->next_frame_size_selector; - } } void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) { @@ -1501,10 +1497,7 @@ void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) { target = calc_pframe_target_size_one_pass_cbr(cpi); av1_rc_set_frame_target(cpi, target); - if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC) - cpi->resize_pending = av1_resize_one_pass_cbr(cpi); - else - cpi->resize_pending = 0; + // TODO(afergs): Decide whether to scale up, down, or not at all } int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, @@ -1670,90 +1663,3 @@ void av1_set_target_rate(AV1_COMP *cpi) { vbr_rate_correction(cpi, &target_rate); av1_rc_set_frame_target(cpi, target_rate); } - -// Check if we should resize, based on average QP from past x frames. -// Only allow for resize at most one scale down for now, scaling factor is 2. -int av1_resize_one_pass_cbr(AV1_COMP *cpi) { - const AV1_COMMON *const cm = &cpi->common; - RATE_CONTROL *const rc = &cpi->rc; - int resize_now = 0; - cpi->resize_scale_num = 1; - cpi->resize_scale_den = 1; - // Don't resize on key frame; reset the counters on key frame. - if (cm->frame_type == KEY_FRAME) { - cpi->resize_avg_qp = 0; - cpi->resize_count = 0; - return 0; - } - // Resize based on average buffer underflow and QP over some window. - // Ignore samples close to key frame, since QP is usually high after key. - if (cpi->rc.frames_since_key > 2 * cpi->framerate) { - const int window = (int)(5 * cpi->framerate); - cpi->resize_avg_qp += cm->base_qindex; - if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100)) - ++cpi->resize_buffer_underflow; - ++cpi->resize_count; - // Check for resize action every "window" frames. - if (cpi->resize_count >= window) { - int avg_qp = cpi->resize_avg_qp / cpi->resize_count; - // Resize down if buffer level has underflowed sufficent amount in past - // window, and we are at original resolution. - // Resize back up if average QP is low, and we are currently in a resized - // down state. - if (cpi->resize_state == 0 && - cpi->resize_buffer_underflow > (cpi->resize_count >> 2)) { - resize_now = 1; - cpi->resize_state = 1; - } else if (cpi->resize_state == 1 && - avg_qp < 40 * cpi->rc.worst_quality / 100) { - resize_now = -1; - cpi->resize_state = 0; - } - // Reset for next window measurement. 
- cpi->resize_avg_qp = 0; - cpi->resize_count = 0; - cpi->resize_buffer_underflow = 0; - } - } - // If decision is to resize, reset some quantities, and check is we should - // reduce rate correction factor, - if (resize_now != 0) { - int target_bits_per_frame; - int active_worst_quality; - int qindex; - int tot_scale_change; - // For now, resize is by 1/2 x 1/2. - cpi->resize_scale_num = 1; - cpi->resize_scale_den = 2; - tot_scale_change = (cpi->resize_scale_den * cpi->resize_scale_den) / - (cpi->resize_scale_num * cpi->resize_scale_num); - // Reset buffer level to optimal, update target size. - rc->buffer_level = rc->optimal_buffer_level; - rc->bits_off_target = rc->optimal_buffer_level; - rc->this_frame_target = calc_pframe_target_size_one_pass_cbr(cpi); - // Reset cyclic refresh parameters. - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) - av1_cyclic_refresh_reset_resize(cpi); - // Get the projected qindex, based on the scaled target frame size (scaled - // so target_bits_per_mb in av1_rc_regulate_q will be correct target). - target_bits_per_frame = (resize_now == 1) - ? rc->this_frame_target * tot_scale_change - : rc->this_frame_target / tot_scale_change; - active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi); - qindex = av1_rc_regulate_q(cpi, target_bits_per_frame, rc->best_quality, - active_worst_quality); - // If resize is down, check if projected q index is close to worst_quality, - // and if so, reduce the rate correction factor (since likely can afford - // lower q for resized frame). - if (resize_now == 1 && qindex > 90 * cpi->rc.worst_quality / 100) { - rc->rate_correction_factors[INTER_NORMAL] *= 0.85; - } - // If resize is back up, check if projected q index is too much above the - // current base_qindex, and if so, reduce the rate correction factor - // (since prefer to keep q for resized frame at least close to previous q). - if (resize_now == -1 && qindex > 130 * cm->base_qindex / 100) { - rc->rate_correction_factors[INTER_NORMAL] *= 0.9; - } - } - return resize_now; -} diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h index 93a9b4939..61bb0c224 100644 --- a/third_party/aom/av1/encoder/ratectrl.h +++ b/third_party/aom/av1/encoder/ratectrl.h @@ -49,27 +49,6 @@ typedef enum { } RATE_FACTOR_LEVEL; #endif // CONFIG_EXT_REFS -// Internal frame scaling level. -typedef enum { - UNSCALED = 0, // Frame is unscaled. - SCALE_STEP1 = 1, // First-level down-scaling. - FRAME_SCALE_STEPS -} FRAME_SCALE_LEVEL; - -// Frame dimensions multiplier wrt the native frame size, in 1/16ths, -// specified for the scale-up case. -// e.g. 24 => 16/24 = 2/3 of native size. The restriction to 1/16th is -// intended to match the capabilities of the normative scaling filters, -// giving precedence to the up-scaling accuracy. -static const int frame_scale_factor[FRAME_SCALE_STEPS] = { 16, 24 }; - -// Multiplier of the target rate to be used as threshold for triggering scaling. -static const double rate_thresh_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 }; - -// Scale dependent Rate Correction Factor multipliers. Compensates for the -// greater number of bits per pixel generated in down-scaled frames. -static const double rcf_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 }; - typedef struct { // Rate targetting variables int base_frame_target; // A baseline frame target before adjustment @@ -162,10 +141,6 @@ typedef struct { int q_2_frame; // Auto frame-scaling variables. 
- FRAME_SCALE_LEVEL frame_size_selector; - FRAME_SCALE_LEVEL next_frame_size_selector; - int frame_width[FRAME_SCALE_STEPS]; - int frame_height[FRAME_SCALE_STEPS]; int rf_level_maxq[RATE_FACTOR_LEVELS]; } RATE_CONTROL; @@ -214,6 +189,10 @@ int av1_rc_get_default_max_gf_interval(double framerate, int min_frame_rate); void av1_rc_get_one_pass_vbr_params(struct AV1_COMP *cpi); void av1_rc_get_one_pass_cbr_params(struct AV1_COMP *cpi); +// How many times less pixels there are to encode given the current scaling. +// Temporary replacement for rcf_mult and rate_thresh_mult. +double av1_resize_rate_factor(const struct AV1_COMP *cpi); + // Post encode update of the rate control parameters based // on bytes used void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used); diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c index f06e569e7..94c3bb96d 100644 --- a/third_party/aom/av1/encoder/rd.c +++ b/third_party/aom/av1/encoder/rd.c @@ -330,7 +330,6 @@ static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) { } } -#if CONFIG_REF_MV void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref, int ref_mv_idx) { MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext; @@ -340,19 +339,14 @@ void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref, (void)ref_frame; x->mvcost = x->mv_cost_stack[nmv_ctx]; x->nmvjointcost = x->nmv_vec_cost[nmv_ctx]; - x->mvsadcost = x->mvcost; - x->nmvjointsadcost = x->nmvjointcost; } -#endif void av1_initialize_rd_consts(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->td.mb; RD_OPT *const rd = &cpi->rd; int i; -#if CONFIG_REF_MV int nmv_ctx; -#endif aom_clear_system_state(); @@ -363,7 +357,6 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) { set_block_thresholds(cm, rd); -#if CONFIG_REF_MV for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) { av1_build_nmv_cost_table( x->nmv_vec_cost[nmv_ctx], @@ -373,19 +366,11 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) { } x->mvcost = x->mv_cost_stack[0]; x->nmvjointcost = x->nmv_vec_cost[0]; - x->mvsadcost = x->mvcost; - x->nmvjointsadcost = x->nmvjointcost; -#else - av1_build_nmv_cost_table( - x->nmvjointcost, cm->allow_high_precision_mv ? 
x->nmvcost_hp : x->nmvcost, - &cm->fc->nmvc, cm->allow_high_precision_mv); -#endif if (cpi->oxcf.pass != 1) { av1_fill_token_costs(x->token_costs, cm->fc->coef_probs); - if (cpi->sf.partition_search_type != VAR_BASED_PARTITION || - cm->frame_type == KEY_FRAME) { + if (cm->frame_type == KEY_FRAME) { #if CONFIG_EXT_PARTITION_TYPES for (i = 0; i < PARTITION_PLOFFSET; ++i) av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i], @@ -425,7 +410,6 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) { fill_mode_costs(cpi); if (!frame_is_intra_only(cm)) { -#if CONFIG_REF_MV for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) { cpi->newmv_mode_cost[i][0] = av1_cost_bit(cm->fc->newmv_prob[i], 0); cpi->newmv_mode_cost[i][1] = av1_cost_bit(cm->fc->newmv_prob[i], 1); @@ -445,20 +429,17 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) { cpi->drl_mode_cost0[i][0] = av1_cost_bit(cm->fc->drl_prob[i], 0); cpi->drl_mode_cost0[i][1] = av1_cost_bit(cm->fc->drl_prob[i], 1); } -#else - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - av1_cost_tokens((int *)cpi->inter_mode_cost[i], - cm->fc->inter_mode_probs[i], av1_inter_mode_tree); -#endif // CONFIG_REF_MV #if CONFIG_EXT_INTER for (i = 0; i < INTER_MODE_CONTEXTS; ++i) av1_cost_tokens((int *)cpi->inter_compound_mode_cost[i], cm->fc->inter_compound_mode_probs[i], av1_inter_compound_mode_tree); +#if CONFIG_INTERINTRA for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) av1_cost_tokens((int *)cpi->interintra_mode_cost[i], cm->fc->interintra_mode_prob[i], av1_interintra_mode_tree); +#endif // CONFIG_INTERINTRA #endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION for (i = BLOCK_8X8; i < BLOCK_SIZES; i++) { @@ -575,9 +556,15 @@ static void get_entropy_contexts_plane( const ENTROPY_CONTEXT *const above = pd->above_context; const ENTROPY_CONTEXT *const left = pd->left_context; +#if CONFIG_LV_MAP + memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); + return; +#endif // CONFIG_LV_MAP + int i; -#if CONFIG_CB4X4 +#if CONFIG_CHROMA_2X2 switch (tx_size) { case TX_2X2: memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); @@ -609,6 +596,20 @@ static void get_entropy_contexts_plane( t_left[i] = !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); break; +#if CONFIG_TX64X64 + case TX_64X64: + for (i = 0; i < num_4x4_w; i += 32) + t_above[i] = + !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8] | + *(const uint64_t *)&above[i + 16] | + *(const uint64_t *)&above[i + 24]); + for (i = 0; i < num_4x4_h; i += 32) + t_left[i] = + !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8] | + *(const uint64_t *)&left[i + 16] | + *(const uint64_t *)&left[i + 24]); + break; +#endif // CONFIG_TX64X64 case TX_4X8: for (i = 0; i < num_4x4_w; i += 2) t_above[i] = !!*(const uint16_t *)&above[i]; @@ -647,11 +648,39 @@ static void get_entropy_contexts_plane( for (i = 0; i < num_4x4_h; i += 8) t_left[i] = !!*(const uint64_t *)&left[i]; break; +#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT + case TX_4X16: + for (i = 0; i < num_4x4_w; i += 2) + t_above[i] = !!*(const uint16_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 8) + t_left[i] = !!*(const uint64_t *)&left[i]; + break; + case TX_16X4: + for (i = 0; i < num_4x4_w; i += 8) + t_above[i] = !!*(const uint64_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 2) + t_left[i] = !!*(const uint16_t *)&left[i]; + break; + case TX_8X32: + for (i = 0; i < num_4x4_w; i += 4) + t_above[i] = !!*(const uint32_t *)&above[i]; + for (i = 0; 
i < num_4x4_h; i += 16) + t_left[i] = + !!(*(const uint64_t *)&left[i] | *(const uint64_t *)&left[i + 8]); + break; + case TX_32X8: + for (i = 0; i < num_4x4_w; i += 16) + t_above[i] = + !!(*(const uint64_t *)&above[i] | *(const uint64_t *)&above[i + 8]); + for (i = 0; i < num_4x4_h; i += 4) + t_left[i] = !!*(const uint32_t *)&left[i]; + break; +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT default: assert(0 && "Invalid transform size."); break; } return; -#endif +#endif // CONFIG_CHROMA_2X2 switch (tx_size) { case TX_4X4: @@ -720,6 +749,30 @@ static void get_entropy_contexts_plane( for (i = 0; i < num_4x4_h; i += 4) t_left[i] = !!*(const uint32_t *)&left[i]; break; +#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT + case TX_4X16: + memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + for (i = 0; i < num_4x4_h; i += 4) + t_left[i] = !!*(const uint32_t *)&left[i]; + break; + case TX_16X4: + for (i = 0; i < num_4x4_w; i += 4) + t_above[i] = !!*(const uint32_t *)&above[i]; + memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); + break; + case TX_8X32: + for (i = 0; i < num_4x4_w; i += 2) + t_above[i] = !!*(const uint16_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 8) + t_left[i] = !!*(const uint64_t *)&left[i]; + break; + case TX_32X8: + for (i = 0; i < num_4x4_w; i += 8) + t_above[i] = !!*(const uint64_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 2) + t_left[i] = !!*(const uint16_t *)&left[i]; + break; +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT default: assert(0 && "Invalid transform size."); break; } } @@ -728,7 +781,12 @@ void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd, ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE], ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) { +#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 + const BLOCK_SIZE plane_bsize = + AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); +#else const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); +#endif get_entropy_contexts_plane(plane_bsize, tx_size, pd, t_above, t_left); } @@ -740,27 +798,25 @@ void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, int best_sad = INT_MAX; int this_sad = INT_MAX; int max_mv = 0; - int near_same_nearest; uint8_t *src_y_ptr = x->plane[0].src.buf; uint8_t *ref_y_ptr; - const int num_mv_refs = - MAX_MV_REF_CANDIDATES + - (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size); - - MV pred_mv[3]; - pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv; - pred_mv[1] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv; - pred_mv[2] = x->pred_mv[ref_frame]; + MV pred_mv[MAX_MV_REF_CANDIDATES + 1]; + int num_mv_refs = 0; + + pred_mv[num_mv_refs++] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv; + if (x->mbmi_ext->ref_mvs[ref_frame][0].as_int != + x->mbmi_ext->ref_mvs[ref_frame][1].as_int) { + pred_mv[num_mv_refs++] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv; + } + if (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size) + pred_mv[num_mv_refs++] = x->pred_mv[ref_frame]; + assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0]))); - near_same_nearest = x->mbmi_ext->ref_mvs[ref_frame][0].as_int == - x->mbmi_ext->ref_mvs[ref_frame][1].as_int; // Get the sad for each candidate reference mv. 
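The candidate loop below converts each 1/8-pel MV component to a full-pel offset with (v + 3 + (v >= 0)) >> 3 before taking the SAD. A stand-alone sketch of that rounding step (the helper name is illustrative, and it assumes the usual arithmetic right shift for negative values, as the encoder does):

/* Round a 1/8-pel MV component to the nearest full-pel offset,
 * with ties rounding away from zero. */
static int round_q3_to_fullpel(int v) {
  return (v + 3 + (v >= 0)) >> 3;
}
/* Examples: 13 (1.625 pel) -> 2, -13 -> -2, 4 (0.5 pel) -> 1, -4 -> -1. */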
for (i = 0; i < num_mv_refs; ++i) { const MV *this_mv = &pred_mv[i]; int fp_row, fp_col; - - if (i == 1 && near_same_nearest) continue; fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3; fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3; max_mv = AOMMAX(max_mv, AOMMAX(abs(this_mv->row), abs(this_mv->col)) >> 3); @@ -959,8 +1015,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { #if CONFIG_EXT_INTER - rd->thresh_mult[THR_COMP_NEAREST_NEARLA] += 1200; - rd->thresh_mult[THR_COMP_NEAR_NEARESTLA] += 1200; rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTLA] += 1500; @@ -970,8 +1024,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_ZERO_ZEROLA] += 2500; #if CONFIG_EXT_REFS - rd->thresh_mult[THR_COMP_NEAREST_NEARL2A] += 1200; - rd->thresh_mult[THR_COMP_NEAR_NEARESTL2A] += 1200; rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500; @@ -980,8 +1032,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 2000; rd->thresh_mult[THR_COMP_ZERO_ZEROL2A] += 2500; - rd->thresh_mult[THR_COMP_NEAREST_NEARL3A] += 1200; - rd->thresh_mult[THR_COMP_NEAR_NEARESTL3A] += 1200; rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] += 1500; @@ -991,8 +1041,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_ZERO_ZEROL3A] += 2500; #endif // CONFIG_EXT_REFS - rd->thresh_mult[THR_COMP_NEAREST_NEARGA] += 1200; - rd->thresh_mult[THR_COMP_NEAR_NEARESTGA] += 1200; rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTGA] += 1500; @@ -1002,8 +1050,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_ZERO_ZEROGA] += 2500; #if CONFIG_EXT_REFS - rd->thresh_mult[THR_COMP_NEAREST_NEARLB] += 1200; - rd->thresh_mult[THR_COMP_NEAR_NEARESTLB] += 1200; rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500; @@ -1012,8 +1058,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000; rd->thresh_mult[THR_COMP_ZERO_ZEROLB] += 2500; - rd->thresh_mult[THR_COMP_NEAREST_NEARL2B] += 1200; - rd->thresh_mult[THR_COMP_NEAR_NEARESTL2B] += 1200; rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] += 1500; @@ -1022,8 +1066,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEW_NEWL2B] += 2000; rd->thresh_mult[THR_COMP_ZERO_ZEROL2B] += 2500; - rd->thresh_mult[THR_COMP_NEAREST_NEARL3B] += 1200; - rd->thresh_mult[THR_COMP_NEAR_NEARESTL3B] += 1200; rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] += 1500; @@ -1032,8 +1074,6 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000; rd->thresh_mult[THR_COMP_ZERO_ZEROL3B] += 2500; - rd->thresh_mult[THR_COMP_NEAREST_NEARGB] += 1200; - rd->thresh_mult[THR_COMP_NEAR_NEARESTGB] += 1200; rd->thresh_mult[THR_COMP_NEAR_NEARGB] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWGB] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTGB] += 1500; diff --git 
a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h index c0ac1f7e7..5c3eee493 100644 --- a/third_party/aom/av1/encoder/rd.h +++ b/third_party/aom/av1/encoder/rd.h @@ -130,6 +130,10 @@ typedef enum { #if CONFIG_ALT_INTRA THR_SMOOTH, +#if CONFIG_SMOOTH_HV + THR_SMOOTH_V, + THR_SMOOTH_H, +#endif // CONFIG_SMOOTH_HV #endif // CONFIG_ALT_INTRA #if CONFIG_EXT_INTER @@ -357,6 +361,9 @@ static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { rd_stats->rdcost = 0; rd_stats->sse = 0; rd_stats->skip = 1; +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + rd_stats->dist_y = 0; +#endif #if CONFIG_RD_DEBUG for (plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats->txb_coeff_cost[plane] = 0; @@ -381,6 +388,9 @@ static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) { rd_stats->rdcost = INT64_MAX; rd_stats->sse = INT64_MAX; rd_stats->skip = 0; +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + rd_stats->dist_y = INT64_MAX; +#endif #if CONFIG_RD_DEBUG for (plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats->txb_coeff_cost[plane] = INT_MAX; @@ -405,6 +415,9 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, rd_stats_dst->dist += rd_stats_src->dist; rd_stats_dst->sse += rd_stats_src->sse; rd_stats_dst->skip &= rd_stats_src->skip; +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + rd_stats_dst->dist_y += rd_stats_src->dist_y; +#endif #if CONFIG_RD_DEBUG for (plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane]; @@ -454,10 +467,8 @@ YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi, void av1_init_me_luts(void); -#if CONFIG_REF_MV void av1_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int ref, int ref_mv_idx); -#endif void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd, diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c index a1096f782..2a537a06a 100644 --- a/third_party/aom/av1/encoder/rdopt.c +++ b/third_party/aom/av1/encoder/rdopt.c @@ -66,11 +66,18 @@ #endif // CONFIG_PVQ || CONFIG_DAALA_DIST #if CONFIG_DUAL_FILTER #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) +#if USE_EXTRA_FILTER static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = { { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, { 1, 0 }, { 1, 1 }, { 1, 2 }, { 1, 3 }, { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 }, { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 }, }; +#else // USE_EXTRA_FILTER +static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = { + { 0, 0 }, { 0, 1 }, { 0, 2 }, { 1, 0 }, { 1, 1 }, + { 1, 2 }, { 2, 0 }, { 2, 1 }, { 2, 2 }, +}; +#endif // USE_EXTRA_FILTER #endif // CONFIG_DUAL_FILTER #if CONFIG_EXT_REFS @@ -217,11 +224,13 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { #if CONFIG_ALT_INTRA { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, +#if CONFIG_SMOOTH_HV + { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } }, +#endif // CONFIG_SMOOTH_HV #endif // CONFIG_ALT_INTRA #if CONFIG_EXT_INTER - { NEAR_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, - { NEAREST_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, @@ -231,8 +240,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { ZERO_ZEROMV, { LAST_FRAME, ALTREF_FRAME } }, #if CONFIG_EXT_REFS - { NEAR_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, - { NEAREST_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, { 
NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, @@ -241,8 +248,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, { ZERO_ZEROMV, { LAST2_FRAME, ALTREF_FRAME } }, - { NEAR_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, - { NEAREST_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, @@ -252,8 +257,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { ZERO_ZEROMV, { LAST3_FRAME, ALTREF_FRAME } }, #endif // CONFIG_EXT_REFS - { NEAR_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, - { NEAREST_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, @@ -263,8 +266,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } }, #if CONFIG_EXT_REFS - { NEAR_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, - { NEAREST_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, @@ -273,8 +274,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, { ZERO_ZEROMV, { LAST_FRAME, BWDREF_FRAME } }, - { NEAR_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, - { NEAREST_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, @@ -283,8 +282,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, { ZERO_ZEROMV, { LAST2_FRAME, BWDREF_FRAME } }, - { NEAR_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, - { NEAREST_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, @@ -293,8 +290,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, { ZERO_ZEROMV, { LAST3_FRAME, BWDREF_FRAME } }, - { NEAR_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, - { NEAREST_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, @@ -390,28 +385,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { #endif // CONFIG_EXT_INTER }; -static const REF_DEFINITION av1_ref_order[MAX_REFS] = { - { { LAST_FRAME, NONE_FRAME } }, -#if CONFIG_EXT_REFS - { { LAST2_FRAME, NONE_FRAME } }, { { LAST3_FRAME, NONE_FRAME } }, - { { BWDREF_FRAME, NONE_FRAME } }, -#endif // CONFIG_EXT_REFS - { { GOLDEN_FRAME, NONE_FRAME } }, { { ALTREF_FRAME, NONE_FRAME } }, - - { { LAST_FRAME, ALTREF_FRAME } }, -#if CONFIG_EXT_REFS - { { LAST2_FRAME, ALTREF_FRAME } }, { { LAST3_FRAME, ALTREF_FRAME } }, -#endif // CONFIG_EXT_REFS - { { GOLDEN_FRAME, ALTREF_FRAME } }, - -#if CONFIG_EXT_REFS - { { LAST_FRAME, BWDREF_FRAME } }, { { LAST2_FRAME, BWDREF_FRAME } }, - { { LAST3_FRAME, BWDREF_FRAME } }, { { GOLDEN_FRAME, BWDREF_FRAME } }, -#endif // CONFIG_EXT_REFS - - { { INTRA_FRAME, NONE_FRAME } }, -}; - #if CONFIG_EXT_INTRA || 
CONFIG_FILTER_INTRA || CONFIG_PALETTE static INLINE int write_uniform_cost(int n, int v) { const int l = get_unsigned_bits(n); @@ -430,22 +403,6 @@ static INLINE int write_uniform_cost(int n, int v) { #define FAST_EXT_TX_CORR_MARGIN 0.5 #define FAST_EXT_TX_EDST_MARGIN 0.3 -static const TX_TYPE_1D vtx_tab[TX_TYPES] = { - DCT_1D, ADST_1D, DCT_1D, ADST_1D, -#if CONFIG_EXT_TX - FLIPADST_1D, DCT_1D, FLIPADST_1D, ADST_1D, FLIPADST_1D, IDTX_1D, - DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, IDTX_1D, -#endif // CONFIG_EXT_TX -}; - -static const TX_TYPE_1D htx_tab[TX_TYPES] = { - DCT_1D, DCT_1D, ADST_1D, ADST_1D, -#if CONFIG_EXT_TX - DCT_1D, FLIPADST_1D, FLIPADST_1D, FLIPADST_1D, ADST_1D, IDTX_1D, - IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, -#endif // CONFIG_EXT_TX -}; - #if CONFIG_DAALA_DIST static int od_compute_var_4x4(od_coeff *x, int stride) { int sum; @@ -603,10 +560,9 @@ static double od_compute_dist(int qm, int activity_masking, od_coeff *x, return sum; } -static int64_t av1_daala_dist(const uint8_t *src, int src_stride, - const uint8_t *dst, int dst_stride, int bsw, - int bsh, int qm, int use_activity_masking, - int qindex) { +int64_t av1_daala_dist(const uint8_t *src, int src_stride, const uint8_t *dst, + int dst_stride, int bsw, int bsh, int qm, + int use_activity_masking, int qindex) { int i, j; int64_t d; DECLARE_ALIGNED(16, od_coeff, orig[MAX_TX_SQUARE]); @@ -843,7 +799,7 @@ static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize, static int prune_tx_types(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, const MACROBLOCKD *const xd, int tx_set) { #if CONFIG_EXT_TX - const int *tx_set_1D = ext_tx_used_inter_1D[tx_set]; + const int *tx_set_1D = tx_set >= 0 ? ext_tx_used_inter_1D[tx_set] : NULL; #else const int tx_set_1D[TX_TYPES_1D] = { 0 }; #endif // CONFIG_EXT_TX @@ -1100,13 +1056,10 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, int c, cost; const int16_t *scan = scan_order->scan; const int16_t *nb = scan_order->neighbors; -#if CONFIG_NEW_TOKENSET const int ref = is_inter_block(mbmi); aom_prob *blockz_probs = cm->fc->blockzero_probs[txsize_sqr_map[tx_size]][type][ref]; -#endif // CONFIG_NEW_TOKENSET - #if CONFIG_HIGHBITDEPTH const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd); #else @@ -1120,12 +1073,8 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, (void)cm; if (eob == 0) { -#if CONFIG_NEW_TOKENSET // single eob token cost = av1_cost_bit(blockz_probs[pt], 0); -#else - cost = token_costs[0][0][pt][EOB_TOKEN]; -#endif // CONFIG_NEW_TOKENSET } else { if (use_fast_coef_costing) { int band_left = *band_count++; @@ -1134,11 +1083,7 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, int v = qcoeff[0]; int16_t prev_t; cost = av1_get_token_cost(v, &prev_t, cat6_bits); -#if CONFIG_NEW_TOKENSET cost += (*token_costs)[!prev_t][pt][prev_t]; -#else - cost += (*token_costs)[0][pt][prev_t]; -#endif token_cache[0] = av1_pt_energy_class[prev_t]; ++token_costs; @@ -1150,11 +1095,7 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, v = qcoeff[rc]; cost += av1_get_token_cost(v, &t, cat6_bits); -#if CONFIG_NEW_TOKENSET cost += (*token_costs)[!t][!prev_t][t]; -#else - cost += (*token_costs)[!prev_t][!prev_t][t]; -#endif prev_t = t; if (!--band_left) { band_left = *band_count++; @@ -1163,8 +1104,7 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, } // eob token - if (band_left || CONFIG_NEW_TOKENSET) - cost 
+= (*token_costs)[0][!prev_t][EOB_TOKEN]; + cost += (*token_costs)[0][!prev_t][EOB_TOKEN]; } else { // !use_fast_coef_costing int band_left = *band_count++; @@ -1172,23 +1112,12 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, // dc token int v = qcoeff[0]; int16_t tok; -#if !CONFIG_NEW_TOKENSET - unsigned int(*tok_cost_ptr)[COEFF_CONTEXTS][ENTROPY_TOKENS]; -#endif cost = av1_get_token_cost(v, &tok, cat6_bits); -#if CONFIG_NEW_TOKENSET cost += (*token_costs)[!tok][pt][tok]; -#else - cost += (*token_costs)[0][pt][tok]; -#endif token_cache[0] = av1_pt_energy_class[tok]; ++token_costs; -#if !CONFIG_NEW_TOKENSET - tok_cost_ptr = &((*token_costs)[!tok]); -#endif - // ac tokens for (c = 1; c < eob; c++) { const int rc = scan[c]; @@ -1196,26 +1125,17 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, v = qcoeff[rc]; cost += av1_get_token_cost(v, &tok, cat6_bits); pt = get_coef_context(nb, token_cache, c); -#if CONFIG_NEW_TOKENSET cost += (*token_costs)[!tok][pt][tok]; -#else - cost += (*tok_cost_ptr)[pt][tok]; -#endif token_cache[rc] = av1_pt_energy_class[tok]; if (!--band_left) { band_left = *band_count++; ++token_costs; } -#if !CONFIG_NEW_TOKENSET - tok_cost_ptr = &((*token_costs)[!tok]); -#endif } // eob token - if (band_left || CONFIG_NEW_TOKENSET) { - pt = get_coef_context(nb, token_cache, c); - cost += (*token_costs)[0][pt][EOB_TOKEN]; - } + pt = get_coef_context(nb, token_cache, c); + cost += (*token_costs)[0][pt][EOB_TOKEN]; } } @@ -1262,7 +1182,9 @@ static void get_txb_dimensions(const MACROBLOCKD *xd, int plane, BLOCK_SIZE plane_bsize, int blk_row, int blk_col, BLOCK_SIZE tx_bsize, int *width, int *height, int *visible_width, int *visible_height) { +#if !(CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT) assert(tx_bsize <= plane_bsize); +#endif // !(CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT) int txb_height = block_size_high[tx_bsize]; int txb_width = block_size_wide[tx_bsize]; const int block_height = block_size_high[plane_bsize]; @@ -1298,7 +1220,12 @@ static unsigned pixel_sse(const AV1_COMP *const cpi, const MACROBLOCKD *xd, &txb_cols, &txb_rows, &visible_cols, &visible_rows); assert(visible_rows > 0); assert(visible_cols > 0); +#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT + if ((txb_rows == visible_rows && txb_cols == visible_cols) && + tx_bsize < BLOCK_SIZES) { +#else if (txb_rows == visible_rows && txb_cols == visible_cols) { +#endif unsigned sse; cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); return sse; @@ -1533,7 +1460,36 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, if (args->exit_early) return; if (!is_inter_block(mbmi)) { +#if CONFIG_CFL + +#if CONFIG_EC_ADAPT + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; +#else + FRAME_CONTEXT *const ec_ctx = cm->fc; +#endif // CONFIG_EC_ADAPT + + av1_predict_intra_block_encoder_facade(x, ec_ctx, plane, block, blk_col, + blk_row, tx_size, plane_bsize); +#else av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size); +#endif +#if CONFIG_DPCM_INTRA + const int block_raster_idx = + av1_block_index_to_raster_order(tx_size, block); + const PREDICTION_MODE mode = + (plane == 0) ? get_y_mode(xd->mi[0], block_raster_idx) : mbmi->uv_mode; + TX_TYPE tx_type = get_tx_type((plane == 0) ? 
PLANE_TYPE_Y : PLANE_TYPE_UV, + xd, block, tx_size); + if (av1_use_dpcm_intra(plane, mode, tx_type, mbmi)) { + int8_t skip; + av1_encode_block_intra_dpcm(cm, x, mode, plane, block, blk_row, blk_col, + plane_bsize, tx_size, tx_type, a, l, &skip); + av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, + tx_size, &this_rd_stats.dist, &this_rd_stats.sse, + OUTPUT_HAS_DECODED_PIXELS); + goto CALCULATE_RD; + } +#endif // CONFIG_DPCM_INTRA av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); } @@ -1542,8 +1498,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, const int coeff_ctx = combine_entropy_contexts(*a, *l); av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - if (x->plane[plane].eobs[block] && !xd->lossless[mbmi->segment_id]) - av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx); + av1_optimize_b(cm, x, plane, block, plane_bsize, tx_size, a, l); if (!is_inter_block(mbmi)) { struct macroblock_plane *const p = &x->plane[plane]; @@ -1566,6 +1521,9 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size); } #endif +#if CONFIG_DPCM_INTRA +CALCULATE_RD : {} +#endif // CONFIG_DPCM_INTRA rd = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.dist); if (args->this_rd + rd > args->best_rd) { args->exit_early = 1; @@ -1603,7 +1561,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, rd = AOMMIN(rd1, rd2); #if CONFIG_DAALA_DIST - if (plane == 0 && + if (plane == 0 && plane_bsize >= BLOCK_8X8 && (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) { this_rd_stats.dist = 0; this_rd_stats.sse = 0; @@ -1641,6 +1599,9 @@ static void block_8x8_rd_txfm_daala_dist(int plane, int block, int blk_row, int use_activity_masking = 0; (void)tx_size; + + assert(plane == 0); + assert(plane_bsize >= BLOCK_8X8); #if CONFIG_PVQ use_activity_masking = x->daala_enc.use_activity_masking; #endif // CONFIG_PVQ @@ -1700,10 +1661,15 @@ static void block_8x8_rd_txfm_daala_dist(int plane, int block, int blk_row, { const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + const uint8_t txw_unit = tx_size_wide_unit[tx_size]; + const uint8_t txh_unit = tx_size_high_unit[tx_size]; + const int step = txw_unit * txh_unit; + int offset_h = tx_size_high_unit[TX_4X4]; // The rate of the current 8x8 block is the sum of four 4x4 blocks in it. 
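A sketch of the index arithmetic used just below, under the assumption of square TX_4X4 sub-blocks (txw_unit = txh_unit = 1, so step = 1 and offset_h = 1) and with `block` indexing the bottom-right 4x4 of the 8x8 area; the helper name and `stride` (standing in for max_blocks_wide) are illustrative:

static int sum_8x8_rate_from_4x4(const int *rate_4x4, int block, int stride) {
  return rate_4x4[block - stride - 1] + /* top-left 4x4     */
         rate_4x4[block - stride] +     /* top-right 4x4    */
         rate_4x4[block - 1] +          /* bottom-left 4x4  */
         rate_4x4[block];               /* bottom-right 4x4 */
}
/* With rectangular 4x8/8x4 transforms, step and offset_h take other values,
 * which appears to be why the patch rewrites the offsets in terms of them. */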
- this_rd_stats.rate = x->rate_4x4[block - max_blocks_wide - 1] + - x->rate_4x4[block - max_blocks_wide] + - x->rate_4x4[block - 1] + x->rate_4x4[block]; + this_rd_stats.rate = + x->rate_4x4[block - max_blocks_wide * offset_h - step] + + x->rate_4x4[block - max_blocks_wide * offset_h] + + x->rate_4x4[block - step] + x->rate_4x4[block]; } rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist); rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse); @@ -1740,10 +1706,10 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left); #if CONFIG_DAALA_DIST - if (plane == 0 && + if (plane == 0 && bsize >= BLOCK_8X8 && (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) - av1_foreach_8x8_transformed_block_in_plane( - xd, bsize, plane, block_rd_txfm, block_8x8_rd_txfm_daala_dist, &args); + av1_foreach_8x8_transformed_block_in_yplane( + xd, bsize, block_rd_txfm, block_8x8_rd_txfm_daala_dist, &args); else #endif // CONFIG_DAALA_DIST av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm, @@ -1812,7 +1778,12 @@ static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x, const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size]; const int depth = tx_size_to_depth(coded_tx_size); const int tx_size_ctx = get_tx_size_context(xd); - const int r_tx_size = cpi->tx_size_cost[tx_size_cat][tx_size_ctx][depth]; + int r_tx_size = cpi->tx_size_cost[tx_size_cat][tx_size_ctx][depth]; +#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT + if (is_quarter_tx_allowed(xd, mbmi, is_inter) && tx_size != coded_tx_size) + r_tx_size += av1_cost_bit(cm->fc->quarter_tx_size_prob, + tx_size == quarter_txsize_lookup[bsize]); +#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT return r_tx_size; } else { return 0; @@ -1924,9 +1895,7 @@ static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs, // transforms should be considered for pruning prune = prune_tx_types(cpi, bs, x, xd, -1); -#if CONFIG_REF_MV if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) return 1; -#endif // CONFIG_REF_MV if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, tx_size)) return 1; if (!is_inter && x->use_default_intra_tx_type && @@ -1960,7 +1929,7 @@ static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs, return 0; } -#if CONFIG_EXT_INTER +#if CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT) static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, MACROBLOCK *x, int *r, int64_t *d, int *s, int64_t *sse, int64_t ref_best_rd) { @@ -1973,7 +1942,7 @@ static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, *sse = rd_stats.sse; return rd; } -#endif // CONFIG_EXT_INTER +#endif // CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT) static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, int64_t ref_best_rd, @@ -2191,9 +2160,7 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, #endif TX_TYPE tx_type; for (tx_type = tx_start; tx_type < tx_end; ++tx_type) { -#if CONFIG_REF_MV if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue; -#endif // CONFIG_REF_MV const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs]; RD_STATS this_rd_stats; int ext_tx_set = @@ -2219,6 +2186,56 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, #endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 } } + +#if CONFIG_RECT_TX_EXT + 
// test 1:4/4:1 tx + int evaluate_quarter_tx = 0; + if (is_quarter_tx_allowed(xd, mbmi, is_inter)) { + if (tx_select) { + evaluate_quarter_tx = 1; + } else { + const TX_SIZE chosen_tx_size = + tx_size_from_tx_mode(bs, cm->tx_mode, is_inter); + evaluate_quarter_tx = chosen_tx_size == quarter_txsize_lookup[bs]; + } + } + if (evaluate_quarter_tx) { + TX_TYPE tx_start = DCT_DCT; + TX_TYPE tx_end = TX_TYPES; +#if CONFIG_TXK_SEL + // The tx_type becomes dummy when lv_map is on. The tx_type search will be + // performed in av1_search_txk_type() + tx_end = DCT_DCT + 1; +#endif + TX_TYPE tx_type; + for (tx_type = tx_start; tx_type < tx_end; ++tx_type) { + if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue; + const TX_SIZE tx_size = quarter_txsize_lookup[bs]; + RD_STATS this_rd_stats; + int ext_tx_set = + get_ext_tx_set(tx_size, bs, is_inter, cm->reduced_tx_set_used); + if ((is_inter && ext_tx_used_inter[ext_tx_set][tx_type]) || + (!is_inter && ext_tx_used_intra[ext_tx_set][tx_type])) { + rd = + txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, tx_size); + if (rd < best_rd) { +#if CONFIG_TXK_SEL + memcpy(best_txk_type, mbmi->txk_type, + sizeof(best_txk_type[0]) * num_blk); +#endif + best_tx_type = tx_type; + best_tx_size = tx_size; + best_rd = rd; + *rd_stats = this_rd_stats; + } + } +#if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 + const int is_inter = is_inter_block(mbmi); + if (mbmi->sb_type < BLOCK_8X8 && is_inter) break; +#endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 + } + } +#endif // CONFIG_RECT_TX_EXT #endif // CONFIG_EXT_TX && CONFIG_RECT_TX if (tx_select) { @@ -2334,6 +2351,7 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mode_cost) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + assert(!is_inter_block(mbmi)); RD_STATS this_rd_stats; int row, col; int64_t temp_sse, this_rd; @@ -2348,7 +2366,21 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, int block = 0; for (row = 0; row < max_blocks_high; row += stepr) { for (col = 0; col < max_blocks_wide; col += stepc) { +#if CONFIG_CFL + const struct macroblockd_plane *const pd = &xd->plane[0]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + +#if CONFIG_EC_ADAPT + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; +#else + FRAME_CONTEXT *const ec_ctx = cpi->common.fc; +#endif // CONFIG_EC_ADAPT + + av1_predict_intra_block_encoder_facade(x, ec_ctx, 0, block, col, row, + tx_size, plane_bsize); +#else av1_predict_intra_block_facade(xd, 0, block, col, row, tx_size); +#endif block += step; } } @@ -2403,6 +2435,28 @@ static void extend_palette_color_map(uint8_t *const color_map, int orig_width, } } +#if CONFIG_PALETTE_DELTA_ENCODING +// Bias toward using colors in the cache. +// TODO(huisu): Try other schemes to improve compression. 
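A minimal standalone demo (an assumption-laden sketch, not part of the patch) of the cache-snapping rule that optimize_palette_colors() below implements: a k-means centroid lying within 1.5 of a previously signalled (cached) color is replaced by that cached color, which makes the palette cheaper to code under CONFIG_PALETTE_DELTA_ENCODING. The values here are made up for illustration.

#include <stdio.h>

/* Hypothetical demo: cache {10, 50}, centroids {10.8, 48.7, 97.0}.
 * The first two centroids are within 1.5 of a cached color and snap to
 * 10 and 50; the third is left unchanged. */
int main(void) {
  const unsigned short cache[2] = { 10, 50 };
  float centroids[3] = { 10.8f, 48.7f, 97.0f };
  for (int i = 0; i < 3; ++i) {
    float min_diff = 1e30f;
    int idx = 0;
    for (int j = 0; j < 2; ++j) {
      const float diff = centroids[i] > cache[j] ? centroids[i] - cache[j]
                                                 : cache[j] - centroids[i];
      if (diff < min_diff) {
        min_diff = diff;
        idx = j;
      }
    }
    if (min_diff < 1.5f) centroids[i] = cache[idx];
    printf("centroid %d -> %.1f\n", i, centroids[i]); /* 10.0, 50.0, 97.0 */
  }
  return 0;
}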
+static void optimize_palette_colors(uint16_t *color_cache, int n_cache, + int n_colors, int stride, + float *centroids) { + if (n_cache <= 0) return; + for (int i = 0; i < n_colors * stride; i += stride) { + float min_diff = fabsf(centroids[i] - color_cache[0]); + int idx = 0; + for (int j = 1; j < n_cache; ++j) { + float this_diff = fabsf(centroids[i] - color_cache[j]); + if (this_diff < min_diff) { + min_diff = this_diff; + idx = j; + } + } + if (min_diff < 1.5) centroids[i] = color_cache[idx]; + } +} +#endif // CONFIG_PALETTE_DELTA_ENCODING + static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int palette_ctx, int dc_mode_cost, MB_MODE_INFO *best_mbmi, @@ -2414,6 +2468,7 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mic = xd->mi[0]; MB_MODE_INFO *const mbmi = &mic->mbmi; + assert(!is_inter_block(mbmi)); int this_rate, colors, n; const int src_stride = x->plane[0].src.stride; const uint8_t *const src = x->plane[0].src.buf; @@ -2488,12 +2543,38 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return 0; +#if CONFIG_PALETTE_DELTA_ENCODING + const MODE_INFO *above_mi = xd->above_mi; + const MODE_INFO *left_mi = xd->left_mi; + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = + av1_get_palette_cache(above_mi, left_mi, 0, color_cache); +#endif // CONFIG_PALETTE_DELTA_ENCODING + for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2; --n) { - for (i = 0; i < n; ++i) - centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2; - av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr); - k = av1_remove_duplicates(centroids, n); + if (colors == PALETTE_MIN_SIZE) { + // Special case: These colors automatically become the centroids. + assert(colors == n); + assert(colors == 2); + centroids[0] = lb; + centroids[1] = ub; + k = 2; + } else { + for (i = 0; i < n; ++i) { + centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2; + } + av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr); +#if CONFIG_PALETTE_DELTA_ENCODING + optimize_palette_colors(color_cache, n_cache, n, 1, centroids); +#endif // CONFIG_PALETTE_DELTA_ENCODING + k = av1_remove_duplicates(centroids, n); + if (k < PALETTE_MIN_SIZE) { + // Too few unique colors to create a palette. And DC_PRED will work + // well for that case anyway. So skip. + continue; + } + } #if CONFIG_HIGHBITDEPTH if (cpi->common.use_highbitdepth) @@ -2516,7 +2597,11 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, av1_cost_bit( av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 1); - palette_mode_cost += av1_palette_color_cost_y(pmi, cpi->common.bit_depth); + palette_mode_cost += av1_palette_color_cost_y(pmi, +#if CONFIG_PALETTE_DELTA_ENCODING + color_cache, n_cache, +#endif // CONFIG_PALETTE_DELTA_ENCODING + cpi->common.bit_depth); for (i = 0; i < rows; ++i) { for (j = (i == 0 ? 
1 : 0); j < cols; ++j) { int color_idx; @@ -2570,6 +2655,7 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( const AV1_COMMON *const cm = &cpi->common; PREDICTION_MODE mode; MACROBLOCKD *const xd = &x->e_mbd; + assert(!is_inter_block(&xd->mi[0]->mbmi)); int64_t best_rd = rd_thresh; struct macroblock_plane *p = &x->plane[0]; struct macroblockd_plane *pd = &xd->plane[0]; @@ -2577,7 +2663,7 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( const int dst_stride = pd->dst.stride; const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4]; uint8_t *dst_init = &pd->dst.buf[row * 4 * dst_stride + col * 4]; -#if CONFIG_CB4X4 +#if CONFIG_CHROMA_2X2 // TODO(jingning): This is a temporal change. The whole function should be // out when cb4x4 is enabled. ENTROPY_CONTEXT ta[4], tempa[4]; @@ -2585,7 +2671,7 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( #else ENTROPY_CONTEXT ta[2], tempa[2]; ENTROPY_CONTEXT tl[2], templ[2]; -#endif // CONFIG_CB4X4 +#endif // CONFIG_CHROMA_2X2 const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize]; const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize]; @@ -2738,7 +2824,8 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( #if !CONFIG_PVQ av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx); + av1_optimize_b(cm, x, 0, block, BLOCK_8X8, tx_size, tempa + idx, + templ + idy); ratey += av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, tempa + idx, templ + idy, cpi->sf.use_fast_coef_costing); @@ -2897,9 +2984,8 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( #endif // CONFIG_CB4X4 BLOCK_8X8, tx_size, coeff_ctx, xform_quant); - if (!is_lossless) { - av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx); - } + av1_optimize_b(cm, x, 0, block, BLOCK_8X8, tx_size, tempa + idx, + templ + idy); ratey += av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, tempa + idx, @@ -3013,6 +3099,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, const MODE_INFO *above_mi = xd->above_mi; const MODE_INFO *left_mi = xd->left_mi; MB_MODE_INFO *const mbmi = &mic->mbmi; + assert(!is_inter_block(mbmi)); const BLOCK_SIZE bsize = mbmi->sb_type; const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize]; const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize]; @@ -3220,6 +3307,7 @@ static int64_t calc_rd_given_intra_angle( RD_STATS tokenonly_rd_stats; int64_t this_rd, this_model_rd; MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi; + assert(!is_inter_block(mbmi)); mbmi->angle_delta[0] = angle_delta; this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost); @@ -3261,6 +3349,7 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mic = xd->mi[0]; MB_MODE_INFO *mbmi = &mic->mbmi; + assert(!is_inter_block(mbmi)); int i, angle_delta, best_angle_delta = 0; int first_try = 1; #if CONFIG_INTRA_INTERP @@ -3393,32 +3482,40 @@ static const uint8_t gradient_to_angle_bin[2][7][16] = { }, }; +/* clang-format off */ static const uint8_t mode_to_angle_bin[INTRA_MODES] = { 0, 2, 6, 0, 4, 3, 5, 7, 1, 0, +#if CONFIG_ALT_INTRA + 0, +#endif // CONFIG_ALT_INTRA }; +/* clang-format on */ static void angle_estimation(const uint8_t *src, int src_stride, int rows, - int cols, uint8_t *directional_mode_skip_mask) { - int i, r, c, index, dx, dy, temp, sn, remd, quot; + int cols, BLOCK_SIZE bsize, + uint8_t 
*directional_mode_skip_mask) { + memset(directional_mode_skip_mask, 0, + INTRA_MODES * sizeof(*directional_mode_skip_mask)); + // Sub-8x8 blocks do not use extra directions. + if (bsize < BLOCK_8X8) return; uint64_t hist[DIRECTIONAL_MODES]; - uint64_t hist_sum = 0; - memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0])); src += src_stride; + int r, c, dx, dy; for (r = 1; r < rows; ++r) { for (c = 1; c < cols; ++c) { dx = src[c] - src[c - 1]; dy = src[c] - src[c - src_stride]; - temp = dx * dx + dy * dy; + int index; + const int temp = dx * dx + dy * dy; if (dy == 0) { index = 2; } else { - sn = (dx > 0) ^ (dy > 0); + const int sn = (dx > 0) ^ (dy > 0); dx = abs(dx); dy = abs(dy); - remd = dx % dy; - quot = dx / dy; - remd = remd * 16 / dy; + const int remd = (dx % dy) * 16 / dy; + const int quot = dx / dy; index = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)]; } hist[index] += temp; @@ -3426,9 +3523,11 @@ static void angle_estimation(const uint8_t *src, int src_stride, int rows, src += src_stride; } + int i; + uint64_t hist_sum = 0; for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i]; for (i = 0; i < INTRA_MODES; ++i) { - if (i != DC_PRED && i != TM_PRED) { + if (av1_is_directional_mode(i, bsize)) { const uint8_t angle_bin = mode_to_angle_bin[i]; uint64_t score = 2 * hist[angle_bin]; int weight = 2; @@ -3448,29 +3547,31 @@ static void angle_estimation(const uint8_t *src, int src_stride, int rows, #if CONFIG_HIGHBITDEPTH static void highbd_angle_estimation(const uint8_t *src8, int src_stride, - int rows, int cols, + int rows, int cols, BLOCK_SIZE bsize, uint8_t *directional_mode_skip_mask) { - int i, r, c, index, dx, dy, temp, sn, remd, quot; - uint64_t hist[DIRECTIONAL_MODES]; - uint64_t hist_sum = 0; + memset(directional_mode_skip_mask, 0, + INTRA_MODES * sizeof(*directional_mode_skip_mask)); + // Sub-8x8 blocks do not use extra directions. 
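An illustrative standalone sketch (not part of the patch) of how angle_estimation()/highbd_angle_estimation() map one gradient sample to a histogram bin: the bin depends on whether dx and dy have opposite signs and on the ratio dx/dy split into an integer part (quot) and a 1/16-resolution fractional part (remd); the squared gradient magnitude dx*dx + dy*dy is then accumulated into that bin. Only the indexing arithmetic is reproduced here, with the bin table standing in for gradient_to_angle_bin defined above; the helper name is hypothetical.

#include <stdlib.h>

/* Hypothetical helper, for illustration only. */
int angle_bin_for_gradient(int dx, int dy,
                           const unsigned char bin[2][7][16]) {
  if (dy == 0) return 2; /* purely horizontal gradient */
  const int sn = (dx > 0) ^ (dy > 0); /* do dx and dy have opposite signs? */
  dx = abs(dx);
  dy = abs(dy);
  const int quot = dx / dy;             /* integer part of dx/dy */
  const int remd = (dx % dy) * 16 / dy; /* fractional part, in sixteenths */
  return bin[sn][quot < 6 ? quot : 6][remd < 15 ? remd : 15];
}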
+ if (bsize < BLOCK_8X8) return; uint16_t *src = CONVERT_TO_SHORTPTR(src8); - + uint64_t hist[DIRECTIONAL_MODES]; memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0])); src += src_stride; + int r, c, dx, dy; for (r = 1; r < rows; ++r) { for (c = 1; c < cols; ++c) { dx = src[c] - src[c - 1]; dy = src[c] - src[c - src_stride]; - temp = dx * dx + dy * dy; + int index; + const int temp = dx * dx + dy * dy; if (dy == 0) { index = 2; } else { - sn = (dx > 0) ^ (dy > 0); + const int sn = (dx > 0) ^ (dy > 0); dx = abs(dx); dy = abs(dy); - remd = dx % dy; - quot = dx / dy; - remd = remd * 16 / dy; + const int remd = (dx % dy) * 16 / dy; + const int quot = dx / dy; index = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)]; } hist[index] += temp; @@ -3478,9 +3579,11 @@ static void highbd_angle_estimation(const uint8_t *src8, int src_stride, src += src_stride; } + int i; + uint64_t hist_sum = 0; for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i]; for (i = 0; i < INTRA_MODES; ++i) { - if (i != DC_PRED && i != TM_PRED) { + if (av1_is_directional_mode(i, bsize)) { const uint8_t angle_bin = mode_to_angle_bin[i]; uint64_t score = 2 * hist[angle_bin]; int weight = 2; @@ -3509,6 +3612,7 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mic = xd->mi[0]; MB_MODE_INFO *const mbmi = &mic->mbmi; + assert(!is_inter_block(mbmi)); MB_MODE_INFO best_mbmi = *mbmi; int64_t best_model_rd = INT64_MAX; #if CONFIG_EXT_INTRA @@ -3552,15 +3656,14 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, #if CONFIG_EXT_INTRA mbmi->angle_delta[0] = 0; - memset(directional_mode_skip_mask, 0, - sizeof(directional_mode_skip_mask[0]) * INTRA_MODES); #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - highbd_angle_estimation(src, src_stride, rows, cols, + highbd_angle_estimation(src, src_stride, rows, cols, bsize, directional_mode_skip_mask); else #endif // CONFIG_HIGHBITDEPTH - angle_estimation(src, src_stride, rows, cols, directional_mode_skip_mask); + angle_estimation(src, src_stride, rows, cols, bsize, + directional_mode_skip_mask); #endif // CONFIG_EXT_INTRA #if CONFIG_FILTER_INTRA mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; @@ -3833,7 +3936,7 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx); + av1_optimize_b(cm, x, plane, block, plane_bsize, tx_size, a, l); // TODO(any): Use av1_dist_block to compute distortion #if CONFIG_HIGHBITDEPTH @@ -3936,9 +4039,8 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, ENTROPY_CONTEXT *pta = ta + blk_col; ENTROPY_CONTEXT *ptl = tl + blk_row; int coeff_ctx, i; - int ctx = - txfm_partition_context(tx_above + (blk_col >> 1), - tx_left + (blk_row >> 1), mbmi->sb_type, tx_size); + int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, + mbmi->sb_type, tx_size); int64_t sum_rd = INT64_MAX; int tmp_eob = 0; int zero_blk_rate; @@ -4042,8 +4144,8 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int idx, idy; for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) pta[i] = !(tmp_eob == 0); for (i = 0; i < tx_size_high_unit[tx_size]; ++i) ptl[i] = !(tmp_eob == 0); - txfm_partition_update(tx_above + (blk_col >> 1), tx_left + (blk_row >> 1), - tx_size, tx_size); + 
txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, + tx_size); inter_tx_size[0][0] = tx_size; for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy) for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx) @@ -4082,17 +4184,15 @@ static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE]; ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE]; - TXFM_CONTEXT tx_above[MAX_MIB_SIZE]; - TXFM_CONTEXT tx_left[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_above[MAX_MIB_SIZE * 2]; + TXFM_CONTEXT tx_left[MAX_MIB_SIZE * 2]; RD_STATS pn_rd_stats; av1_init_rd_stats(&pn_rd_stats); av1_get_entropy_contexts(bsize, 0, pd, ctxa, ctxl); - memcpy(tx_above, xd->above_txfm_context, - sizeof(TXFM_CONTEXT) * (mi_width >> 1)); - memcpy(tx_left, xd->left_txfm_context, - sizeof(TXFM_CONTEXT) * (mi_height >> 1)); + memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width); + memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height); for (idy = 0; idy < mi_height; idy += bh) { for (idx = 0; idx < mi_width; idx += bw) { @@ -4137,8 +4237,8 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x, const int max_blocks_wide = max_block_wide(xd, bsize, 0); mbmi->tx_type = tx_type; - mbmi->min_tx_size = TX_SIZES_ALL; inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, rd_stats_stack); + mbmi->min_tx_size = get_min_tx_size(mbmi->inter_tx_size[0][0]); if (rd_stats->rate == INT_MAX) return INT64_MAX; @@ -4350,7 +4450,8 @@ static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, #if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 if (x->skip_chroma_rd) return is_cost_valid; - bsize = AOMMAX(BLOCK_8X8, bsize); + bsize = scale_chroma_bsize(mbmi->sb_type, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y); #endif // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 #if CONFIG_EXT_TX && CONFIG_RECT_TX @@ -4426,6 +4527,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, int *skippable) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + assert(!is_inter_block(mbmi)); PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const BLOCK_SIZE bsize = mbmi->sb_type; int this_rate; @@ -4460,6 +4562,13 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, } #endif // CONFIG_HIGHBITDEPTH +#if CONFIG_PALETTE_DELTA_ENCODING + const MODE_INFO *above_mi = xd->above_mi; + const MODE_INFO *left_mi = xd->left_mi; + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(above_mi, left_mi, 1, color_cache); +#endif // CONFIG_PALETTE_DELTA_ENCODING + colors = colors_u > colors_v ? colors_u : colors_v; if (colors > 1 && colors <= 64) { int r, c, n, i, j; @@ -4524,6 +4633,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, } av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr); #if CONFIG_PALETTE_DELTA_ENCODING + optimize_palette_colors(color_cache, n_cache, n, 2, centroids); // Sort the U channel colors in ascending order. 
for (i = 0; i < 2 * (n - 1); i += 2) { int min_idx = i; @@ -4563,7 +4673,11 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, write_uniform_cost(n, color_map[0]) + av1_cost_bit( av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 1); - this_rate += av1_palette_color_cost_uv(pmi, cpi->common.bit_depth); + this_rate += av1_palette_color_cost_uv(pmi, +#if CONFIG_PALETTE_DELTA_ENCODING + color_cache, n_cache, +#endif // CONFIG_PALETTE_DELTA_ENCODING + cpi->common.bit_depth); for (i = 0; i < rows; ++i) { for (j = (i == 0 ? 1 : 0); j < cols; ++j) { int color_idx; @@ -4660,6 +4774,7 @@ static int64_t pick_intra_angle_routine_sbuv( int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats, int *best_angle_delta, int64_t *best_rd) { MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi; + assert(!is_inter_block(mbmi)); int this_rate; int64_t this_rd; RD_STATS tokenonly_rd_stats; @@ -4687,6 +4802,7 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + assert(!is_inter_block(mbmi)); int i, angle_delta, best_angle_delta = 0; int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; @@ -4736,12 +4852,23 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, } #endif // CONFIG_EXT_INTRA +static void init_sbuv_mode(MB_MODE_INFO *const mbmi) { + mbmi->uv_mode = DC_PRED; +#if CONFIG_PALETTE + mbmi->palette_mode_info.palette_size[1] = 0; +#endif // CONFIG_PALETTE +#if CONFIG_FILTER_INTRA + mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; +#endif // CONFIG_FILTER_INTRA +} + static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize, TX_SIZE max_tx_size) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + assert(!is_inter_block(mbmi)); MB_MODE_INFO best_mbmi = *mbmi; PREDICTION_MODE mode; int64_t best_rd = INT64_MAX, this_rd; @@ -4756,12 +4883,6 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, uint8_t *best_palette_color_map = NULL; #endif // CONFIG_PALETTE -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; -#endif // CONFIG_FILTER_INTRA -#if CONFIG_PALETTE - pmi->palette_size[1] = 0; -#endif // CONFIG_PALETTE for (mode = DC_PRED; mode <= TM_PRED; ++mode) { #if CONFIG_EXT_INTRA const int is_directional_mode = @@ -4858,12 +4979,12 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, // Use an estimated rd for uv_intra based on DC_PRED if the // appropriate speed flag is set. 
(void)ctx; + init_sbuv_mode(&x->e_mbd.mi[0]->mbmi); #if CONFIG_CB4X4 #if CONFIG_CHROMA_2X2 rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, bsize, max_tx_size); #else - max_tx_size = AOMMAX(max_tx_size, TX_4X4); if (x->skip_chroma_rd) { *rate_uv = 0; *rate_uv_tokenonly = 0; @@ -4893,7 +5014,6 @@ static int cost_mv_ref(const AV1_COMP *const cpi, PREDICTION_MODE mode, } #endif -#if CONFIG_REF_MV int mode_cost = 0; int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; int16_t is_all_zero_mv = mode_context & (1 << ALL_ZERO_FLAG_OFFSET); @@ -4924,13 +5044,9 @@ static int cost_mv_ref(const AV1_COMP *const cpi, PREDICTION_MODE mode, return mode_cost; } } -#else - assert(is_inter_mode(mode)); - return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)]; -#endif // CONFIG_REF_MV } -#if CONFIG_EXT_INTER +#if CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT) static int get_interinter_compound_type_bits(BLOCK_SIZE bsize, COMPOUND_TYPE comp_type) { (void)bsize; @@ -4945,304 +5061,7 @@ static int get_interinter_compound_type_bits(BLOCK_SIZE bsize, default: assert(0); return 0; } } -#endif // CONFIG_EXT_INTER - -static int set_and_cost_bmi_mvs( - const AV1_COMP *const cpi, MACROBLOCK *x, MACROBLOCKD *xd, int i, - PREDICTION_MODE mode, int_mv this_mv[2], - int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], - int_mv seg_mvs[TOTAL_REFS_PER_FRAME], -#if CONFIG_EXT_INTER - int_mv compound_seg_newmvs[2], -#endif // CONFIG_EXT_INTER - int_mv *best_ref_mv[2], const int *mvjcost, int *mvcost[2], int mi_row, - int mi_col) { - MODE_INFO *const mic = xd->mi[0]; - const MB_MODE_INFO *const mbmi = &mic->mbmi; - const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; - int thismvcost = 0; - int idx, idy; - const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; - const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; - const int is_compound = has_second_ref(mbmi); - int mode_ctx; - (void)mi_row; - (void)mi_col; - - switch (mode) { - case NEWMV: this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int; -#if CONFIG_EXT_INTER - if (!cpi->common.allow_high_precision_mv) - lower_mv_precision(&this_mv[0].as_mv, 0); -#endif // CONFIG_EXT_INTER - -#if CONFIG_REF_MV - for (idx = 0; idx < 1 + is_compound; ++idx) { - this_mv[idx] = seg_mvs[mbmi->ref_frame[idx]]; - av1_set_mvcost(x, mbmi->ref_frame[idx], idx, mbmi->ref_mv_idx); - thismvcost += - av1_mv_bit_cost(&this_mv[idx].as_mv, &best_ref_mv[idx]->as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT_SUB); - } - (void)mvjcost; - (void)mvcost; -#else - thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv, - mvjcost, mvcost, MV_COST_WEIGHT_SUB); -#if !CONFIG_EXT_INTER - if (is_compound) { - this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int; - thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv, - mvjcost, mvcost, MV_COST_WEIGHT_SUB); - } -#endif // !CONFIG_EXT_INTER -#endif // CONFIG_REF_MV - break; - case NEARMV: - case NEARESTMV: - this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int; - if (is_compound) - this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int; - break; - case ZEROMV: { - int ref; - for (ref = 0; ref < 1 + is_compound; ++ref) { -#if CONFIG_GLOBAL_MOTION - this_mv[ref].as_int = - gm_get_motion_vector( - &cpi->common.global_motion[mbmi->ref_frame[ref]], - cpi->common.allow_high_precision_mv, mbmi->sb_type, mi_col, - mi_row, i) - .as_int; -#else - this_mv[ref].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - } - break; - } -#if 
CONFIG_EXT_INTER - case NEW_NEWMV: - if (compound_seg_newmvs[0].as_int == INVALID_MV || - compound_seg_newmvs[1].as_int == INVALID_MV) { - this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int; - this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int; - } else { - this_mv[0].as_int = compound_seg_newmvs[0].as_int; - this_mv[1].as_int = compound_seg_newmvs[1].as_int; - } - if (!cpi->common.allow_high_precision_mv) - lower_mv_precision(&this_mv[0].as_mv, 0); - if (!cpi->common.allow_high_precision_mv) - lower_mv_precision(&this_mv[1].as_mv, 0); -#if CONFIG_REF_MV - av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx); -#endif - thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv, - mvjcost, mvcost, MV_COST_WEIGHT_SUB); -#if CONFIG_REF_MV - av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx); -#endif - thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv, - mvjcost, mvcost, MV_COST_WEIGHT_SUB); - break; - case NEW_NEARMV: - case NEW_NEARESTMV: - this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int; - if (!cpi->common.allow_high_precision_mv) - lower_mv_precision(&this_mv[0].as_mv, 0); -#if CONFIG_REF_MV - av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx); -#endif - thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv, - mvjcost, mvcost, MV_COST_WEIGHT_SUB); - this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int; - break; - case NEAR_NEWMV: - case NEAREST_NEWMV: - this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int; - this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int; - if (!cpi->common.allow_high_precision_mv) - lower_mv_precision(&this_mv[1].as_mv, 0); -#if CONFIG_REF_MV - av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx); -#endif - thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv, - mvjcost, mvcost, MV_COST_WEIGHT_SUB); - break; - case NEAREST_NEARMV: - case NEAR_NEARESTMV: - case NEAREST_NEARESTMV: - case NEAR_NEARMV: - this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int; - this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int; - break; - case ZERO_ZEROMV: -#if CONFIG_GLOBAL_MOTION - this_mv[0].as_int = - gm_get_motion_vector(&cpi->common.global_motion[mbmi->ref_frame[0]], - cpi->common.allow_high_precision_mv, - mbmi->sb_type, mi_col, mi_row, i) - .as_int; - this_mv[1].as_int = - gm_get_motion_vector(&cpi->common.global_motion[mbmi->ref_frame[1]], - cpi->common.allow_high_precision_mv, - mbmi->sb_type, mi_col, mi_row, i) - .as_int; -#else - this_mv[0].as_int = 0; - this_mv[1].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - break; -#endif // CONFIG_EXT_INTER - default: break; - } - - mic->bmi[i].as_mv[0].as_int = this_mv[0].as_int; - if (is_compound) mic->bmi[i].as_mv[1].as_int = this_mv[1].as_int; - - mic->bmi[i].as_mode = mode; - -#if CONFIG_REF_MV - if (mode == NEWMV) { - mic->bmi[i].pred_mv[0].as_int = - mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_int; - if (is_compound) - mic->bmi[i].pred_mv[1].as_int = - mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_int; - } else { - mic->bmi[i].pred_mv[0].as_int = this_mv[0].as_int; - if (is_compound) mic->bmi[i].pred_mv[1].as_int = this_mv[1].as_int; - } -#endif // CONFIG_REF_MV - - for (idy = 0; idy < num_4x4_blocks_high; ++idy) - for (idx = 0; idx < num_4x4_blocks_wide; ++idx) - memmove(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i])); - -#if CONFIG_REF_MV -#if CONFIG_EXT_INTER - if (is_compound) - mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; - else -#endif // 
CONFIG_EXT_INTER - mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, - mbmi->ref_frame, mbmi->sb_type, i); -#else // CONFIG_REF_MV - mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]]; -#endif // CONFIG_REF_MV - return cost_mv_ref(cpi, mode, mode_ctx) + thismvcost; -} - -static int64_t encode_inter_mb_segment_sub8x8( - const AV1_COMP *const cpi, MACROBLOCK *x, int64_t best_yrd, int i, - int *labelyrate, int64_t *distortion, int64_t *sse, ENTROPY_CONTEXT *ta, - ENTROPY_CONTEXT *tl, int ir, int ic, int mi_row, int mi_col) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *xd = &x->e_mbd; - struct macroblockd_plane *const pd = &xd->plane[0]; - struct macroblock_plane *const p = &x->plane[0]; - MODE_INFO *const mi = xd->mi[0]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd); - const int txb_width = max_block_wide(xd, plane_bsize, 0); - const int txb_height = max_block_high(xd, plane_bsize, 0); - const int width = block_size_wide[plane_bsize]; - const int height = block_size_high[plane_bsize]; - int idx, idy; - const uint8_t *const src = - &p->src.buf[av1_raster_block_offset(BLOCK_8X8, i, p->src.stride)]; - uint8_t *const dst = - &pd->dst.buf[av1_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)]; - int64_t thisdistortion = 0, thissse = 0; - int thisrate = 0; - TX_SIZE tx_size = mi->mbmi.tx_size; - TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i, tx_size); - const int num_4x4_w = tx_size_wide_unit[tx_size]; - const int num_4x4_h = tx_size_high_unit[tx_size]; -#if !CONFIG_PVQ - const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 1); -#else - (void)cpi; - (void)ta; - (void)tl; - (void)tx_type; -#endif // !CONFIG_PVQ - -#if CONFIG_EXT_TX && CONFIG_RECT_TX - assert(IMPLIES(xd->lossless[mi->mbmi.segment_id], tx_size == TX_4X4)); - assert(IMPLIES(!xd->lossless[mi->mbmi.segment_id], - tx_size == max_txsize_rect_lookup[mi->mbmi.sb_type])); -#else - assert(tx_size == TX_4X4); -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - - assert(tx_type == DCT_DCT); - - av1_build_inter_predictor_sub8x8(xd, 0, i, ir, ic, mi_row, mi_col); - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - aom_highbd_subtract_block( - height, width, av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), - 8, src, p->src.stride, dst, pd->dst.stride, xd->bd); - } else { - aom_subtract_block(height, width, - av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), - 8, src, p->src.stride, dst, pd->dst.stride); - } -#else - aom_subtract_block(height, width, - av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), - 8, src, p->src.stride, dst, pd->dst.stride); -#endif // CONFIG_HIGHBITDEPTH - - for (idy = 0; idy < txb_height; idy += num_4x4_h) { - for (idx = 0; idx < txb_width; idx += num_4x4_w) { - int64_t dist, ssz, rd, rd1, rd2; - int coeff_ctx; - const int k = i + (idy * 2 + idx); - const int block = av1_raster_order_to_block_index(tx_size, k); - assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4, - idx == 0 && idy == 0)); - coeff_ctx = combine_entropy_contexts(*(ta + (k & 1)), *(tl + (k >> 1))); - av1_xform_quant(cm, x, 0, block, idy + (i >> 1), idx + (i & 0x01), - BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - if (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0) - av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx); - av1_dist_block(cpi, x, 0, BLOCK_8X8, block, idy + (i >> 1), - idx + (i & 0x1), tx_size, &dist, &ssz, - OUTPUT_HAS_PREDICTED_PIXELS); - thisdistortion += dist; - thissse += ssz; -#if !CONFIG_PVQ - thisrate += - 
av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, (ta + (k & 1)), - (tl + (k >> 1)), cpi->sf.use_fast_coef_costing); - *(ta + (k & 1)) = !(p->eobs[block] == 0); - *(tl + (k >> 1)) = !(p->eobs[block] == 0); -#else - thisrate += x->rate; -#endif // !CONFIG_PVQ -#if CONFIG_EXT_TX - if (tx_size == TX_8X4) { - *(ta + (k & 1) + 1) = *(ta + (k & 1)); - } - if (tx_size == TX_4X8) { - *(tl + (k >> 1) + 1) = *(tl + (k >> 1)); - } -#endif // CONFIG_EXT_TX - rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion); - rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse); - rd = AOMMIN(rd1, rd2); - if (rd >= best_yrd) return INT64_MAX; - } - } - - *distortion = thisdistortion; - *labelyrate = thisrate; - *sse = thissse; - - return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion); -} +#endif // CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT) typedef struct { int eobs; @@ -5252,20 +5071,18 @@ typedef struct { int64_t bsse; int64_t brdcost; int_mv mvs[2]; -#if CONFIG_REF_MV int_mv pred_mv[2]; -#endif // CONFIG_REF_MV #if CONFIG_EXT_INTER int_mv ref_mv[2]; #endif // CONFIG_EXT_INTER -#if CONFIG_CB4X4 +#if CONFIG_CHROMA_2X2 ENTROPY_CONTEXT ta[4]; ENTROPY_CONTEXT tl[4]; #else ENTROPY_CONTEXT ta[2]; ENTROPY_CONTEXT tl[2]; -#endif // CONFIG_CB4X4 +#endif // CONFIG_CHROMA_2X2 } SEG_RDSTAT; typedef struct { @@ -5293,37 +5110,13 @@ static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) { (mv->col >> 3) > mv_limits->col_max; } -static INLINE void mi_buf_shift(MACROBLOCK *x, int i) { - MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi; - struct macroblock_plane *const p = &x->plane[0]; - struct macroblockd_plane *const pd = &x->e_mbd.plane[0]; - - p->src.buf = - &p->src.buf[av1_raster_block_offset(BLOCK_8X8, i, p->src.stride)]; - assert(((intptr_t)pd->pre[0].buf & 0x7) == 0); - pd->pre[0].buf = - &pd->pre[0].buf[av1_raster_block_offset(BLOCK_8X8, i, pd->pre[0].stride)]; - if (has_second_ref(mbmi)) - pd->pre[1].buf = - &pd->pre[1] - .buf[av1_raster_block_offset(BLOCK_8X8, i, pd->pre[1].stride)]; -} - -static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src, - struct buf_2d orig_pre[2]) { - MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi; - x->plane[0].src = orig_src; - x->e_mbd.plane[0].pre[0] = orig_pre[0]; - if (has_second_ref(mbmi)) x->e_mbd.plane[0].pre[1] = orig_pre[1]; -} - // Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way encode zero motion. 
// TODO(aconverse): Find out if this is still productive then clean up or remove static int check_best_zero_mv( const AV1_COMP *const cpi, const int16_t mode_context[TOTAL_REFS_PER_FRAME], -#if CONFIG_REF_MV && CONFIG_EXT_INTER +#if CONFIG_EXT_INTER const int16_t compound_mode_context[TOTAL_REFS_PER_FRAME], -#endif // CONFIG_REF_MV && CONFIG_EXT_INTER +#endif // CONFIG_EXT_INTER int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], int this_mode, const MV_REFERENCE_FRAME ref_frames[2], const BLOCK_SIZE bsize, int block, int mi_row, int mi_col) { @@ -5355,21 +5148,12 @@ static int check_best_zero_mv( frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int && (ref_frames[1] <= INTRA_FRAME || frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int)) { -#if CONFIG_REF_MV int16_t rfc = av1_mode_context_analyzer(mode_context, ref_frames, bsize, block); -#else - int16_t rfc = mode_context[ref_frames[0]]; -#endif // CONFIG_REF_MV int c1 = cost_mv_ref(cpi, NEARMV, rfc); int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); int c3 = cost_mv_ref(cpi, ZEROMV, rfc); -#if !CONFIG_REF_MV - (void)bsize; - (void)block; -#endif // !CONFIG_REF_MV - if (this_mode == NEARMV) { if (c1 > c3) return 0; } else if (this_mode == NEARESTMV) { @@ -5390,40 +5174,25 @@ static int check_best_zero_mv( } } #if CONFIG_EXT_INTER - else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAREST_NEARMV || - this_mode == NEAR_NEARESTMV || this_mode == NEAR_NEARMV || + else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV || this_mode == ZERO_ZEROMV) && frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int && frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int) { -#if CONFIG_REF_MV int16_t rfc = compound_mode_context[ref_frames[0]]; -#else - int16_t rfc = mode_context[ref_frames[0]]; -#endif // CONFIG_REF_MV - int c1 = cost_mv_ref(cpi, NEAREST_NEARMV, rfc); int c2 = cost_mv_ref(cpi, NEAREST_NEARESTMV, rfc); int c3 = cost_mv_ref(cpi, ZERO_ZEROMV, rfc); - int c4 = cost_mv_ref(cpi, NEAR_NEARESTMV, rfc); int c5 = cost_mv_ref(cpi, NEAR_NEARMV, rfc); - if (this_mode == NEAREST_NEARMV) { - if (c1 > c3) return 0; - } else if (this_mode == NEAREST_NEARESTMV) { + if (this_mode == NEAREST_NEARESTMV) { if (c2 > c3) return 0; - } else if (this_mode == NEAR_NEARESTMV) { - if (c4 > c3) return 0; } else if (this_mode == NEAR_NEARMV) { if (c5 > c3) return 0; } else { assert(this_mode == ZERO_ZEROMV); if ((c3 >= c2 && frame_mv[NEAREST_NEARESTMV][ref_frames[0]].as_int == 0 && frame_mv[NEAREST_NEARESTMV][ref_frames[1]].as_int == 0) || - (c3 >= c1 && frame_mv[NEAREST_NEARMV][ref_frames[0]].as_int == 0 && - frame_mv[NEAREST_NEARMV][ref_frames[1]].as_int == 0) || (c3 >= c5 && frame_mv[NEAR_NEARMV][ref_frames[0]].as_int == 0 && - frame_mv[NEAR_NEARMV][ref_frames[1]].as_int == 0) || - (c3 >= c4 && frame_mv[NEAR_NEARESTMV][ref_frames[0]].as_int == 0 && - frame_mv[NEAR_NEARESTMV][ref_frames[1]].as_int == 0)) + frame_mv[NEAR_NEARMV][ref_frames[1]].as_int == 0)) return 0; } } @@ -5435,7 +5204,8 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, int mi_col, #if CONFIG_EXT_INTER - int_mv *ref_mv_sub8x8[2], + int_mv *ref_mv_sub8x8[2], const uint8_t *mask, + int mask_stride, #endif // CONFIG_EXT_INTER int *rate_mv, const int block) { const AV1_COMMON *const cm = &cpi->common; @@ -5596,17 +5366,26 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, best_mv->col >>= 3; best_mv->row >>= 3; -#if CONFIG_REF_MV av1_set_mvcost(x, refs[id], id, 
mbmi->ref_mv_idx); -#endif // CONFIG_REF_MV // Small-range full-pixel motion search. bestsme = av1_refining_search_8p_c(x, sadpb, search_range, &cpi->fn_ptr[bsize], +#if CONFIG_EXT_INTER + mask, mask_stride, id, +#endif &ref_mv[id].as_mv, second_pred); - if (bestsme < INT_MAX) - bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv, - second_pred, &cpi->fn_ptr[bsize], 1); + if (bestsme < INT_MAX) { +#if CONFIG_EXT_INTER + if (mask) + bestsme = av1_get_mvpred_mask_var(x, best_mv, &ref_mv[id].as_mv, + second_pred, mask, mask_stride, id, + &cpi->fn_ptr[bsize], 1); + else +#endif + bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv, + second_pred, &cpi->fn_ptr[bsize], 1); + } x->mv_limits = tmp_mv_limits; @@ -5639,7 +5418,11 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, pw, ph, 1); + &dis, &sse, second_pred, +#if CONFIG_EXT_INTER + mask, mask_stride, id, +#endif + pw, ph, 1); // Restore the reference frames. pd->pre[0] = backup_pred; @@ -5649,7 +5432,11 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, pw, ph, 0); + &dis, &sse, second_pred, +#if CONFIG_EXT_INTER + mask, mask_stride, id, +#endif + pw, ph, 0); } } @@ -5673,9 +5460,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[ref] = backup_yv12[ref][i]; } -#if CONFIG_REF_MV av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx); -#endif // CONFIG_REF_MV #if CONFIG_EXT_INTER && !CONFIG_CB4X4 if (bsize >= BLOCK_8X8) #endif // CONFIG_EXT_INTER && !CONFIG_CB4X4 @@ -5691,947 +5476,6 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, } } -#if CONFIG_REF_MV && !CONFIG_EXT_INTER -static void update_mv_search_and_seg_mvs( - int *const run_mv_search, int_mv *const seg_mvs, int has_second_rf, - const MV_REFERENCE_FRAME *const ref_frame, - const SEG_RDSTAT *const ref_rdstat, int_mv *const bsi_ref_mv[2]) { - if (has_second_rf) { - if (seg_mvs[ref_frame[0]].as_int == ref_rdstat->mvs[0].as_int && - ref_rdstat->mvs[0].as_int != INVALID_MV) - if (bsi_ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int) - --*run_mv_search; - - if (seg_mvs[ref_frame[1]].as_int == ref_rdstat->mvs[1].as_int && - ref_rdstat->mvs[1].as_int != INVALID_MV) - if (bsi_ref_mv[1]->as_int == ref_rdstat->pred_mv[1].as_int) - --*run_mv_search; - } else { - if (bsi_ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int && - ref_rdstat->mvs[0].as_int != INVALID_MV) { - *run_mv_search = 0; - seg_mvs[ref_frame[0]].as_int = ref_rdstat->mvs[0].as_int; - } - } -} -#endif // CONFIG_REF_MV && !CONFIG_EXT_INTER - -static int64_t rd_pick_inter_best_sub8x8_mode( - const AV1_COMP *const cpi, MACROBLOCK *x, int_mv *best_ref_mv, - int_mv *second_best_ref_mv, int64_t best_rd, int *returntotrate, - int *returnyrate, int64_t *returndistortion, int *skippable, int64_t *psse, - int mvthresh, int_mv seg_mvs[4][TOTAL_REFS_PER_FRAME], -#if CONFIG_EXT_INTER - int_mv compound_seg_newmvs[4][2], -#endif // CONFIG_EXT_INTER - BEST_SEG_INFO *bsi_buf, int filter_idx, int mi_row, int mi_col) { - BEST_SEG_INFO *bsi = bsi_buf + filter_idx; -#if CONFIG_REF_MV - int_mv tmp_ref_mv[2]; -#endif // 
CONFIG_REF_MV - MACROBLOCKD *xd = &x->e_mbd; - MODE_INFO *mi = xd->mi[0]; - MB_MODE_INFO *mbmi = &mi->mbmi; - int mode_idx; - int k, br = 0, idx, idy; - int64_t bd = 0, block_sse = 0; - PREDICTION_MODE this_mode; - const AV1_COMMON *cm = &cpi->common; - struct macroblock_plane *const p = &x->plane[0]; - struct macroblockd_plane *const pd = &xd->plane[0]; - const int label_count = 4; - int64_t this_segment_rd = 0; - int label_mv_thresh; - int segmentyrate = 0; - const BLOCK_SIZE bsize = mbmi->sb_type; - const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; -#if CONFIG_CB4X4 - ENTROPY_CONTEXT t_above[4], t_left[4]; -#else - ENTROPY_CONTEXT t_above[2], t_left[2]; -#endif // CONFIG_CB4X4 - int subpelmv = 1, have_ref = 0; - const int has_second_rf = has_second_ref(mbmi); - const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize]; - MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; -#if CONFIG_PVQ - od_rollback_buffer pre_buf; - - od_encode_checkpoint(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ -#if CONFIG_EXT_TX && CONFIG_RECT_TX - mbmi->tx_size = - xd->lossless[mbmi->segment_id] ? TX_4X4 : max_txsize_rect_lookup[bsize]; -#else - mbmi->tx_size = TX_4X4; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX - - av1_zero(*bsi); - - bsi->segment_rd = best_rd; - bsi->ref_mv[0] = best_ref_mv; - bsi->ref_mv[1] = second_best_ref_mv; - bsi->mvp.as_int = best_ref_mv->as_int; - bsi->mvthresh = mvthresh; - - for (idx = 0; idx < 4; ++idx) bsi->modes[idx] = ZEROMV; - -#if CONFIG_REF_MV - for (idx = 0; idx < 4; ++idx) { - for (k = NEARESTMV; k <= NEWMV; ++k) { - bsi->rdstat[idx][INTER_OFFSET(k)].pred_mv[0].as_int = INVALID_MV; - bsi->rdstat[idx][INTER_OFFSET(k)].pred_mv[1].as_int = INVALID_MV; - - bsi->rdstat[idx][INTER_OFFSET(k)].mvs[0].as_int = INVALID_MV; - bsi->rdstat[idx][INTER_OFFSET(k)].mvs[1].as_int = INVALID_MV; - } - } -#endif // CONFIG_REF_MV - - memcpy(t_above, pd->above_context, sizeof(t_above)); - memcpy(t_left, pd->left_context, sizeof(t_left)); - - // 64 makes this threshold really big effectively - // making it so that we very rarely check mvs on - // segments. setting this to 1 would make mv thresh - // roughly equal to what it is for macroblocks - label_mv_thresh = 1 * bsi->mvthresh / label_count; - - // Segmentation method overheads - for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { - for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - // TODO(jingning,rbultje): rewrite the rate-distortion optimization - // loop for 4x4/4x8/8x4 block coding. 
to be replaced with new rd loop - int_mv mode_mv[MB_MODE_COUNT][2]; - int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; - PREDICTION_MODE mode_selected = ZEROMV; - int64_t new_best_rd = INT64_MAX; - const int index = idy * 2 + idx; - int ref; -#if CONFIG_REF_MV - CANDIDATE_MV ref_mv_stack[2][MAX_REF_MV_STACK_SIZE]; - uint8_t ref_mv_count[2]; -#endif // CONFIG_REF_MV -#if CONFIG_EXT_INTER - int_mv ref_mvs_sub8x8[2][2]; -#endif // CONFIG_EXT_INTER -#if CONFIG_PVQ - od_rollback_buffer idx_buf, post_buf; - od_encode_checkpoint(&x->daala_enc, &idx_buf); - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - - for (ref = 0; ref < 1 + has_second_rf; ++ref) { - const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; -#if CONFIG_EXT_INTER - int_mv mv_ref_list[MAX_MV_REF_CANDIDATES]; - av1_update_mv_context(cm, xd, mi, frame, mv_ref_list, index, mi_row, - mi_col, NULL); -#endif // CONFIG_EXT_INTER -#if CONFIG_GLOBAL_MOTION - frame_mv[ZEROMV][frame].as_int = - gm_get_motion_vector(&cm->global_motion[frame], - cm->allow_high_precision_mv, mbmi->sb_type, - mi_col, mi_row, index) - .as_int; -#else // CONFIG_GLOBAL_MOTION - frame_mv[ZEROMV][frame].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - av1_append_sub8x8_mvs_for_idx(cm, xd, index, ref, mi_row, mi_col, -#if CONFIG_REF_MV - ref_mv_stack[ref], &ref_mv_count[ref], -#endif // CONFIG_REF_MV -#if CONFIG_EXT_INTER - mv_ref_list, -#endif // CONFIG_EXT_INTER - &frame_mv[NEARESTMV][frame], - &frame_mv[NEARMV][frame]); - -#if CONFIG_REF_MV - tmp_ref_mv[ref] = frame_mv[NEARESTMV][mbmi->ref_frame[ref]]; - lower_mv_precision(&tmp_ref_mv[ref].as_mv, cm->allow_high_precision_mv); - bsi->ref_mv[ref] = &tmp_ref_mv[ref]; - mbmi_ext->ref_mvs[frame][0] = tmp_ref_mv[ref]; -#endif // CONFIG_REF_MV - -#if CONFIG_EXT_INTER - mv_ref_list[0].as_int = frame_mv[NEARESTMV][frame].as_int; - mv_ref_list[1].as_int = frame_mv[NEARMV][frame].as_int; - av1_find_best_ref_mvs(cm->allow_high_precision_mv, mv_ref_list, - &ref_mvs_sub8x8[0][ref], &ref_mvs_sub8x8[1][ref]); - - if (has_second_rf) { -#if CONFIG_GLOBAL_MOTION - frame_mv[ZERO_ZEROMV][frame].as_int = - gm_get_motion_vector(&cm->global_motion[frame], - cm->allow_high_precision_mv, mbmi->sb_type, - mi_col, mi_row, index) - .as_int; -#else - frame_mv[ZERO_ZEROMV][frame].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - frame_mv[NEAREST_NEARESTMV][frame].as_int = - frame_mv[NEARESTMV][frame].as_int; - - if (ref == 0) { - frame_mv[NEAREST_NEARMV][frame].as_int = - frame_mv[NEARESTMV][frame].as_int; - frame_mv[NEAR_NEARESTMV][frame].as_int = - frame_mv[NEARMV][frame].as_int; - frame_mv[NEAREST_NEWMV][frame].as_int = - frame_mv[NEARESTMV][frame].as_int; - frame_mv[NEAR_NEWMV][frame].as_int = frame_mv[NEARMV][frame].as_int; - frame_mv[NEAR_NEARMV][frame].as_int = - frame_mv[NEARMV][frame].as_int; - } else if (ref == 1) { - frame_mv[NEAREST_NEARMV][frame].as_int = - frame_mv[NEARMV][frame].as_int; - frame_mv[NEAR_NEARESTMV][frame].as_int = - frame_mv[NEARESTMV][frame].as_int; - frame_mv[NEW_NEARESTMV][frame].as_int = - frame_mv[NEARESTMV][frame].as_int; - frame_mv[NEW_NEARMV][frame].as_int = frame_mv[NEARMV][frame].as_int; - frame_mv[NEAR_NEARMV][frame].as_int = - frame_mv[NEARMV][frame].as_int; - } - } -#endif // CONFIG_EXT_INTER - } - -// search for the best motion vector on this segment -#if CONFIG_EXT_INTER - for (this_mode = (has_second_rf ? NEAREST_NEARESTMV : NEARESTMV); - this_mode <= (has_second_rf ? 
NEW_NEWMV : NEWMV); ++this_mode) -#else - for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) -#endif // CONFIG_EXT_INTER - { - const struct buf_2d orig_src = x->plane[0].src; - struct buf_2d orig_pre[2]; - // This flag controls if the motion estimation will kick off. When it - // is set to a non-zero value, the encoder will force motion estimation. - int run_mv_search = 0; - - mode_idx = INTER_OFFSET(this_mode); -#if CONFIG_EXT_INTER - for (ref = 0; ref < 1 + has_second_rf; ++ref) - bsi->ref_mv[ref]->as_int = ref_mvs_sub8x8[0][ref].as_int; -#endif // CONFIG_EXT_INTER - bsi->rdstat[index][mode_idx].brdcost = INT64_MAX; - if (!(inter_mode_mask & (1 << this_mode))) continue; - -#if CONFIG_REF_MV - run_mv_search = 2; -#if !CONFIG_EXT_INTER - if (filter_idx > 0 && this_mode == NEWMV) { - const BEST_SEG_INFO *ref_bsi = bsi_buf; - const SEG_RDSTAT *ref_rdstat = &ref_bsi->rdstat[index][mode_idx]; - - update_mv_search_and_seg_mvs(&run_mv_search, seg_mvs[index], - has_second_rf, mbmi->ref_frame, - ref_rdstat, bsi->ref_mv); - - if (run_mv_search != 0 && filter_idx > 1) { - ref_bsi = bsi_buf + 1; - ref_rdstat = &ref_bsi->rdstat[index][mode_idx]; - run_mv_search = 2; - update_mv_search_and_seg_mvs(&run_mv_search, seg_mvs[index], - has_second_rf, mbmi->ref_frame, - ref_rdstat, bsi->ref_mv); - } - } -#endif // !CONFIG_EXT_INTER -#endif // CONFIG_REF_MV - -#if CONFIG_GLOBAL_MOTION - if (cm->global_motion[mbmi->ref_frame[0]].wmtype == IDENTITY && - (!has_second_rf || - cm->global_motion[mbmi->ref_frame[1]].wmtype == IDENTITY)) -#endif // CONFIG_GLOBAL_MOTION - - if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, -#if CONFIG_REF_MV && CONFIG_EXT_INTER - mbmi_ext->compound_mode_context, -#endif // CONFIG_REF_MV && CONFIG_EXT_INTER - frame_mv, this_mode, mbmi->ref_frame, bsize, - index, mi_row, mi_col)) - continue; - - memcpy(orig_pre, pd->pre, sizeof(orig_pre)); - memcpy(bsi->rdstat[index][mode_idx].ta, t_above, - sizeof(bsi->rdstat[index][mode_idx].ta)); - memcpy(bsi->rdstat[index][mode_idx].tl, t_left, - sizeof(bsi->rdstat[index][mode_idx].tl)); -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &idx_buf); -#endif // CONFIG_PVQ - - // motion search for newmv (single predictor case only) - if (!has_second_rf && -#if CONFIG_EXT_INTER - have_newmv_in_inter_mode(this_mode) && - (seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV) -#else - this_mode == NEWMV && - (seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV || - run_mv_search) -#endif // CONFIG_EXT_INTER - ) { - int step_param = 0; - int bestsme = INT_MAX; - int sadpb = x->sadperbit4; - MV mvp_full; - int max_mv; - int cost_list[5]; - MvLimits tmp_mv_limits = x->mv_limits; - - /* Is the best so far sufficiently good that we cant justify doing - * and new motion search. */ - if (new_best_rd < label_mv_thresh) break; - -#if CONFIG_EXT_INTER - bsi->mvp.as_int = bsi->ref_mv[0]->as_int; -#else -// use previous block's result as next block's MV predictor. -#if !CONFIG_REF_MV - if (index > 0) { - bsi->mvp.as_int = mi->bmi[index - 1].as_mv[0].as_int; - if (index == 2) - bsi->mvp.as_int = mi->bmi[index - 2].as_mv[0].as_int; - } -#endif // !CONFIG_REF_MV -#endif // CONFIG_EXT_INTER - max_mv = (index == 0) ? (int)x->max_mv_context[mbmi->ref_frame[0]] - : AOMMAX(abs(bsi->mvp.as_mv.row), - abs(bsi->mvp.as_mv.col)) >> - 3; - - if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) { - // Take wtd average of the step_params based on the last frame's - // max mv magnitude and the best ref mvs of the current block for - // the given reference. 
- step_param = - (av1_init_search_range(max_mv) + cpi->mv_step_param) / 2; - } else { - step_param = cpi->mv_step_param; - } - -#if CONFIG_REF_MV - mvp_full.row = bsi->ref_mv[0]->as_mv.row >> 3; - mvp_full.col = bsi->ref_mv[0]->as_mv.col >> 3; -#else - mvp_full.row = bsi->mvp.as_mv.row >> 3; - mvp_full.col = bsi->mvp.as_mv.col >> 3; -#endif // CONFIG_REF_MV - - if (cpi->sf.adaptive_motion_search) { - mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3; - mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3; - step_param = AOMMAX(step_param, 8); - } - - // adjust src pointer for this block - mi_buf_shift(x, index); - - av1_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv); - - x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV; - -#if CONFIG_REF_MV - av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx); -#endif // CONFIG_REF_MV - bestsme = av1_full_pixel_search( - cpi, x, bsize, &mvp_full, step_param, sadpb, - cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL, - &bsi->ref_mv[0]->as_mv, INT_MAX, 1); - - x->mv_limits = tmp_mv_limits; - - if (bestsme < INT_MAX) { - int distortion; - if (cpi->sf.use_upsampled_references) { - int best_mv_var; - const int try_second = - x->second_best_mv.as_int != INVALID_MV && - x->second_best_mv.as_int != x->best_mv.as_int; - const int pw = block_size_wide[bsize]; - const int ph = block_size_high[bsize]; - // Use up-sampled reference frames. - struct buf_2d backup_pred = pd->pre[0]; - const YV12_BUFFER_CONFIG *upsampled_ref = - get_upsampled_ref(cpi, mbmi->ref_frame[0]); - - // Set pred for Y plane - setup_pred_plane( - &pd->pre[0], bsize, upsampled_ref->y_buffer, - upsampled_ref->y_crop_width, upsampled_ref->y_crop_height, - upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3), NULL, - pd->subsampling_x, pd->subsampling_y); - - // adjust pred pointer for this block - pd->pre[0].buf = - &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, index, - pd->pre[0].stride)) - << 3]; - - best_mv_var = cpi->find_fractional_mv_step( - x, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[bsize], - cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, - cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, - &distortion, &x->pred_sse[mbmi->ref_frame[0]], NULL, pw, ph, - 1); - - if (try_second) { - int this_var; - MV best_mv = x->best_mv.as_mv; - const MV ref_mv = bsi->ref_mv[0]->as_mv; - const int minc = - AOMMAX(x->mv_limits.col_min * 8, ref_mv.col - MV_MAX); - const int maxc = - AOMMIN(x->mv_limits.col_max * 8, ref_mv.col + MV_MAX); - const int minr = - AOMMAX(x->mv_limits.row_min * 8, ref_mv.row - MV_MAX); - const int maxr = - AOMMIN(x->mv_limits.row_max * 8, ref_mv.row + MV_MAX); - - x->best_mv = x->second_best_mv; - if (x->best_mv.as_mv.row * 8 <= maxr && - x->best_mv.as_mv.row * 8 >= minr && - x->best_mv.as_mv.col * 8 <= maxc && - x->best_mv.as_mv.col * 8 >= minc) { - this_var = cpi->find_fractional_mv_step( - x, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[bsize], - cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, - cond_cost_list(cpi, cost_list), x->nmvjointcost, - x->mvcost, &distortion, &x->pred_sse[mbmi->ref_frame[0]], - NULL, pw, ph, 1); - if (this_var < best_mv_var) best_mv = x->best_mv.as_mv; - x->best_mv.as_mv = best_mv; - } - } - - // Restore the reference frames. 
- pd->pre[0] = backup_pred; - } else { - cpi->find_fractional_mv_step( - x, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[bsize], - cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, - cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, - &distortion, &x->pred_sse[mbmi->ref_frame[0]], NULL, 0, 0, 0); - } - -// save motion search result for use in compound prediction -#if CONFIG_EXT_INTER - seg_mvs[index][mbmi->ref_frame[0]].as_mv = x->best_mv.as_mv; -#else - seg_mvs[index][mbmi->ref_frame[0]].as_mv = x->best_mv.as_mv; -#endif // CONFIG_EXT_INTER - } - - if (cpi->sf.adaptive_motion_search) - x->pred_mv[mbmi->ref_frame[0]] = x->best_mv.as_mv; - -#if CONFIG_EXT_INTER - mode_mv[this_mode][0] = x->best_mv; -#else - mode_mv[NEWMV][0] = x->best_mv; -#endif // CONFIG_EXT_INTER - - // restore src pointers - mi_buf_restore(x, orig_src, orig_pre); - } - - if (has_second_rf) { -#if CONFIG_EXT_INTER - if (seg_mvs[index][mbmi->ref_frame[1]].as_int == INVALID_MV || - seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV) -#else - if (seg_mvs[index][mbmi->ref_frame[1]].as_int == INVALID_MV || - seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV) -#endif // CONFIG_EXT_INTER - continue; - } - -#if CONFIG_DUAL_FILTER - (void)run_mv_search; -#endif // CONFIG_DUAL_FILTER - - if (has_second_rf && -#if CONFIG_EXT_INTER - this_mode == NEW_NEWMV && -#else - this_mode == NEWMV && -#endif // CONFIG_EXT_INTER -#if CONFIG_DUAL_FILTER - (mbmi->interp_filter[0] == EIGHTTAP_REGULAR || run_mv_search)) -#else - (mbmi->interp_filter == EIGHTTAP_REGULAR || run_mv_search)) -#endif // CONFIG_DUAL_FILTER - { - // adjust src pointers - mi_buf_shift(x, index); - if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { - int rate_mv; - frame_mv[this_mode][mbmi->ref_frame[0]].as_int = - seg_mvs[index][mbmi->ref_frame[0]].as_int; - frame_mv[this_mode][mbmi->ref_frame[1]].as_int = - seg_mvs[index][mbmi->ref_frame[1]].as_int; - joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row, - mi_col, -#if CONFIG_EXT_INTER - bsi->ref_mv, -#endif // CONFIG_EXT_INTER - &rate_mv, index); -#if CONFIG_EXT_INTER - compound_seg_newmvs[index][0].as_int = - frame_mv[this_mode][mbmi->ref_frame[0]].as_int; - compound_seg_newmvs[index][1].as_int = - frame_mv[this_mode][mbmi->ref_frame[1]].as_int; -#else - seg_mvs[index][mbmi->ref_frame[0]].as_int = - frame_mv[this_mode][mbmi->ref_frame[0]].as_int; - seg_mvs[index][mbmi->ref_frame[1]].as_int = - frame_mv[this_mode][mbmi->ref_frame[1]].as_int; -#endif // CONFIG_EXT_INTER - } - // restore src pointers - mi_buf_restore(x, orig_src, orig_pre); - } - - bsi->rdstat[index][mode_idx].brate = set_and_cost_bmi_mvs( - cpi, x, xd, index, this_mode, mode_mv[this_mode], frame_mv, - seg_mvs[index], -#if CONFIG_EXT_INTER - compound_seg_newmvs[index], -#endif // CONFIG_EXT_INTER - bsi->ref_mv, x->nmvjointcost, x->mvcost, mi_row, mi_col); - - for (ref = 0; ref < 1 + has_second_rf; ++ref) { - bsi->rdstat[index][mode_idx].mvs[ref].as_int = - mode_mv[this_mode][ref].as_int; - if (num_4x4_blocks_wide > 1) - bsi->rdstat[index + 1][mode_idx].mvs[ref].as_int = - mode_mv[this_mode][ref].as_int; - if (num_4x4_blocks_high > 1) - bsi->rdstat[index + 2][mode_idx].mvs[ref].as_int = - mode_mv[this_mode][ref].as_int; -#if CONFIG_REF_MV - bsi->rdstat[index][mode_idx].pred_mv[ref].as_int = - mi->bmi[index].pred_mv[ref].as_int; - if (num_4x4_blocks_wide > 1) - bsi->rdstat[index + 1][mode_idx].pred_mv[ref].as_int = - mi->bmi[index].pred_mv[ref].as_int; - if 
(num_4x4_blocks_high > 1) - bsi->rdstat[index + 2][mode_idx].pred_mv[ref].as_int = - mi->bmi[index].pred_mv[ref].as_int; -#endif // CONFIG_REF_MV -#if CONFIG_EXT_INTER - bsi->rdstat[index][mode_idx].ref_mv[ref].as_int = - bsi->ref_mv[ref]->as_int; - if (num_4x4_blocks_wide > 1) - bsi->rdstat[index + 1][mode_idx].ref_mv[ref].as_int = - bsi->ref_mv[ref]->as_int; - if (num_4x4_blocks_high > 1) - bsi->rdstat[index + 2][mode_idx].ref_mv[ref].as_int = - bsi->ref_mv[ref]->as_int; -#endif // CONFIG_EXT_INTER - } - - // Trap vectors that reach beyond the UMV borders - if (mv_check_bounds(&x->mv_limits, &mode_mv[this_mode][0].as_mv) || - (has_second_rf && - mv_check_bounds(&x->mv_limits, &mode_mv[this_mode][1].as_mv))) - continue; - - if (filter_idx > 0) { - BEST_SEG_INFO *ref_bsi = bsi_buf; - subpelmv = 0; - have_ref = 1; - - for (ref = 0; ref < 1 + has_second_rf; ++ref) { - subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv); -#if CONFIG_EXT_INTER - if (have_newmv_in_inter_mode(this_mode)) - have_ref &= - ((mode_mv[this_mode][ref].as_int == - ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int) && - (bsi->ref_mv[ref]->as_int == - ref_bsi->rdstat[index][mode_idx].ref_mv[ref].as_int)); - else -#endif // CONFIG_EXT_INTER - have_ref &= mode_mv[this_mode][ref].as_int == - ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int; - } - - have_ref &= ref_bsi->rdstat[index][mode_idx].brate > 0; - - if (filter_idx > 1 && !subpelmv && !have_ref) { - ref_bsi = bsi_buf + 1; - have_ref = 1; - for (ref = 0; ref < 1 + has_second_rf; ++ref) -#if CONFIG_EXT_INTER - if (have_newmv_in_inter_mode(this_mode)) - have_ref &= - ((mode_mv[this_mode][ref].as_int == - ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int) && - (bsi->ref_mv[ref]->as_int == - ref_bsi->rdstat[index][mode_idx].ref_mv[ref].as_int)); - else -#endif // CONFIG_EXT_INTER - have_ref &= mode_mv[this_mode][ref].as_int == - ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int; - - have_ref &= ref_bsi->rdstat[index][mode_idx].brate > 0; - } - - if (!subpelmv && have_ref && - ref_bsi->rdstat[index][mode_idx].brdcost < INT64_MAX) { -#if CONFIG_REF_MV - bsi->rdstat[index][mode_idx].byrate = - ref_bsi->rdstat[index][mode_idx].byrate; - bsi->rdstat[index][mode_idx].bdist = - ref_bsi->rdstat[index][mode_idx].bdist; - bsi->rdstat[index][mode_idx].bsse = - ref_bsi->rdstat[index][mode_idx].bsse; - bsi->rdstat[index][mode_idx].brate += - ref_bsi->rdstat[index][mode_idx].byrate; - bsi->rdstat[index][mode_idx].eobs = - ref_bsi->rdstat[index][mode_idx].eobs; - - bsi->rdstat[index][mode_idx].brdcost = - RDCOST(x->rdmult, x->rddiv, bsi->rdstat[index][mode_idx].brate, - bsi->rdstat[index][mode_idx].bdist); - - memcpy(bsi->rdstat[index][mode_idx].ta, - ref_bsi->rdstat[index][mode_idx].ta, - sizeof(bsi->rdstat[index][mode_idx].ta)); - memcpy(bsi->rdstat[index][mode_idx].tl, - ref_bsi->rdstat[index][mode_idx].tl, - sizeof(bsi->rdstat[index][mode_idx].tl)); -#else - memcpy(&bsi->rdstat[index][mode_idx], - &ref_bsi->rdstat[index][mode_idx], sizeof(SEG_RDSTAT)); -#endif // CONFIG_REF_MV - if (num_4x4_blocks_wide > 1) - bsi->rdstat[index + 1][mode_idx].eobs = - ref_bsi->rdstat[index + 1][mode_idx].eobs; - if (num_4x4_blocks_high > 1) - bsi->rdstat[index + 2][mode_idx].eobs = - ref_bsi->rdstat[index + 2][mode_idx].eobs; - - if (bsi->rdstat[index][mode_idx].brdcost < new_best_rd) { -#if CONFIG_REF_MV - // If the NEWMV mode is using the same motion vector as the - // NEARESTMV mode, skip the rest rate-distortion calculations - // and use the inferred motion vector modes. 
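/* Rationale (sketch): if the searched NEWMV lands exactly on the
 * NEARESTMV predictor, the same prediction is reachable through the
 * inferred mode for fewer bits (no explicit MV to code), so finishing
 * the NEWMV rate-distortion work cannot win. The check below compares
 * the candidate MV(s) against bsi->ref_mv[] and, on a match for every
 * active reference, skips ahead, e.g. in the single-reference case:
 *   if (rdstat->mvs[0].as_int == bsi->ref_mv[0]->as_int) continue;
 * where rdstat abbreviates &bsi->rdstat[index][mode_idx]. */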
- if (this_mode == NEWMV) { - if (has_second_rf) { - if (bsi->rdstat[index][mode_idx].mvs[0].as_int == - bsi->ref_mv[0]->as_int && - bsi->rdstat[index][mode_idx].mvs[1].as_int == - bsi->ref_mv[1]->as_int) - continue; - } else { - if (bsi->rdstat[index][mode_idx].mvs[0].as_int == - bsi->ref_mv[0]->as_int) - continue; - } - } -#endif // CONFIG_REF_MV - mode_selected = this_mode; - new_best_rd = bsi->rdstat[index][mode_idx].brdcost; -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - } - continue; - } - } - - bsi->rdstat[index][mode_idx].brdcost = encode_inter_mb_segment_sub8x8( - cpi, x, bsi->segment_rd - this_segment_rd, index, - &bsi->rdstat[index][mode_idx].byrate, - &bsi->rdstat[index][mode_idx].bdist, - &bsi->rdstat[index][mode_idx].bsse, bsi->rdstat[index][mode_idx].ta, - bsi->rdstat[index][mode_idx].tl, idy, idx, mi_row, mi_col); - - if (bsi->rdstat[index][mode_idx].brdcost < INT64_MAX) { - bsi->rdstat[index][mode_idx].brdcost += RDCOST( - x->rdmult, x->rddiv, bsi->rdstat[index][mode_idx].brate, 0); - bsi->rdstat[index][mode_idx].brate += - bsi->rdstat[index][mode_idx].byrate; - bsi->rdstat[index][mode_idx].eobs = p->eobs[index]; - if (num_4x4_blocks_wide > 1) - bsi->rdstat[index + 1][mode_idx].eobs = p->eobs[index + 1]; - if (num_4x4_blocks_high > 1) - bsi->rdstat[index + 2][mode_idx].eobs = p->eobs[index + 2]; - } - - if (bsi->rdstat[index][mode_idx].brdcost < new_best_rd) { -#if CONFIG_REF_MV - // If the NEWMV mode is using the same motion vector as the - // NEARESTMV mode, skip the rest rate-distortion calculations - // and use the inferred motion vector modes. - if (this_mode == NEWMV) { - if (has_second_rf) { - if (bsi->rdstat[index][mode_idx].mvs[0].as_int == - bsi->ref_mv[0]->as_int && - bsi->rdstat[index][mode_idx].mvs[1].as_int == - bsi->ref_mv[1]->as_int) - continue; - } else { - if (bsi->rdstat[index][mode_idx].mvs[0].as_int == - bsi->ref_mv[0]->as_int) - continue; - } - } -#endif // CONFIG_REF_MV - mode_selected = this_mode; - new_best_rd = bsi->rdstat[index][mode_idx].brdcost; - -#if CONFIG_PVQ - od_encode_checkpoint(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - } - } /*for each 4x4 mode*/ - - if (new_best_rd == INT64_MAX) { - int iy, midx; - for (iy = index + 1; iy < 4; ++iy) -#if CONFIG_EXT_INTER - for (midx = 0; midx < INTER_MODES + INTER_COMPOUND_MODES; ++midx) -#else - for (midx = 0; midx < INTER_MODES; ++midx) -#endif // CONFIG_EXT_INTER - bsi->rdstat[iy][midx].brdcost = INT64_MAX; - bsi->segment_rd = INT64_MAX; -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ - return INT64_MAX; - } - - mode_idx = INTER_OFFSET(mode_selected); - memcpy(t_above, bsi->rdstat[index][mode_idx].ta, sizeof(t_above)); - memcpy(t_left, bsi->rdstat[index][mode_idx].tl, sizeof(t_left)); -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &post_buf); -#endif // CONFIG_PVQ - -#if CONFIG_EXT_INTER - bsi->ref_mv[0]->as_int = bsi->rdstat[index][mode_idx].ref_mv[0].as_int; - if (has_second_rf) - bsi->ref_mv[1]->as_int = bsi->rdstat[index][mode_idx].ref_mv[1].as_int; -#endif // CONFIG_EXT_INTER - set_and_cost_bmi_mvs(cpi, x, xd, index, mode_selected, - mode_mv[mode_selected], frame_mv, seg_mvs[index], -#if CONFIG_EXT_INTER - compound_seg_newmvs[index], -#endif // CONFIG_EXT_INTER - bsi->ref_mv, x->nmvjointcost, x->mvcost, mi_row, - mi_col); - - br += bsi->rdstat[index][mode_idx].brate; - bd += bsi->rdstat[index][mode_idx].bdist; - block_sse += bsi->rdstat[index][mode_idx].bsse; - segmentyrate += 
bsi->rdstat[index][mode_idx].byrate; - this_segment_rd += bsi->rdstat[index][mode_idx].brdcost; - - if (this_segment_rd > bsi->segment_rd) { - int iy, midx; - for (iy = index + 1; iy < 4; ++iy) -#if CONFIG_EXT_INTER - for (midx = 0; midx < INTER_MODES + INTER_COMPOUND_MODES; ++midx) -#else - for (midx = 0; midx < INTER_MODES; ++midx) -#endif // CONFIG_EXT_INTER - bsi->rdstat[iy][midx].brdcost = INT64_MAX; - bsi->segment_rd = INT64_MAX; -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ - return INT64_MAX; - } - } - } /* for each label */ -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ - - bsi->r = br; - bsi->d = bd; - bsi->segment_yrate = segmentyrate; - bsi->segment_rd = this_segment_rd; - bsi->sse = block_sse; - - // update the coding decisions - for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode; - -#if CONFIG_DAALA_DIST - // Compute prediction (i.e. skip) and decoded distortion by daala-distortion. - { - const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; - uint8_t *src = p->src.buf; - uint8_t *dst = pd->dst.buf; - const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd); - const int use_activity_masking = 0; - const int qm = OD_HVS_QM; - const int bsw = block_size_wide[plane_bsize]; - const int bsh = block_size_high[plane_bsize]; - int64_t rd1, rd2; - int64_t daala_sse, daala_dist; - TX_SIZE tx_size = mbmi->tx_size; - -#if CONFIG_HIGHBITDEPTH - uint8_t *recon_8x8; - DECLARE_ALIGNED(16, uint16_t, recon16[8 * 8]); - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - recon_8x8 = CONVERT_TO_BYTEPTR(recon16); - else - recon_8x8 = (uint8_t *)recon16; -#else - DECLARE_ALIGNED(16, uint8_t, recon_8x8[8 * 8]); -#endif // CONFIG_HIGHBITDEPTH - -#if CONFIG_PVQ - use_activity_masking = x->daala_enc.use_activity_masking; -#endif // CONFIG_PVQ - - // For each of sub8x8 prediction block in a 8x8 block - for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { - for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - int i = idy * 2 + idx; - const uint8_t *const src_sub8x8 = - src + av1_raster_block_offset(BLOCK_8X8, i, p->src.stride); - uint8_t *const dst_sub8x8 = - dst + av1_raster_block_offset(BLOCK_8X8, i, pd->dst.stride); - uint8_t *recon_sub8x8 = recon_8x8 + (idy * 8 + idx) * 4; - const int txb_width = max_block_wide(xd, plane_bsize, 0); - const int txb_height = max_block_high(xd, plane_bsize, 0); - int idx_, idy_; - - av1_build_inter_predictor_sub8x8(xd, 0, i, idy, idx, mi_row, mi_col); -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - aom_highbd_subtract_block( - height, width, - av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, - src_sub8x8, p->src.stride, dst_sub8x8, pd->dst.stride, xd->bd); - } else { - aom_subtract_block( - height, width, - av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, - src_sub8x8, p->src.stride, dst_sub8x8, pd->dst.stride); - } -#else - aom_subtract_block( - bsh, bsw, av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), - 8, src_sub8x8, p->src.stride, dst_sub8x8, pd->dst.stride); -#endif // CONFIG_HIGHBITDEPTH - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - aom_highbd_convolve_copy(dst_sub8x8, dst_stride, recon_sub8x8, 8, - NULL, 0, NULL, 0, bsw, bsh, xd->bd); - } else { -#endif // CONFIG_HIGHBITDEPTH - aom_convolve_copy(dst_sub8x8, dst_stride, recon_sub8x8, 8, NULL, 0, - NULL, 0, bsw, bsh); -#if CONFIG_HIGHBITDEPTH - } -#endif // CONFIG_HIGHBITDEPTH - - // To 
get decoded pixels, do 4x4 xform and quant for each 4x4 block - // in a sub8x8 prediction block. In case remaining parts of - // sub8x8 inter mode rdo assume pd->dst stores predicted pixels, - // use local buffer to store decoded pixels. - for (idy_ = 0; idy_ < txb_height; idy_++) { - for (idx_ = 0; idx_ < txb_width; idx_++) { - int coeff_ctx = 0; - const tran_low_t *dqcoeff; - uint16_t eob; - const PLANE_TYPE plane_type = PLANE_TYPE_Y; - uint8_t *recon_4x4 = recon_sub8x8 + (idy_ * 8 + idx_) * 4; - const int block_raster_idx = (idy + idy_) * 2 + (idx + idx_); - const int block = - av1_raster_order_to_block_index(tx_size, block_raster_idx); - TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); - - dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - av1_xform_quant(cm, x, 0, block, idy + idy_, idx + idx_, BLOCK_8X8, - tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - if (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0) - av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx); - - eob = p->eobs[block]; - av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, - recon_4x4, 8, eob); - } - } - } - } - // Compute daala-distortion for a 8x8 block - daala_sse = av1_daala_dist(src, src_stride, pd->dst.buf, dst_stride, 8, 8, - qm, use_activity_masking, x->qindex) - << 4; - - daala_dist = av1_daala_dist(src, src_stride, recon_8x8, 8, 8, 8, qm, - use_activity_masking, x->qindex) - << 4; - - bsi->sse = daala_sse; - bsi->d = daala_dist; - - rd1 = RDCOST(x->rdmult, x->rddiv, bsi->r, bsi->d); - rd2 = RDCOST(x->rdmult, x->rddiv, 0, bsi->sse); - bsi->segment_rd = AOMMIN(rd1, rd2); - } -#endif // CONFIG_DAALA_DIST - - if (bsi->segment_rd > best_rd) return INT64_MAX; - /* set it to the best */ - for (idx = 0; idx < 4; idx++) { - mode_idx = INTER_OFFSET(bsi->modes[idx]); - mi->bmi[idx].as_mv[0].as_int = bsi->rdstat[idx][mode_idx].mvs[0].as_int; - if (has_second_ref(mbmi)) - mi->bmi[idx].as_mv[1].as_int = bsi->rdstat[idx][mode_idx].mvs[1].as_int; -#if CONFIG_REF_MV - mi->bmi[idx].pred_mv[0] = bsi->rdstat[idx][mode_idx].pred_mv[0]; - if (has_second_ref(mbmi)) - mi->bmi[idx].pred_mv[1] = bsi->rdstat[idx][mode_idx].pred_mv[1]; -#endif // CONFIG_REF_MV -#if CONFIG_EXT_INTER - mi->bmi[idx].ref_mv[0].as_int = bsi->rdstat[idx][mode_idx].ref_mv[0].as_int; - if (has_second_rf) - mi->bmi[idx].ref_mv[1].as_int = - bsi->rdstat[idx][mode_idx].ref_mv[1].as_int; -#endif // CONFIG_EXT_INTER - x->plane[0].eobs[idx] = bsi->rdstat[idx][mode_idx].eobs; - mi->bmi[idx].as_mode = bsi->modes[idx]; - } - - /* - * used to set mbmi->mv.as_int - */ - *returntotrate = bsi->r; - *returndistortion = bsi->d; - *returnyrate = bsi->segment_yrate; - *skippable = av1_is_skippable_in_plane(x, BLOCK_8X8, 0); - *psse = bsi->sse; - mbmi->mode = bsi->modes[3]; - - return bsi->segment_rd; -} - static void estimate_ref_frame_costs(const AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, unsigned int *ref_costs_single, @@ -6808,15 +5652,13 @@ static void setup_buffer_inter( av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); // Gets an initial list of candidate vectors from neighbours and orders them - av1_find_mv_refs( - cm, xd, mi, ref_frame, -#if CONFIG_REF_MV - &mbmi_ext->ref_mv_count[ref_frame], mbmi_ext->ref_mv_stack[ref_frame], + av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame], + mbmi_ext->ref_mv_stack[ref_frame], #if CONFIG_EXT_INTER - mbmi_ext->compound_mode_context, + mbmi_ext->compound_mode_context, #endif // CONFIG_EXT_INTER -#endif // CONFIG_REF_MV - candidates, mi_row, mi_col, NULL, NULL, 
mbmi_ext->mode_context); + candidates, mi_row, mi_col, NULL, NULL, + mbmi_ext->mode_context); // Candidate refinement carried out at encoder and decoder av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates, @@ -6882,9 +5724,7 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, av1_set_mv_search_range(&x->mv_limits, &ref_mv); -#if CONFIG_REF_MV av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); -#endif // CONFIG_REF_MV // Work out the size of the first step in the mv step search. // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc. @@ -6996,8 +5836,11 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, pw, ph, - 1); + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, +#if CONFIG_EXT_INTER + NULL, 0, 0, +#endif + pw, ph, 1); if (try_second) { const int minc = @@ -7021,7 +5864,11 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, - &dis, &x->pred_sse[ref], NULL, pw, ph, 1); + &dis, &x->pred_sse[ref], NULL, +#if CONFIG_EXT_INTER + NULL, 0, 0, +#endif + pw, ph, 1); if (this_var < best_mv_var) best_mv = x->best_mv.as_mv; x->best_mv.as_mv = best_mv; } @@ -7034,8 +5881,11 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0, - 0); + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, +#if CONFIG_EXT_INTER + NULL, 0, 0, +#endif + 0, 0, 0); } #if CONFIG_MOTION_VAR break; @@ -7077,131 +5927,287 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst) { } #if CONFIG_EXT_INTER -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE -static void do_masked_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, - const uint8_t *mask, int mask_stride, - BLOCK_SIZE bsize, int mi_row, int mi_col, - int_mv *tmp_mv, int *rate_mv, int ref_idx) { +static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, const MV *other_mv, + int mi_row, int mi_col, const int block, + int ref_idx, uint8_t *second_pred) { + const AV1_COMMON *const cm = &cpi->common; + const int pw = block_size_wide[bsize]; + const int ph = block_size_high[bsize]; MACROBLOCKD *xd = &x->e_mbd; - const AV1_COMMON *cm = &cpi->common; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; - int bestsme = INT_MAX; - int step_param; - int sadpb = x->sadperbit16; - MV mvp_full; - int ref = mbmi->ref_frame[ref_idx]; - MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv; - - MvLimits tmp_mv_limits = x->mv_limits; - - const YV12_BUFFER_CONFIG *scaled_ref_frame = - av1_get_scaled_ref_frame(cpi, ref); - int i; + const int other_ref = mbmi->ref_frame[!ref_idx]; +#if CONFIG_DUAL_FILTER + InterpFilter interp_filter[2] = { + (ref_idx == 0) ? mbmi->interp_filter[2] : mbmi->interp_filter[0], + (ref_idx == 0) ? 
mbmi->interp_filter[3] : mbmi->interp_filter[1] + }; +#else + const InterpFilter interp_filter = mbmi->interp_filter; +#endif // CONFIG_DUAL_FILTER + struct scale_factors sf; +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + struct macroblockd_plane *const pd = &xd->plane[0]; + // ic and ir are the 4x4 coordiantes of the sub8x8 at index "block" + const int ic = block & 1; + const int ir = (block - ic) >> 1; + const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic; + const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir; +#if CONFIG_GLOBAL_MOTION + WarpedMotionParams *const wm = &xd->global_motion[other_ref]; + int is_global = is_global_mv_block(xd->mi[0], block, wm->wmtype); +#endif // CONFIG_GLOBAL_MOTION +#else + (void)block; +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - MV pred_mv[3]; - pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv; - pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv; - pred_mv[2] = x->pred_mv[ref]; + // This function should only ever be called for compound modes + assert(has_second_ref(mbmi)); -#if CONFIG_REF_MV - av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); -#endif // CONFIG_REF_MV + struct buf_2d backup_yv12[MAX_MB_PLANE]; + const YV12_BUFFER_CONFIG *const scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, other_ref); if (scaled_ref_frame) { + int i; // Swap out the reference frame for a version that's been scaled to // match the resolution of the current frame, allowing the existing // motion search code to be used without additional modifications. for (i = 0; i < MAX_MB_PLANE; i++) - backup_yv12[i] = xd->plane[i].pre[ref_idx]; - - av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL); + backup_yv12[i] = xd->plane[i].pre[!ref_idx]; + av1_setup_pre_planes(xd, !ref_idx, scaled_ref_frame, mi_row, mi_col, NULL); } - av1_set_mv_search_range(&x->mv_limits, &ref_mv); +// Since we have scaled the reference frames to match the size of the current +// frame we must use a unit scaling factor during mode selection. +#if CONFIG_HIGHBITDEPTH + av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width, + cm->height, cm->use_highbitdepth); +#else + av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width, + cm->height); +#endif // CONFIG_HIGHBITDEPTH - // Work out the size of the first step in the mv step search. - // 0 here is maximum length first step. 1 is MAX >> 1 etc. - if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) { - // Take wtd average of the step_params based on the last frame's - // max mv magnitude and that based on the best ref mvs of the current - // block for the given reference. - step_param = - (av1_init_search_range(x->max_mv_context[ref]) + cpi->mv_step_param) / - 2; + struct buf_2d ref_yv12; + + const int plane = 0; + ConvolveParams conv_params = get_conv_params(0, plane); +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + WarpTypesAllowed warp_types; +#if CONFIG_GLOBAL_MOTION + warp_types.global_warp_allowed = is_global; +#endif // CONFIG_GLOBAL_MOTION +#if CONFIG_WARPED_MOTION + warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL; +#endif // CONFIG_WARPED_MOTION +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + + // Initialized here because of compiler problem in Visual Studio. + ref_yv12 = xd->plane[plane].pre[!ref_idx]; + +// Get the prediction block from the 'other' reference frame. 
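/* Flow (sketch): for a compound mode one MV is held fixed; this function
 * renders that fixed component once into second_pred, and
 * compound_single_motion_search then refines the other MV against the
 * source while each candidate is blended with second_pred (a plain
 * average, or through the wedge/segment mask when one is supplied).
 * The interinter wrapper further below amounts to:
 *   build_second_inter_pred(cpi, x, bsize, other_mv, ..., second_pred);
 *   compound_single_motion_search(cpi, x, bsize, this_mv, ...,
 *                                 second_pred, mask, mask_stride, ...);
 * i.e. exactly what compound_single_motion_search_interinter does. */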
+#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + av1_highbd_build_inter_predictor( + ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, + 0, interp_filter, +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + &warp_types, p_col, p_row, +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + plane, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd); } else { - step_param = cpi->mv_step_param; +#endif // CONFIG_HIGHBITDEPTH + av1_build_inter_predictor( + ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, + &conv_params, interp_filter, +#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + &warp_types, p_col, p_row, plane, !ref_idx, +#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd); +#if CONFIG_HIGHBITDEPTH } +#endif // CONFIG_HIGHBITDEPTH - // TODO(debargha): is show_frame needed here? - if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size && cm->show_frame) { - int boffset = - 2 * (b_width_log2_lookup[cm->sb_size] - - AOMMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize])); - step_param = AOMMAX(step_param, boffset); + if (scaled_ref_frame) { + // Restore the prediction frame pointers to their unscaled versions. + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[!ref_idx] = backup_yv12[i]; } +} - if (cpi->sf.adaptive_motion_search) { - int bwl = b_width_log2_lookup[bsize]; - int bhl = b_height_log2_lookup[bsize]; - int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4); +// Search for the best mv for one component of a compound, +// given that the other component is fixed. +static void compound_single_motion_search( + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *this_mv, + int mi_row, int mi_col, const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, int *rate_mv, const int block, int ref_idx) { + const int pw = block_size_wide[bsize]; + const int ph = block_size_high[bsize]; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const int ref = mbmi->ref_frame[ref_idx]; + int_mv ref_mv = x->mbmi_ext->ref_mvs[ref][0]; + struct macroblockd_plane *const pd = &xd->plane[0]; - if (tlevel < 5) step_param += 2; + struct buf_2d backup_yv12[MAX_MB_PLANE]; + const YV12_BUFFER_CONFIG *const scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); - // prev_mv_sad is not setup for dynamically scaled frames. - if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) { - for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) { - if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) { - x->pred_mv[ref].row = 0; - x->pred_mv[ref].col = 0; - tmp_mv->as_int = INVALID_MV; + // Check that this is either an interinter or an interintra block + assert(has_second_ref(mbmi) || + (ref_idx == 0 && mbmi->ref_frame[1] == INTRA_FRAME)); - if (scaled_ref_frame) { - int j; - for (j = 0; j < MAX_MB_PLANE; ++j) - xd->plane[j].pre[ref_idx] = backup_yv12[j]; - } - return; - } - } - } + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
+ for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[i] = xd->plane[i].pre[ref_idx]; + av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL); } - mvp_full = pred_mv[x->mv_best_ref_index[ref]]; + struct buf_2d orig_yv12; + int bestsme = INT_MAX; + int sadpb = x->sadperbit16; + MV *const best_mv = &x->best_mv.as_mv; + int search_range = 3; + + MvLimits tmp_mv_limits = x->mv_limits; - mvp_full.col >>= 3; - mvp_full.row >>= 3; + // Initialized here because of compiler problem in Visual Studio. + if (ref_idx) { + orig_yv12 = pd->pre[0]; + pd->pre[0] = pd->pre[ref_idx]; + } - bestsme = av1_masked_full_pixel_diamond( - cpi, x, mask, mask_stride, &mvp_full, step_param, sadpb, - MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv, - &tmp_mv->as_mv, ref_idx); + // Do compound motion search on the current reference frame. + av1_set_mv_search_range(&x->mv_limits, &ref_mv.as_mv); + + // Use the mv result from the single mode as mv predictor. + *best_mv = *this_mv; + + best_mv->col >>= 3; + best_mv->row >>= 3; + + av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); + + // Small-range full-pixel motion search. + bestsme = av1_refining_search_8p_c(x, sadpb, search_range, + &cpi->fn_ptr[bsize], mask, mask_stride, + ref_idx, &ref_mv.as_mv, second_pred); + if (bestsme < INT_MAX) { + if (mask) + bestsme = + av1_get_mvpred_mask_var(x, best_mv, &ref_mv.as_mv, second_pred, mask, + mask_stride, ref_idx, &cpi->fn_ptr[bsize], 1); + else + bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv.as_mv, second_pred, + &cpi->fn_ptr[bsize], 1); + } x->mv_limits = tmp_mv_limits; if (bestsme < INT_MAX) { int dis; /* TODO: use dis in distortion calculation later. */ - av1_find_best_masked_sub_pixel_tree_up( - cpi, x, mask, mask_stride, mi_row, mi_col, &tmp_mv->as_mv, &ref_mv, - cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], - cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], ref_idx, - cpi->sf.use_upsampled_references); - } - *rate_mv = av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, - x->mvcost, MV_COST_WEIGHT); + unsigned int sse; + if (cpi->sf.use_upsampled_references) { + // Use up-sampled reference frames. + struct buf_2d backup_pred = pd->pre[0]; + const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref); + + // Set pred for Y plane + setup_pred_plane(&pd->pre[0], bsize, upsampled_ref->y_buffer, + upsampled_ref->y_crop_width, + upsampled_ref->y_crop_height, upsampled_ref->y_stride, + (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x, + pd->subsampling_y); + +// If bsize < BLOCK_8X8, adjust pred pointer for this block +#if !CONFIG_CB4X4 + if (bsize < BLOCK_8X8) + pd->pre[0].buf = + &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, block, + pd->pre[0].stride)) + << 3]; +#endif // !CONFIG_CB4X4 + + bestsme = cpi->find_fractional_mv_step( + x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL, + x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, + mask_stride, ref_idx, pw, ph, 1); + + // Restore the reference frames. 
+ pd->pre[0] = backup_pred; + } else { + (void)block; + bestsme = cpi->find_fractional_mv_step( + x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL, + x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, + mask_stride, ref_idx, pw, ph, 0); + } + } + + // Restore the pointer to the first (possibly scaled) prediction buffer. + if (ref_idx) pd->pre[0] = orig_yv12; + + if (bestsme < INT_MAX) *this_mv = *best_mv; - if (cpi->sf.adaptive_motion_search && cm->show_frame) - x->pred_mv[ref] = tmp_mv->as_mv; + *rate_mv = 0; if (scaled_ref_frame) { + // Restore the prediction frame pointers to their unscaled versions. + int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[ref_idx] = backup_yv12[i]; } + + av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); + *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmvjointcost, + x->mvcost, MV_COST_WEIGHT); } +// Wrapper for compound_single_motion_search, for the common case +// where the second prediction is also an inter mode. +static void compound_single_motion_search_interinter( + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv, + int mi_row, int mi_col, const uint8_t *mask, int mask_stride, int *rate_mv, + const int block, int ref_idx) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + + // This function should only ever be called for compound modes + assert(has_second_ref(mbmi)); + +// Prediction buffer from second frame. +#if CONFIG_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]); + uint8_t *second_pred; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); + else + second_pred = (uint8_t *)second_pred_alloc_16; +#else + DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]); +#endif // CONFIG_HIGHBITDEPTH + + MV *this_mv = &frame_mv[mbmi->ref_frame[ref_idx]].as_mv; + const MV *other_mv = &frame_mv[mbmi->ref_frame[!ref_idx]].as_mv; + + build_second_inter_pred(cpi, x, bsize, other_mv, mi_row, mi_col, block, + ref_idx, second_pred); + + compound_single_motion_search(cpi, x, bsize, this_mv, mi_row, mi_col, + second_pred, mask, mask_stride, rate_mv, block, + ref_idx); +} + +#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE static void do_masked_motion_search_indexed( - const AV1_COMP *const cpi, MACROBLOCK *x, + const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv, int which) { // NOTE: which values: 0 - 0 only, 1 - 1 only, 2 - both @@ -7213,23 +6219,22 @@ static void do_masked_motion_search_indexed( mask = av1_get_compound_type_mask(comp_data, sb_type); - if (which == 0 || which == 2) - do_masked_motion_search(cpi, x, mask, mask_stride, bsize, mi_row, mi_col, - &tmp_mv[0], &rate_mv[0], 0); - - if (which == 1 || which == 2) { -// get the negative mask -#if CONFIG_COMPOUND_SEGMENT - uint8_t inv_mask_buf[2 * MAX_SB_SQUARE]; - const int h = block_size_high[bsize]; - mask = av1_get_compound_type_mask_inverse( - comp_data, inv_mask_buf, h, mask_stride, mask_stride, sb_type); -#else - mask = av1_get_compound_type_mask_inverse(comp_data, sb_type); -#endif // CONFIG_COMPOUND_SEGMENT - do_masked_motion_search(cpi, x, mask, mask_stride, bsize, mi_row, mi_col, - &tmp_mv[1], &rate_mv[1], 1); - } + int_mv frame_mv[TOTAL_REFS_PER_FRAME]; + MV_REFERENCE_FRAME rf[2] = { mbmi->ref_frame[0], 
mbmi->ref_frame[1] }; + assert(bsize >= BLOCK_8X8 || CONFIG_CB4X4); + + frame_mv[rf[0]].as_int = cur_mv[0].as_int; + frame_mv[rf[1]].as_int = cur_mv[1].as_int; + if (which == 0 || which == 1) { + compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, mi_row, + mi_col, mask, mask_stride, rate_mv, + 0, which); + } else if (which == 2) { + joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, mask, + mask_stride, rate_mv, 0); + } + tmp_mv[0].as_int = frame_mv[rf[0]].as_int; + tmp_mv[1].as_int = frame_mv[rf[1]].as_int; } #endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE #endif // CONFIG_EXT_INTER @@ -7275,7 +6280,7 @@ static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x, const int f_index = bsize - BLOCK_8X8; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; - uint32_t esq[2][4], var; + uint32_t esq[2][4]; int64_t tl, br; #if CONFIG_HIGHBITDEPTH @@ -7285,23 +6290,22 @@ static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x, } #endif // CONFIG_HIGHBITDEPTH - var = cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]); - var = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred0 + bw / 2, - stride0, &esq[0][1]); - var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride, - pred0 + bh / 2 * stride0, stride0, &esq[0][2]); - var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride, - pred0 + bh / 2 * stride0 + bw / 2, stride0, - &esq[0][3]); - var = cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]); - var = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred1 + bw / 2, - stride1, &esq[1][1]); - var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride, - pred1 + bh / 2 * stride1, stride0, &esq[1][2]); - var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride, - pred1 + bh / 2 * stride1 + bw / 2, stride0, - &esq[1][3]); - (void)var; + cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]); + cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred0 + bw / 2, stride0, + &esq[0][1]); + cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride, + pred0 + bh / 2 * stride0, stride0, &esq[0][2]); + cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride, + pred0 + bh / 2 * stride0 + bw / 2, stride0, + &esq[0][3]); + cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]); + cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred1 + bw / 2, stride1, + &esq[1][1]); + cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride, + pred1 + bh / 2 * stride1, stride0, &esq[1][2]); + cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride, + pred1 + bh / 2 * stride1 + bw / 2, stride0, + &esq[1][3]); tl = (int64_t)(esq[0][0] + esq[0][1] + esq[0][2]) - (int64_t)(esq[1][0] + esq[1][1] + esq[1][2]); @@ -7353,16 +6357,6 @@ static InterpFilter predict_interp_filter( single_filter[NEARESTMV][refs[1]]) best_filter = single_filter[NEARESTMV][refs[0]]; break; - case NEAREST_NEARMV: - if (single_filter[NEARESTMV][refs[0]] == - single_filter[NEARMV][refs[1]]) - best_filter = single_filter[NEARESTMV][refs[0]]; - break; - case NEAR_NEARESTMV: - if (single_filter[NEARMV][refs[0]] == - single_filter[NEARESTMV][refs[1]]) - best_filter = single_filter[NEARMV][refs[0]]; - break; case NEAR_NEARMV: if (single_filter[NEARMV][refs[0]] == single_filter[NEARMV][refs[1]]) best_filter = single_filter[NEARMV][refs[0]]; @@ -7575,6 +6569,7 @@ static int64_t pick_interinter_wedge(const AV1_COMP 
*const cpi, int wedge_sign = 0; assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + assert(cpi->common.allow_masked_compound); if (cpi->sf.fast_wedge_sign_estimate) { wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw); @@ -7688,6 +6683,7 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, int wedge_index = -1; assert(is_interintra_wedge_used(bsize)); + assert(cpi->common.allow_interintra_compound); rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, 0, &wedge_index); @@ -7715,15 +6711,13 @@ static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x, } } -static int interinter_compound_motion_search(const AV1_COMP *const cpi, - MACROBLOCK *x, - const BLOCK_SIZE bsize, - const int this_mode, int mi_row, - int mi_col) { +static int interinter_compound_motion_search( + const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, + const BLOCK_SIZE bsize, const int this_mode, int mi_row, int mi_col) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; int_mv tmp_mv[2]; - int rate_mvs[2], tmp_rate_mv = 0; + int tmp_rate_mv = 0; const INTERINTER_COMPOUND_DATA compound_data = { #if CONFIG_WEDGE mbmi->wedge_index, @@ -7736,20 +6730,17 @@ static int interinter_compound_motion_search(const AV1_COMP *const cpi, mbmi->interinter_compound_type }; if (this_mode == NEW_NEWMV) { - do_masked_motion_search_indexed(cpi, x, &compound_data, bsize, mi_row, - mi_col, tmp_mv, rate_mvs, 2); - tmp_rate_mv = rate_mvs[0] + rate_mvs[1]; + do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize, + mi_row, mi_col, tmp_mv, &tmp_rate_mv, 2); mbmi->mv[0].as_int = tmp_mv[0].as_int; mbmi->mv[1].as_int = tmp_mv[1].as_int; } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) { - do_masked_motion_search_indexed(cpi, x, &compound_data, bsize, mi_row, - mi_col, tmp_mv, rate_mvs, 0); - tmp_rate_mv = rate_mvs[0]; + do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize, + mi_row, mi_col, tmp_mv, &tmp_rate_mv, 0); mbmi->mv[0].as_int = tmp_mv[0].as_int; } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { - do_masked_motion_search_indexed(cpi, x, &compound_data, bsize, mi_row, - mi_col, tmp_mv, rate_mvs, 1); - tmp_rate_mv = rate_mvs[1]; + do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize, + mi_row, mi_col, tmp_mv, &tmp_rate_mv, 1); mbmi->mv[1].as_int = tmp_mv[1].as_int; } return tmp_rate_mv; @@ -7760,6 +6751,7 @@ static int64_t build_and_cost_compound_type( const BLOCK_SIZE bsize, const int this_mode, int rs2, int rate_mv, BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1, int *strides, int mi_row, int mi_col) { + const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; int rate_sum; @@ -7775,9 +6767,9 @@ static int64_t build_and_cost_compound_type( if (have_newmv_in_inter_mode(this_mode) && use_masked_motion_search(compound_type)) { - *out_rate_mv = interinter_compound_motion_search(cpi, x, bsize, this_mode, - mi_row, mi_col); - av1_build_inter_predictors_sby(xd, mi_row, mi_col, ctx, bsize); + *out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize, + this_mode, mi_row, mi_col); + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb); rd = RDCOST(x->rdmult, x->rddiv, rs2 + *out_rate_mv + rate_sum, dist_sum); @@ -7830,9 +6822,6 @@ typedef struct { // Pointer to 
array of motion vectors to use for each ref and their rates // Should point to first of 2 arrays in 2D array int *single_newmv_rate; - // Pointers costs of compound inter-intra and inter-inter predictions - int *compmode_interintra_cost; - int *compmode_interinter_cost; // Pointer to array of predicted rate-distortion // Should point to first of 2 arrays in 2D array int64_t (*modelled_rd)[TOTAL_REFS_PER_FRAME]; @@ -7872,14 +6861,12 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { - joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, - rate_mv, 0); + joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, NULL, + 0, rate_mv, 0); } else { *rate_mv = 0; for (i = 0; i < 2; ++i) { -#if CONFIG_REF_MV av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx); -#endif // CONFIG_REF_MV *rate_mv += av1_mv_bit_cost( &frame_mv[refs[i]].as_mv, &mbmi_ext->ref_mvs[refs[i]][0].as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); @@ -7887,21 +6874,31 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, } } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; -#if CONFIG_REF_MV - av1_set_mvcost(x, refs[1], 1, mbmi->ref_mv_idx); -#endif // CONFIG_REF_MV - *rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv, - &mbmi_ext->ref_mvs[refs[1]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { + frame_mv[refs[0]].as_int = + mode_mv[compound_ref0_mode(this_mode)][refs[0]].as_int; + compound_single_motion_search_interinter( + cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 1); + } else { + av1_set_mvcost(x, refs[1], 1, mbmi->ref_mv_idx); + *rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv, + &mbmi_ext->ref_mvs[refs[1]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + } } else { assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV); frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; -#if CONFIG_REF_MV - av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); -#endif // CONFIG_REF_MV - *rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv, - &mbmi_ext->ref_mvs[refs[0]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { + frame_mv[refs[1]].as_int = + mode_mv[compound_ref1_mode(this_mode)][refs[1]].as_int; + compound_single_motion_search_interinter( + cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 0); + } else { + av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); + *rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv, + &mbmi_ext->ref_mvs[refs[0]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + } } #else // Initialize mv using single prediction mode result. 
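In the paths above, the compound MVs are only re-searched when the block is at least cpi->sf.comp_inter_joint_search_thresh (jointly for NEW_NEWMV, one component at a time for the mixed NEW_* modes); otherwise the MVs carried over from the single-reference search are simply re-costed. Either way a refined MV is costed with av1_mv_bit_cost() against the best reference MV and folded into the usual rate-distortion comparison. A minimal sketch of that bookkeeping, assuming the rdopt.c context (sketch_rd_for_mv is a hypothetical helper; RDCOST and MV_COST_WEIGHT come from the encoder headers):

// Sketch: cost one candidate MV and fold it into an RD decision.
static int64_t sketch_rd_for_mv(MACROBLOCK *x, const MV *mv,
                                const MV *ref_mv, int rate_other,
                                int64_t dist) {
  // Rate of signalling mv relative to ref_mv, weighted by MV_COST_WEIGHT
  // before it joins the other rate terms.
  const int rate_mv = av1_mv_bit_cost(mv, ref_mv, x->nmvjointcost,
                                      x->mvcost, MV_COST_WEIGHT);
  // Lower is better; rdmult/rddiv set the lambda of the trade-off.
  return RDCOST(x->rdmult, x->rddiv, rate_other + rate_mv, dist);
}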
@@ -7913,9 +6910,7 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, } else { *rate_mv = 0; for (i = 0; i < 2; ++i) { -#if CONFIG_REF_MV av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx); -#endif // CONFIG_REF_MV *rate_mv += av1_mv_bit_cost(&frame_mv[refs[i]].as_mv, &mbmi_ext->ref_mvs[refs[i]][0].as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); @@ -7986,7 +6981,7 @@ int64_t interpolation_filter_search( set_default_interp_filters(mbmi, assign_filter); *switchable_rate = av1_get_switchable_rate(cpi, xd); - av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize); + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist, skip_txfm_sb, skip_sse_sb); *rd = RDCOST(x->rdmult, x->rddiv, *switchable_rate + tmp_rate, tmp_dist); @@ -8022,7 +7017,7 @@ int64_t interpolation_filter_search( mbmi->interp_filter = (InterpFilter)i; #endif // CONFIG_DUAL_FILTER tmp_rs = av1_get_switchable_rate(cpi, xd); - av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize); + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist, &tmp_skip_sb, &tmp_skip_sse); tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rs + tmp_rate, tmp_dist); @@ -8077,6 +7072,7 @@ static int64_t motion_mode_rd( int mi_col, HandleInterModeArgs *const args, const int64_t ref_best_rd, const int *refs, int rate_mv, #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + int_mv *const single_newmv, #if CONFIG_EXT_INTER int rate2_bmc_nocoeff, MB_MODE_INFO *best_bmc_mbmi, #if CONFIG_MOTION_VAR @@ -8183,10 +7179,10 @@ static int64_t motion_mode_rd( if (!has_subpel_mv_component(xd->mi[0], xd, 1)) mbmi->interp_filter[1] = EIGHTTAP_REGULAR; #endif // CONFIG_DUAL_FILTER - av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize); + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); #if CONFIG_EXT_INTER } else { - av1_build_inter_predictors_sb(xd, mi_row, mi_col, orig_dst, bsize); + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); #endif // CONFIG_EXT_INTER } av1_build_obmc_inter_prediction( @@ -8214,10 +7210,55 @@ static int64_t motion_mode_rd( : cm->interp_filter; #endif // CONFIG_DUAL_FILTER - if (find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, - mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, - &mbmi->wm_params[0], mi_row, mi_col) == 0) { - av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize); + if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, + mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, + &mbmi->wm_params[0], mi_row, mi_col)) { + // Refine MV for NEWMV mode + if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) { + int tmp_rate_mv = 0; + const int_mv mv0 = mbmi->mv[0]; + WarpedMotionParams wm_params0 = mbmi->wm_params[0]; + + // Refine MV in a small range. + av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts, pts_inref); + + // Keep the refined MV and WM parameters. 
+ if (mv0.as_int != mbmi->mv[0].as_int) { + const int ref = refs[0]; + const MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv; + + tmp_rate_mv = + av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv, x->nmvjointcost, + x->mvcost, MV_COST_WEIGHT); + + if (cpi->sf.adaptive_motion_search) + x->pred_mv[ref] = mbmi->mv[0].as_mv; + + single_newmv[ref] = mbmi->mv[0]; + + if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv, + refs[0])) { + tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); + } +#if CONFIG_EXT_INTER + tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv; +#else + tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; +#endif // CONFIG_EXT_INTER +#if CONFIG_DUAL_FILTER + if (!has_subpel_mv_component(xd->mi[0], xd, 0)) + mbmi->interp_filter[0] = EIGHTTAP_REGULAR; + if (!has_subpel_mv_component(xd->mi[0], xd, 1)) + mbmi->interp_filter[1] = EIGHTTAP_REGULAR; +#endif // CONFIG_DUAL_FILTER + } else { + // Restore the old MV and WM parameters. + mbmi->mv[0] = mv0; + mbmi->wm_params[0] = wm_params0; + } + } + + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist, skip_txfm_sb, skip_sse_sb); } else { @@ -8446,16 +7487,16 @@ static int64_t handle_inter_mode( int rate_mv = 0; #if CONFIG_EXT_INTER int pred_exists = 1; +#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT const int bw = block_size_wide[bsize]; +#endif // ONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT int_mv single_newmv[TOTAL_REFS_PER_FRAME]; #if CONFIG_INTERINTRA const unsigned int *const interintra_mode_cost = cpi->interintra_mode_cost[size_group_lookup[bsize]]; #endif // CONFIG_INTERINTRA const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME); -#if CONFIG_REF_MV uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); -#endif // CONFIG_REF_MV #else int_mv *const single_newmv = args->single_newmv; #endif // CONFIG_EXT_INTER @@ -8484,10 +7525,19 @@ static int64_t handle_inter_mode( int16_t mode_ctx; #if CONFIG_EXT_INTER - *args->compmode_interintra_cost = 0; +#if CONFIG_INTERINTRA + int compmode_interintra_cost = 0; mbmi->use_wedge_interintra = 0; - *args->compmode_interinter_cost = 0; +#endif +#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT + int compmode_interinter_cost = 0; mbmi->interinter_compound_type = COMPOUND_AVERAGE; +#endif + +#if CONFIG_INTERINTRA + if (!cm->allow_interintra_compound && is_comp_interintra_pred) + return INT64_MAX; +#endif // CONFIG_INTERINTRA // is_comp_interintra_pred implies !is_comp_pred assert(!is_comp_interintra_pred || (!is_comp_pred)); @@ -8495,7 +7545,6 @@ static int64_t handle_inter_mode( assert(!is_comp_interintra_pred || is_interintra_allowed(mbmi)); #endif // CONFIG_EXT_INTER -#if CONFIG_REF_MV #if CONFIG_EXT_INTER if (is_comp_pred) mode_ctx = mbmi_ext->compound_mode_context[refs[0]]; @@ -8503,9 +7552,6 @@ static int64_t handle_inter_mode( #endif // CONFIG_EXT_INTER mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame, bsize, -1); -#else // CONFIG_REF_MV - mode_ctx = mbmi_ext->mode_context[refs[0]]; -#endif // CONFIG_REF_MV #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) @@ -8545,7 +7591,6 @@ static int64_t handle_inter_mode( mbmi->mv[i].as_int = cur_mv[i].as_int; } -#if CONFIG_REF_MV #if CONFIG_EXT_INTER if (this_mode == NEAREST_NEARESTMV) #else @@ -8569,7 +7614,7 @@ static int64_t handle_inter_mode( #if CONFIG_EXT_INTER if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) { - if (this_mode == NEAREST_NEWMV || this_mode == 
NEAREST_NEARMV) { + if (this_mode == NEAREST_NEWMV) { cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv); @@ -8578,7 +7623,7 @@ static int64_t handle_inter_mode( mbmi->mv[0].as_int = cur_mv[0].as_int; } - if (this_mode == NEW_NEARESTMV || this_mode == NEAR_NEARESTMV) { + if (this_mode == NEW_NEARESTMV) { cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv; lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv); @@ -8590,8 +7635,7 @@ static int64_t handle_inter_mode( if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { int ref_mv_idx = mbmi->ref_mv_idx + 1; - if (this_mode == NEAR_NEWMV || this_mode == NEAR_NEARESTMV || - this_mode == NEAR_NEARMV) { + if (this_mode == NEAR_NEWMV || this_mode == NEAR_NEARMV) { cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv); @@ -8600,8 +7644,7 @@ static int64_t handle_inter_mode( mbmi->mv[0].as_int = cur_mv[0].as_int; } - if (this_mode == NEW_NEARMV || this_mode == NEAREST_NEARMV || - this_mode == NEAR_NEARMV) { + if (this_mode == NEW_NEARMV || this_mode == NEAR_NEARMV) { cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv); @@ -8626,7 +7669,6 @@ static int64_t handle_inter_mode( } } #endif // CONFIG_EXT_INTER -#endif // CONFIG_REF_MV // do first prediction into the destination buffer. Do the next // prediction into a temporary buffer. Then keep track of which one @@ -8659,7 +7701,7 @@ static int64_t handle_inter_mode( #else rd_stats->rate += AOMMIN(cost_mv_ref(cpi, this_mode, mode_ctx), cost_mv_ref(cpi, NEARESTMV, mode_ctx)); -#endif // CONFIG_REF_MV && CONFIG_EXT_INTER +#endif // CONFIG_EXT_INTER } else { rd_stats->rate += cost_mv_ref(cpi, this_mode, mode_ctx); } @@ -8688,6 +7730,7 @@ static int64_t handle_inter_mode( #endif // CONFIG_MOTION_VAR #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT if (is_comp_pred) { int rate_sum, rs2; int64_t dist_sum; @@ -8705,6 +7748,9 @@ static int64_t handle_inter_mode( int strides[1] = { bw }; int tmp_rate_mv; int masked_compound_used = is_any_masked_compound_used(bsize); +#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE + masked_compound_used = masked_compound_used && cm->allow_masked_compound; +#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE COMPOUND_TYPE cur_type; best_mv[0].as_int = cur_mv[0].as_int; @@ -8714,8 +7760,6 @@ static int64_t handle_inter_mode( uint8_t tmp_mask_buf[2 * MAX_SB_SQUARE]; best_compound_data.seg_mask = tmp_mask_buf; #endif // CONFIG_COMPOUND_SEGMENT - av1_cost_tokens(compound_type_cost, cm->fc->compound_type_prob[bsize], - av1_compound_type_tree); if (masked_compound_used) { av1_cost_tokens(compound_type_cost, cm->fc->compound_type_prob[bsize], @@ -8728,6 +7772,7 @@ static int64_t handle_inter_mode( } for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) { + if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break; if (!is_interinter_compound_used(cur_type, bsize)) break; tmp_rate_mv = rate_mv; best_rd_cur = INT64_MAX; @@ -8740,7 +7785,8 @@ static int64_t handle_inter_mode( switch (cur_type) { case COMPOUND_AVERAGE: - av1_build_inter_predictors_sby(xd, mi_row, mi_col, &orig_dst, bsize); + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, + bsize); av1_subtract_plane(x, bsize, 0); rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, 
&tmp_skip_txfm_sb, &tmp_skip_sse_sb, @@ -8830,13 +7876,14 @@ static int64_t handle_inter_mode( pred_exists = 0; - *args->compmode_interinter_cost = + compmode_interinter_cost = av1_cost_literal(get_interinter_compound_type_bits( bsize, mbmi->interinter_compound_type)) + (masked_compound_used ? compound_type_cost[mbmi->interinter_compound_type] : 0); } +#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT #if CONFIG_INTERINTRA if (is_comp_interintra_pred) { @@ -8863,7 +7910,7 @@ static int64_t handle_inter_mode( xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE; xd->plane[j].dst.stride = bw; } - av1_build_inter_predictors_sby(xd, mi_row, mi_col, &orig_dst, bsize); + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, bsize); restore_dst_buf(xd, orig_dst); mbmi->ref_frame[1] = INTRA_FRAME; mbmi->use_wedge_interintra = 0; @@ -8876,7 +7923,8 @@ static int64_t handle_inter_mode( av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb); - rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_mv + rate_sum, dist_sum); + rd = + RDCOST(x->rdmult, x->rddiv, tmp_rate_mv + rate_sum + rmode, dist_sum); if (rd < best_interintra_rd) { best_interintra_rd = rd; best_interintra_mode = mbmi->interintra_mode; @@ -8907,7 +7955,7 @@ static int64_t handle_inter_mode( if (rd != INT64_MAX) rd = RDCOST(x->rdmult, x->rddiv, rmode + rate_mv + rwedge + rate_sum, dist_sum); - best_interintra_rd_nowedge = rd; + best_interintra_rd_nowedge = best_interintra_rd; // Disable wedge search if source variance is small if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) { @@ -8926,17 +7974,18 @@ static int64_t handle_inter_mode( // get negative of mask const uint8_t *mask = av1_get_contiguous_soft_mask( mbmi->interintra_wedge_index, 1, bsize); - do_masked_motion_search(cpi, x, mask, bw, bsize, mi_row, mi_col, - &tmp_mv, &tmp_rate_mv, 0); + tmp_mv.as_int = x->mbmi_ext->ref_mvs[refs[0]][0].as_int; + compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row, + mi_col, intrapred, mask, bw, + &tmp_rate_mv, 0, 0); mbmi->mv[0].as_int = tmp_mv.as_int; - av1_build_inter_predictors_sby(xd, mi_row, mi_col, &orig_dst, bsize); + av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, + bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb); rd = RDCOST(x->rdmult, x->rddiv, rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum); - if (rd < best_interintra_rd_wedge) { - best_interintra_rd_wedge = rd; - } else { + if (rd >= best_interintra_rd_wedge) { tmp_mv.as_int = cur_mv[0].as_int; tmp_rate_mv = rate_mv; } @@ -8956,37 +8005,33 @@ static int64_t handle_inter_mode( best_interintra_rd_wedge = rd; if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { mbmi->use_wedge_interintra = 1; - best_interintra_rd = best_interintra_rd_wedge; mbmi->mv[0].as_int = tmp_mv.as_int; rd_stats->rate += tmp_rate_mv - rate_mv; rate_mv = tmp_rate_mv; } else { mbmi->use_wedge_interintra = 0; - best_interintra_rd = best_interintra_rd_nowedge; mbmi->mv[0].as_int = cur_mv[0].as_int; } } else { mbmi->use_wedge_interintra = 0; - best_interintra_rd = best_interintra_rd_nowedge; } } #endif // CONFIG_WEDGE pred_exists = 0; - *args->compmode_interintra_cost = - av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1); - *args->compmode_interintra_cost += + compmode_interintra_cost = + av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1) + 
interintra_mode_cost[mbmi->interintra_mode]; if (is_interintra_wedge_used(bsize)) { - *args->compmode_interintra_cost += av1_cost_bit( + compmode_interintra_cost += av1_cost_bit( cm->fc->wedge_interintra_prob[bsize], mbmi->use_wedge_interintra); if (mbmi->use_wedge_interintra) { - *args->compmode_interintra_cost += + compmode_interintra_cost += av1_cost_literal(get_interintra_wedge_bits(bsize)); } } } else if (is_interintra_allowed(mbmi)) { - *args->compmode_interintra_cost = + compmode_interintra_cost = av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 0); } #endif // CONFIG_INTERINTRA @@ -8994,7 +8039,7 @@ static int64_t handle_inter_mode( if (pred_exists == 0) { int tmp_rate; int64_t tmp_dist; - av1_build_inter_predictors_sb(xd, mi_row, mi_col, &orig_dst, bsize); + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist, &skip_txfm_sb, &skip_sse_sb); rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); @@ -9034,10 +8079,23 @@ static int64_t handle_inter_mode( } } +#if CONFIG_EXT_INTER +#if CONFIG_INTERINTRA + rd_stats->rate += compmode_interintra_cost; +#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + rate2_bmc_nocoeff += compmode_interintra_cost; +#endif +#endif +#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT + rd_stats->rate += compmode_interinter_cost; +#endif +#endif + ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip, mode_mv, mi_row, mi_col, args, ref_best_rd, refs, rate_mv, #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + single_newmv, #if CONFIG_EXT_INTER rate2_bmc_nocoeff, &best_bmc_mbmi, #if CONFIG_MOTION_VAR @@ -9060,34 +8118,36 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; const TileInfo *tile = &xd->tile; +#if CONFIG_EC_ADAPT + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; +#else + FRAME_CONTEXT *const ec_ctx = cm->fc; +#endif // CONFIG_EC_ADAPT MODE_INFO *const mi = xd->mi[0]; const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE); const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE); const int w = block_size_wide[bsize]; const int h = block_size_high[bsize]; const int sb_row = mi_row / MAX_MIB_SIZE; + const int sb_col = mi_col / MAX_MIB_SIZE; - int_mv dv_ref; - av1_find_ref_dv(&dv_ref, mi_row, mi_col); - - const MvLimits tmp_mv_limits = x->mv_limits; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + MV_REFERENCE_FRAME ref_frame = INTRA_FRAME; + int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; + av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame], + mbmi_ext->ref_mv_stack[ref_frame], +#if CONFIG_EXT_INTER + mbmi_ext->compound_mode_context, +#endif // CONFIG_EXT_INTER + candidates, mi_row, mi_col, NULL, NULL, + mbmi_ext->mode_context); - // TODO(aconverse@google.com): Handle same row DV. 
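/*
 * Illustrative sketch (not part of the patch) of the intra-block-copy
 * search-range setup that the rewritten rd_pick_intrabc_mode_sb() below
 * applies per direction: the displacement vector may only point into
 * already-reconstructed pixels, so the limits are clamped either to the
 * superblock rows above the current block or to the columns on its left.
 * The types and the helper are reduced, hypothetical stand-ins; the
 * arithmetic mirrors the IBC_MOTION_ABOVE / IBC_MOTION_LEFT cases in the
 * diff, with MI_SIZE and MAX_MIB_SIZE passed in rather than assumed.
 */
typedef struct { int mi_row_start, mi_row_end, mi_col_start, mi_col_end; } SketchTile;
typedef struct { int col_min, col_max, row_min, row_max; } SketchMvLimits;

static int sketch_imin(int a, int b) { return a < b ? a : b; }

static void sketch_ibc_limits(int dir_above, const SketchTile *tile,
                              int mi_row, int mi_col, int w, int h,
                              int sb_row, int sb_col, int mi_size,
                              int max_mib_size, SketchMvLimits *lim) {
  lim->col_min = (tile->mi_col_start - mi_col) * mi_size;
  lim->row_min = (tile->mi_row_start - mi_row) * mi_size;
  if (dir_above) {
    /* Full tile width, but only superblock rows strictly above this one. */
    lim->col_max = (tile->mi_col_end - mi_col) * mi_size - w;
    lim->row_max = (sb_row * max_mib_size - mi_row) * mi_size - h;
  } else {
    /* Columns strictly to the left, down to the bottom of the coded area. */
    lim->col_max = (sb_col * max_mib_size - mi_col) * mi_size - w;
    lim->row_max =
        (sketch_imin((sb_row + 1) * max_mib_size, tile->mi_row_end) - mi_row) *
            mi_size - h;
  }
}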
- x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE; - x->mv_limits.col_max = (tile->mi_col_end - mi_col) * MI_SIZE - w; - x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; - x->mv_limits.row_max = (sb_row * MAX_MIB_SIZE - mi_row) * MI_SIZE - h; - assert(x->mv_limits.col_min >= tmp_mv_limits.col_min); - assert(x->mv_limits.col_max <= tmp_mv_limits.col_max); - assert(x->mv_limits.row_min >= tmp_mv_limits.row_min); - assert(x->mv_limits.row_max <= tmp_mv_limits.row_max); - av1_set_mv_search_range(&x->mv_limits, &dv_ref.as_mv); + int_mv nearestmv, nearmv; + av1_find_best_ref_mvs(0, candidates, &nearestmv, &nearmv); - if (x->mv_limits.col_max < x->mv_limits.col_min || - x->mv_limits.row_max < x->mv_limits.row_min) { - x->mv_limits = tmp_mv_limits; - return INT64_MAX; - } + int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv; + if (dv_ref.as_int == 0) av1_find_ref_dv(&dv_ref, mi_row, mi_col); + mbmi_ext->ref_mvs[INTRA_FRAME][0] = dv_ref; struct buf_2d yv12_mb[MAX_MB_PLANE]; av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, mi_row, mi_col, NULL, NULL); @@ -9095,86 +8155,140 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, xd->plane[i].pre[0] = yv12_mb[i]; } - int step_param = cpi->mv_step_param; - MV mvp_full = dv_ref.as_mv; - mvp_full.col >>= 3; - mvp_full.row >>= 3; - int sadpb = x->sadperbit16; - int cost_list[5]; - int bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, - sadpb, cond_cost_list(cpi, cost_list), - &dv_ref.as_mv, INT_MAX, 1); + enum IntrabcMotionDirection { + IBC_MOTION_ABOVE, + IBC_MOTION_LEFT, + IBC_MOTION_DIRECTIONS + }; - x->mv_limits = tmp_mv_limits; - if (bestsme == INT_MAX) return INT64_MAX; - mvp_full = x->best_mv.as_mv; - MV dv = {.row = mvp_full.row * 8, .col = mvp_full.col * 8 }; - if (mv_check_bounds(&x->mv_limits, &dv)) return INT64_MAX; - if (!is_dv_valid(dv, tile, mi_row, mi_col, bsize)) return INT64_MAX; MB_MODE_INFO *mbmi = &mi->mbmi; MB_MODE_INFO best_mbmi = *mbmi; RD_STATS best_rdcost = *rd_cost; int best_skip = x->skip; + + for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE; + dir < IBC_MOTION_DIRECTIONS; ++dir) { + const MvLimits tmp_mv_limits = x->mv_limits; + switch (dir) { + case IBC_MOTION_ABOVE: + x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE; + x->mv_limits.col_max = (tile->mi_col_end - mi_col) * MI_SIZE - w; + x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; + x->mv_limits.row_max = (sb_row * MAX_MIB_SIZE - mi_row) * MI_SIZE - h; + break; + case IBC_MOTION_LEFT: + x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE; + x->mv_limits.col_max = (sb_col * MAX_MIB_SIZE - mi_col) * MI_SIZE - w; + // TODO(aconverse@google.com): Minimize the overlap between above and + // left areas. 
+ x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; + int bottom_coded_mi_edge = + AOMMIN((sb_row + 1) * MAX_MIB_SIZE, tile->mi_row_end); + x->mv_limits.row_max = (bottom_coded_mi_edge - mi_row) * MI_SIZE - h; + break; + default: assert(0); + } + assert(x->mv_limits.col_min >= tmp_mv_limits.col_min); + assert(x->mv_limits.col_max <= tmp_mv_limits.col_max); + assert(x->mv_limits.row_min >= tmp_mv_limits.row_min); + assert(x->mv_limits.row_max <= tmp_mv_limits.row_max); + av1_set_mv_search_range(&x->mv_limits, &dv_ref.as_mv); + + if (x->mv_limits.col_max < x->mv_limits.col_min || + x->mv_limits.row_max < x->mv_limits.row_min) { + x->mv_limits = tmp_mv_limits; + continue; + } + + int step_param = cpi->mv_step_param; + MV mvp_full = dv_ref.as_mv; + mvp_full.col >>= 3; + mvp_full.row >>= 3; + int sadpb = x->sadperbit16; + int cost_list[5]; + int bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, + sadpb, cond_cost_list(cpi, cost_list), + &dv_ref.as_mv, INT_MAX, 1); + + x->mv_limits = tmp_mv_limits; + if (bestsme == INT_MAX) continue; + mvp_full = x->best_mv.as_mv; + MV dv = {.row = mvp_full.row * 8, .col = mvp_full.col * 8 }; + if (mv_check_bounds(&x->mv_limits, &dv)) continue; + if (!is_dv_valid(dv, tile, mi_row, mi_col, bsize)) continue; + #if CONFIG_PALETTE - memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info)); + memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info)); #endif - mbmi->use_intrabc = 1; - mbmi->mode = DC_PRED; - mbmi->uv_mode = DC_PRED; - mbmi->mv[0].as_mv = dv; + mbmi->use_intrabc = 1; + mbmi->mode = DC_PRED; + mbmi->uv_mode = DC_PRED; + mbmi->mv[0].as_mv = dv; #if CONFIG_DUAL_FILTER - for (int idx = 0; idx < 4; ++idx) mbmi->interp_filter[idx] = BILINEAR; + for (int idx = 0; idx < 4; ++idx) mbmi->interp_filter[idx] = BILINEAR; #else - mbmi->interp_filter = BILINEAR; + mbmi->interp_filter = BILINEAR; #endif - mbmi->skip = 0; - x->skip = 0; - av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize); - - int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, x->nmvjointcost, x->mvcost, - MV_COST_WEIGHT); - const PREDICTION_MODE A = av1_above_block_mode(mi, xd->above_mi, 0); - const PREDICTION_MODE L = av1_left_block_mode(mi, xd->left_mi, 0); - const int rate_mode = - cpi->y_mode_costs[A][L][DC_PRED] + av1_cost_bit(INTRABC_PROB, 1); - - RD_STATS rd_stats, rd_stats_uv; - av1_subtract_plane(x, bsize, 0); - super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX); - super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); - av1_merge_rd_stats(&rd_stats, &rd_stats_uv); + mbmi->skip = 0; + x->skip = 0; + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); + + int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, x->nmvjointcost, + x->mvcost, MV_COST_WEIGHT); + const PREDICTION_MODE A = av1_above_block_mode(mi, xd->above_mi, 0); + const PREDICTION_MODE L = av1_left_block_mode(mi, xd->left_mi, 0); + const int rate_mode = cpi->y_mode_costs[A][L][DC_PRED] + + av1_cost_bit(ec_ctx->intrabc_prob, 1); + + RD_STATS rd_stats, rd_stats_uv; + av1_subtract_plane(x, bsize, 0); + super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX); + super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); + av1_merge_rd_stats(&rd_stats, &rd_stats_uv); #if CONFIG_RD_DEBUG - mbmi->rd_stats = rd_stats; + mbmi->rd_stats = rd_stats; #endif - const aom_prob skip_prob = av1_get_skip_prob(cm, xd); - - RD_STATS rdc_noskip; - av1_init_rd_stats(&rdc_noskip); - rdc_noskip.rate = - rate_mode + rate_mv + rd_stats.rate + av1_cost_bit(skip_prob, 0); - 
rdc_noskip.dist = rd_stats.dist; - rdc_noskip.rdcost = - RDCOST(x->rdmult, x->rddiv, rdc_noskip.rate, rdc_noskip.dist); - if (rdc_noskip.rdcost < best_rd) { - best_rd = rdc_noskip.rdcost; - best_mbmi = *mbmi; - best_skip = x->skip; - best_rdcost = rdc_noskip; - } +#if CONFIG_VAR_TX + // TODO(aconverse@google.com): Evaluate allowing VAR TX on intrabc blocks + const int width = block_size_wide[bsize] >> tx_size_wide_log2[0]; + const int height = block_size_high[bsize] >> tx_size_high_log2[0]; + int idx, idy; + for (idy = 0; idy < height; ++idy) + for (idx = 0; idx < width; ++idx) + mbmi->inter_tx_size[idy >> 1][idx >> 1] = mbmi->tx_size; + mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); +#endif // CONFIG_VAR_TX - x->skip = 1; - mbmi->skip = 1; - RD_STATS rdc_skip; - av1_init_rd_stats(&rdc_skip); - rdc_skip.rate = rate_mode + rate_mv + av1_cost_bit(skip_prob, 1); - rdc_skip.dist = rd_stats.sse; - rdc_skip.rdcost = RDCOST(x->rdmult, x->rddiv, rdc_skip.rate, rdc_skip.dist); - if (rdc_skip.rdcost < best_rd) { - best_rd = rdc_skip.rdcost; - best_mbmi = *mbmi; - best_skip = x->skip; - best_rdcost = rdc_skip; + const aom_prob skip_prob = av1_get_skip_prob(cm, xd); + + RD_STATS rdc_noskip; + av1_init_rd_stats(&rdc_noskip); + rdc_noskip.rate = + rate_mode + rate_mv + rd_stats.rate + av1_cost_bit(skip_prob, 0); + rdc_noskip.dist = rd_stats.dist; + rdc_noskip.rdcost = + RDCOST(x->rdmult, x->rddiv, rdc_noskip.rate, rdc_noskip.dist); + if (rdc_noskip.rdcost < best_rd) { + best_rd = rdc_noskip.rdcost; + best_mbmi = *mbmi; + best_skip = x->skip; + best_rdcost = rdc_noskip; + } + + x->skip = 1; + mbmi->skip = 1; + RD_STATS rdc_skip; + av1_init_rd_stats(&rdc_skip); + rdc_skip.rate = rate_mode + rate_mv + av1_cost_bit(skip_prob, 1); + rdc_skip.dist = rd_stats.sse; + rdc_skip.rdcost = RDCOST(x->rdmult, x->rddiv, rdc_skip.rate, rdc_skip.dist); + if (rdc_skip.rdcost < best_rd) { + best_rd = rdc_skip.rdcost; + best_mbmi = *mbmi; + best_skip = x->skip; + best_rdcost = rdc_skip; + } } *mbmi = best_mbmi; *rd_cost = best_rdcost; @@ -9200,6 +8314,7 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME; #if CONFIG_INTRABC xd->mi[0]->mbmi.use_intrabc = 0; + xd->mi[0]->mbmi.mv[0].as_int = 0; #endif // CONFIG_INTRABC const int64_t intra_yrd = @@ -9212,11 +8327,8 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, if (intra_yrd < best_rd) { max_uv_tx_size = uv_txsize_lookup[bsize][xd->mi[0]->mbmi.tx_size] [pd[1].subsampling_x][pd[1].subsampling_y]; - + init_sbuv_mode(&xd->mi[0]->mbmi); #if CONFIG_CB4X4 -#if !CONFIG_CHROMA_2X2 - max_uv_tx_size = AOMMAX(max_uv_tx_size, TX_4X4); -#endif // !CONFIG_CHROMA_2X2 if (!x->skip_chroma_rd) rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv, &uv_skip, bsize, max_uv_tx_size); @@ -9235,6 +8347,9 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, rd_cost->dist = dist_y + dist_uv; } rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + rd_cost->dist_y = dist_y; +#endif } else { rd_cost->rate = INT_MAX; } @@ -9602,10 +8717,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, int64_t best_pred_diff[REFERENCE_MODES]; int64_t best_pred_rd[REFERENCE_MODES]; MB_MODE_INFO best_mbmode; -#if CONFIG_REF_MV int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0); int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); -#endif // CONFIG_REF_MV int best_mode_skippable = 0; int midx, 
best_mode_index = -1; unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME]; @@ -9635,13 +8748,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]]; int best_skip2 = 0; uint8_t ref_frame_skip_mask[2] = { 0 }; -#if CONFIG_EXT_INTER uint32_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 }; +#if CONFIG_EXT_INTER && CONFIG_INTERINTRA MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME; int64_t best_single_inter_rd = INT64_MAX; -#else - uint16_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 }; -#endif // CONFIG_EXT_INTER +#endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA int mode_skip_start = sf->mode_skip_start + 1; const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; @@ -9663,8 +8774,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, NULL, NULL, NULL, - NULL, - NULL, #else // CONFIG_EXT_INTER NULL, #endif // CONFIG_EXT_INTER @@ -9681,15 +8790,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, const MODE_INFO *left_mi = xd->left_mi; #endif // CONFIG_PALETTE #if CONFIG_MOTION_VAR -#if CONFIG_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); -#else - DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]); -#endif // CONFIG_HIGHBITDEPTH - DECLARE_ALIGNED(16, int32_t, weighted_src_buf[MAX_SB_SQUARE]); - DECLARE_ALIGNED(16, int32_t, mask2d_buf[MAX_SB_SQUARE]); int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; @@ -9698,22 +8798,24 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int len = sizeof(uint16_t); - args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(tmp_buf1); - args.above_pred_buf[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len); + args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf); + args.above_pred_buf[1] = + CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len); args.above_pred_buf[2] = - CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_SB_SQUARE * len); - args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(tmp_buf2); - args.left_pred_buf[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len); + CONVERT_TO_BYTEPTR(x->above_pred_buf + 2 * MAX_SB_SQUARE * len); + args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf); + args.left_pred_buf[1] = + CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len); args.left_pred_buf[2] = - CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_SB_SQUARE * len); + CONVERT_TO_BYTEPTR(x->left_pred_buf + 2 * MAX_SB_SQUARE * len); } else { #endif // CONFIG_HIGHBITDEPTH - args.above_pred_buf[0] = tmp_buf1; - args.above_pred_buf[1] = tmp_buf1 + MAX_SB_SQUARE; - args.above_pred_buf[2] = tmp_buf1 + 2 * MAX_SB_SQUARE; - args.left_pred_buf[0] = tmp_buf2; - args.left_pred_buf[1] = tmp_buf2 + MAX_SB_SQUARE; - args.left_pred_buf[2] = tmp_buf2 + 2 * MAX_SB_SQUARE; + args.above_pred_buf[0] = x->above_pred_buf; + args.above_pred_buf[1] = x->above_pred_buf + MAX_SB_SQUARE; + args.above_pred_buf[2] = x->above_pred_buf + 2 * MAX_SB_SQUARE; + args.left_pred_buf[0] = 
x->left_pred_buf; + args.left_pred_buf[1] = x->left_pred_buf + MAX_SB_SQUARE; + args.left_pred_buf[2] = x->left_pred_buf + 2 * MAX_SB_SQUARE; #if CONFIG_HIGHBITDEPTH } #endif // CONFIG_HIGHBITDEPTH @@ -9731,11 +8833,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, } #endif // CONFIG_PALETTE -#if CONFIG_EXT_INTRA - memset(directional_mode_skip_mask, 0, - sizeof(directional_mode_skip_mask[0]) * INTRA_MODES); -#endif // CONFIG_EXT_INTRA - estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, &comp_mode_p); @@ -9756,9 +8853,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; x->mbmi_ext->mode_context[ref_frame] = 0; -#if CONFIG_REF_MV && CONFIG_EXT_INTER +#if CONFIG_EXT_INTER x->mbmi_ext->compound_mode_context[ref_frame] = 0; -#endif // CONFIG_REF_MV && CONFIG_EXT_INTER +#endif // CONFIG_EXT_INTER if (cpi->ref_frame_flags & flag_list[ref_frame]) { assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, @@ -9788,7 +8885,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif // CONFIG_EXT_INTER } -#if CONFIG_REF_MV for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) { MODE_INFO *const mi = xd->mi[0]; int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; @@ -9813,10 +8909,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mbmi_ext->mode_context[ref_frame] &= ~(1 << ALL_ZERO_FLAG_OFFSET); } } -#endif // CONFIG_REF_MV #if CONFIG_MOTION_VAR av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col); + if (check_num_overlappable_neighbors(mbmi) && is_motion_variation_allowed_bsize(bsize)) { av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, @@ -9827,8 +8923,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, dst_height2, args.left_pred_stride); av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row, mi_col); - x->mask_buf = mask2d_buf; - x->wsrc_buf = weighted_src_buf; calc_target_weighted_pred(cm, x, xd, mi_row, mi_col, args.above_pred_buf[0], args.above_pred_stride[0], args.left_pred_buf[0], args.left_pred_stride[0]); @@ -9904,10 +8998,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #if CONFIG_EXT_INTER if (frame_mv[NEAREST_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int) mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARESTMV); - if (frame_mv[NEAREST_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int) - mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARMV); - if (frame_mv[NEAR_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int) - mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARESTMV); if (frame_mv[NEAR_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int) mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARMV); #endif // CONFIG_EXT_INTER @@ -9931,7 +9021,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (sf->adaptive_mode_search) { if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref && cpi->rc.frames_since_golden >= 3) - if (x->pred_mv_sad[GOLDEN_FRAME] > (x->pred_mv_sad[LAST_FRAME] << 1)) + if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME]) mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL; } @@ -9985,18 +9075,16 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, int64_t this_rd = INT64_MAX; int disable_skip = 0; int compmode_cost = 0; -#if 
CONFIG_EXT_INTER - int compmode_interintra_cost = 0; - int compmode_interinter_cost = 0; -#endif // CONFIG_EXT_INTER int rate2 = 0, rate_y = 0, rate_uv = 0; int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + int64_t distortion2_y = 0; + int64_t total_sse_y = INT64_MAX; +#endif int skippable = 0; int this_skip2 = 0; int64_t total_sse = INT64_MAX; -#if CONFIG_REF_MV uint8_t ref_frame_type; -#endif // CONFIG_REF_MV #if CONFIG_PVQ od_encode_rollback(&x->daala_enc, &pre_buf); #endif // CONFIG_PVQ @@ -10004,9 +9092,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, this_mode = av1_mode_order[mode_index].mode; ref_frame = av1_mode_order[mode_index].ref_frame[0]; second_ref_frame = av1_mode_order[mode_index].ref_frame[1]; -#if CONFIG_REF_MV mbmi->ref_mv_idx = 0; -#endif // CONFIG_REF_MV #if CONFIG_EXT_INTER if (ref_frame > INTRA_FRAME && second_ref_frame == INTRA_FRAME) { @@ -10079,7 +9165,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, // This is only used in motion vector unit test. if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue; -#if CONFIG_LOWDELAY_COMPOUND // Changes LL bitstream +#if CONFIG_ONE_SIDED_COMPOUND // Changes LL bitstream #if CONFIG_EXT_REFS if (cpi->oxcf.pass == 0) { // Complexity-compression trade-offs @@ -10144,9 +9230,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif // CONFIG_GLOBAL_MOTION const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, second_ref_frame }; if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, -#if CONFIG_REF_MV && CONFIG_EXT_INTER +#if CONFIG_EXT_INTER mbmi_ext->compound_mode_context, -#endif // CONFIG_REF_MV && CONFIG_EXT_INTER +#endif // CONFIG_EXT_INTER frame_mv, this_mode, ref_frames, bsize, -1, mi_row, mi_col)) continue; @@ -10181,9 +9267,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; } -#if CONFIG_EXT_INTER +#if CONFIG_EXT_INTER && CONFIG_INTERINTRA mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); -#endif // CONFIG_EXT_INTER +#endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA if (ref_frame == INTRA_FRAME) { RD_STATS rd_stats_y; @@ -10199,11 +9285,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, const uint8_t *src = x->plane[0].src.buf; #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - highbd_angle_estimation(src, src_stride, rows, cols, + highbd_angle_estimation(src, src_stride, rows, cols, bsize, directional_mode_skip_mask); else #endif // CONFIG_HIGHBITDEPTH - angle_estimation(src, src_stride, rows, cols, + angle_estimation(src, src_stride, rows, cols, bsize, directional_mode_skip_mask); angle_stats_ready = 1; } @@ -10336,18 +9422,19 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) distortion2_y = distortion_y; +#endif } else { -#if CONFIG_REF_MV int_mv backup_ref_mv[2]; #if !SUB8X8_COMP_REF - if (bsize < BLOCK_8X8 && mbmi->ref_frame[1] > INTRA_FRAME) continue; + if (bsize == BLOCK_4X4 && mbmi->ref_frame[1] > INTRA_FRAME) continue; #endif // !SUB8X8_COMP_REF backup_ref_mv[0] = mbmi_ext->ref_mvs[ref_frame][0]; if (comp_pred) backup_ref_mv[1] = mbmi_ext->ref_mvs[second_ref_frame][0]; -#endif 
// CONFIG_REF_MV -#if CONFIG_EXT_INTER +#if CONFIG_EXT_INTER && CONFIG_INTERINTRA if (second_ref_frame == INTRA_FRAME) { if (best_single_inter_ref != ref_frame) continue; mbmi->interintra_mode = intra_to_interintra_mode[best_intra_mode]; @@ -10365,8 +9452,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; #endif // CONFIG_FILTER_INTRA } -#endif // CONFIG_EXT_INTER -#if CONFIG_REF_MV +#endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA mbmi->ref_mv_idx = 0; ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); @@ -10411,7 +9497,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #if CONFIG_EXT_INTER } #endif // CONFIG_EXT_INTER -#endif // CONFIG_REF_MV { RD_STATS rd_stats, rd_stats_y, rd_stats_uv; av1_init_rd_stats(&rd_stats); @@ -10421,18 +9506,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, args.single_newmv = single_newmv; #if CONFIG_EXT_INTER args.single_newmv_rate = single_newmv_rate; - args.compmode_interintra_cost = &compmode_interintra_cost; - args.compmode_interinter_cost = &compmode_interinter_cost; args.modelled_rd = modelled_rd; #endif // CONFIG_EXT_INTER this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip, frame_mv, mi_row, mi_col, &args, best_rd); -// Prevent pointers from escaping local scope -#if CONFIG_EXT_INTER - args.compmode_interintra_cost = NULL; - args.compmode_interinter_cost = NULL; -#endif // CONFIG_EXT_INTER rate2 = rd_stats.rate; skippable = rd_stats.skip; @@ -10440,9 +9518,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, total_sse = rd_stats.sse; rate_y = rd_stats_y.rate; rate_uv = rd_stats_uv.rate; +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) distortion2_y = rd_stats_y.dist; +#endif } -#if CONFIG_REF_MV // TODO(jingning): This needs some refactoring to improve code quality // and reduce redundant steps. 
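/*
 * Illustrative sketch (not part of the patch) of the extra reference-MV
 * (DRL index) refinement performed by the loop below: candidate 0 has
 * already been costed, and each additional entry in the ref-MV stack is
 * re-evaluated, its index signaling cost included, and adopted only if it
 * improves the best RD cost so far (the backup_mbmi / backup_skip
 * bookkeeping in the diff).  evaluate() is a hypothetical stand-in for the
 * handle_inter_mode() call.
 */
#include <stdint.h>

static int64_t sketch_pick_drl_index(int num_extra,
                                     int64_t (*evaluate)(int ref_mv_idx),
                                     int *best_idx) {
  int64_t best_rd = evaluate(0);
  *best_idx = 0;
  for (int idx = 1; idx <= num_extra; ++idx) {
    const int64_t rd = evaluate(idx);  /* includes the DRL index cost */
    if (rd < best_rd) {
      best_rd = rd;   /* keep this candidate; otherwise restore the backup */
      *best_idx = idx;
    }
  }
  return best_rd;
}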
#if CONFIG_EXT_INTER @@ -10505,10 +9585,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, int ref; int_mv cur_mv; RD_STATS tmp_rd_stats, tmp_rd_stats_y, tmp_rd_stats_uv; -#if CONFIG_EXT_INTER - int tmp_compmode_interintra_cost = 0; - int tmp_compmode_interinter_cost = 0; -#endif // CONFIG_EXT_INTER av1_invalid_rd_stats(&tmp_rd_stats); x->skip = 0; @@ -10586,8 +9662,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, args.single_newmv = dummy_single_newmv; #if CONFIG_EXT_INTER args.single_newmv_rate = dummy_single_newmv_rate; - args.compmode_interintra_cost = &tmp_compmode_interintra_cost; - args.compmode_interinter_cost = &tmp_compmode_interinter_cost; args.modelled_rd = NULL; #endif // CONFIG_EXT_INTER tmp_alt_rd = handle_inter_mode( @@ -10597,8 +9671,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, args.single_newmv = NULL; #if CONFIG_EXT_INTER args.single_newmv_rate = NULL; - args.compmode_interintra_cost = NULL; - args.compmode_interinter_cost = NULL; #endif // CONFIG_EXT_INTER } @@ -10658,15 +9730,17 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, tmp_ref_rd = tmp_alt_rd; backup_mbmi = *mbmi; backup_skip = x->skip; +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) { + total_sse_y = tmp_rd_stats_y.sse; + distortion2_y = tmp_rd_stats_y.dist; + } +#endif #if CONFIG_VAR_TX for (i = 0; i < MAX_MB_PLANE; ++i) memcpy(x->blk_skip_drl[i], x->blk_skip[i], sizeof(uint8_t) * ctx->num_4x4_blk); #endif // CONFIG_VAR_TX -#if CONFIG_EXT_INTER - compmode_interintra_cost = tmp_compmode_interintra_cost; - compmode_interinter_cost = tmp_compmode_interinter_cost; -#endif // CONFIG_EXT_INTER } else { *mbmi = backup_mbmi; x->skip = backup_skip; @@ -10684,29 +9758,19 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, } mbmi_ext->ref_mvs[ref_frame][0] = backup_ref_mv[0]; if (comp_pred) mbmi_ext->ref_mvs[second_ref_frame][0] = backup_ref_mv[1]; -#endif // CONFIG_REF_MV if (this_rd == INT64_MAX) continue; #if SUB8X8_COMP_REF compmode_cost = av1_cost_bit(comp_mode_p, comp_pred); #else - if (mbmi->sb_type >= BLOCK_8X8) + if (mbmi->sb_type != BLOCK_4X4) compmode_cost = av1_cost_bit(comp_mode_p, comp_pred); #endif // SUB8X8_COMP_REF if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost; } -#if CONFIG_EXT_INTER - rate2 += compmode_interintra_cost; - if (cm->reference_mode != SINGLE_REFERENCE && comp_pred) -#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - if (mbmi->motion_mode == SIMPLE_TRANSLATION) -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - rate2 += compmode_interinter_cost; -#endif // CONFIG_EXT_INTER - // Estimate the reference frame signaling cost and add it // to the rolling cost variable. if (comp_pred) { @@ -10731,14 +9795,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, // Cost the skip mb case rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); } else if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) { -#if CONFIG_REF_MV if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + rate_skip0, distortion2) < RDCOST(x->rdmult, x->rddiv, rate_skip1, total_sse)) { -#else - if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < - RDCOST(x->rdmult, x->rddiv, 0, total_sse)) { -#endif // CONFIG_REF_MV // Add in the cost of the no skip flag. 
rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); } else { @@ -10750,6 +9809,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, this_skip2 = 1; rate_y = 0; rate_uv = 0; +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) distortion2_y = total_sse_y; +#endif } } else { // Add in the cost of the no skip flag. @@ -10775,13 +9837,13 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, best_intra_rd = this_rd; best_intra_mode = mbmi->mode; } -#if CONFIG_EXT_INTER +#if CONFIG_EXT_INTER && CONFIG_INTERINTRA } else if (second_ref_frame == NONE_FRAME) { if (this_rd < best_single_inter_rd) { best_single_inter_rd = this_rd; best_single_inter_ref = mbmi->ref_frame[0]; } -#endif // CONFIG_EXT_INTER +#endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA } if (!disable_skip && ref_frame == INTRA_FRAME) { @@ -10839,7 +9901,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd), this_skip2 || skippable); best_rate_uv = rate_uv; - +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) rd_cost->dist_y = distortion2_y; +#endif #if CONFIG_VAR_TX for (i = 0; i < MAX_MB_PLANE; ++i) memcpy(ctx->blk_skip[i], x->blk_skip[i], @@ -10900,7 +9964,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, } if (is_inter_mode(mbmi->mode)) { - av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize); + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); #if CONFIG_MOTION_VAR if (mbmi->motion_mode == OBMC_CAUSAL) { av1_build_obmc_inter_prediction( @@ -10967,6 +10031,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, rd_cost->rate += (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv); rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist; +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) rd_cost->dist_y = rd_stats_y.dist; +#endif rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); best_skip2 = skip_blk; @@ -11111,9 +10178,7 @@ PALETTE_EXIT: best_mbmode.ref_frame[1] }; int comp_pred_mode = refs[1] > INTRA_FRAME; int_mv zeromv[2]; -#if CONFIG_REF_MV const uint8_t rf_type = av1_ref_frame_type(best_mbmode.ref_frame); -#endif // CONFIG_REF_MV #if CONFIG_GLOBAL_MOTION zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]], cm->allow_high_precision_mv, bsize, @@ -11129,7 +10194,6 @@ PALETTE_EXIT: zeromv[0].as_int = 0; zeromv[1].as_int = 0; #endif // CONFIG_GLOBAL_MOTION -#if CONFIG_REF_MV if (!comp_pred_mode) { int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) ? 
AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2) @@ -11196,17 +10260,9 @@ PALETTE_EXIT: nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv; nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv; - // Try switching to the NEAR_NEAREST type modes first - if (nearestmv[0].as_int == best_mbmode.mv[0].as_int && + // Try switching to the NEAR_NEARMV mode + if (nearmv[0].as_int == best_mbmode.mv[0].as_int && nearmv[1].as_int == best_mbmode.mv[1].as_int) { - best_mbmode.mode = NEAREST_NEARMV; - best_mbmode.ref_mv_idx = i; - } else if (nearmv[0].as_int == best_mbmode.mv[0].as_int && - nearestmv[1].as_int == best_mbmode.mv[1].as_int) { - best_mbmode.mode = NEAR_NEARESTMV; - best_mbmode.ref_mv_idx = i; - } else if (nearmv[0].as_int == best_mbmode.mv[0].as_int && - nearmv[1].as_int == best_mbmode.mv[1].as_int) { best_mbmode.mode = NEAR_NEARMV; best_mbmode.ref_mv_idx = i; } @@ -11225,72 +10281,8 @@ PALETTE_EXIT: } #endif // CONFIG_EXT_INTER } -#else -#if CONFIG_EXT_INTER - if (!comp_pred_mode) { -#endif // CONFIG_EXT_INTER - if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int && - ((comp_pred_mode && - frame_mv[NEARESTMV][refs[1]].as_int == best_mbmode.mv[1].as_int) || - !comp_pred_mode)) - best_mbmode.mode = NEARESTMV; - else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int && - ((comp_pred_mode && - frame_mv[NEARMV][refs[1]].as_int == - best_mbmode.mv[1].as_int) || - !comp_pred_mode)) - best_mbmode.mode = NEARMV; - else if (best_mbmode.mv[0].as_int == zeromv[0].as_int && - ((comp_pred_mode && - best_mbmode.mv[1].as_int == zeromv[1].as_int) || - !comp_pred_mode)) - best_mbmode.mode = ZEROMV; -#if CONFIG_EXT_INTER - } else { -#if CONFIG_GLOBAL_MOTION - zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]], - cm->allow_high_precision_mv, - bsize, mi_col, mi_row, 0) - .as_int; - zeromv[1].as_int = comp_pred_mode - ? 
gm_get_motion_vector(&cm->global_motion[refs[1]], - cm->allow_high_precision_mv, - bsize, mi_col, mi_row, 0) - .as_int - : 0; -#else - zeromv[0].as_int = 0; - zeromv[1].as_int = 0; -#endif // CONFIG_GLOBAL_MOTION - if (frame_mv[NEAREST_NEARESTMV][refs[0]].as_int == - best_mbmode.mv[0].as_int && - frame_mv[NEAREST_NEARESTMV][refs[1]].as_int == - best_mbmode.mv[1].as_int) - best_mbmode.mode = NEAREST_NEARESTMV; - else if (frame_mv[NEAREST_NEARMV][refs[0]].as_int == - best_mbmode.mv[0].as_int && - frame_mv[NEAREST_NEARMV][refs[1]].as_int == - best_mbmode.mv[1].as_int) - best_mbmode.mode = NEAREST_NEARMV; - else if (frame_mv[NEAR_NEARESTMV][refs[0]].as_int == - best_mbmode.mv[0].as_int && - frame_mv[NEAR_NEARESTMV][refs[1]].as_int == - best_mbmode.mv[1].as_int) - best_mbmode.mode = NEAR_NEARESTMV; - else if (frame_mv[NEAR_NEARMV][refs[0]].as_int == - best_mbmode.mv[0].as_int && - frame_mv[NEAR_NEARMV][refs[1]].as_int == - best_mbmode.mv[1].as_int) - best_mbmode.mode = NEAR_NEARMV; - else if (best_mbmode.mv[0].as_int == zeromv[0].as_int && - best_mbmode.mv[1].as_int == zeromv[1].as_int) - best_mbmode.mode = ZERO_ZEROMV; - } -#endif // CONFIG_EXT_INTER -#endif // CONFIG_REF_MV } -#if CONFIG_REF_MV // Make sure that the ref_mv_idx is only nonzero when we're // using a mode which can support ref_mv_idx if (best_mbmode.ref_mv_idx != 0 && @@ -11339,7 +10331,6 @@ PALETTE_EXIT: } } } -#endif // CONFIG_REF_MV if (best_mode_index < 0 || best_rd >= best_rd_so_far) { rd_cost->rate = INT_MAX; @@ -11412,14 +10403,12 @@ PALETTE_EXIT: } #endif // CONFIG_GLOBAL_MOTION -#if CONFIG_REF_MV for (i = 0; i < 1 + has_second_ref(mbmi); ++i) { if (mbmi->mode != NEWMV) mbmi->pred_mv[i].as_int = mbmi->mv[i].as_int; else mbmi->pred_mv[i].as_int = mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_int; } -#endif // CONFIG_REF_MV for (i = 0; i < REFERENCE_MODES; ++i) { if (best_pred_rd[i] == INT64_MAX) @@ -11502,10 +10491,8 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, mbmi->tx_size = max_txsize_lookup[bsize]; x->skip = 1; -#if CONFIG_REF_MV mbmi->ref_mv_idx = 0; mbmi->pred_mv[0].as_int = 0; -#endif // CONFIG_REF_MV mbmi->motion_mode = SIMPLE_TRANSLATION; #if CONFIG_MOTION_VAR @@ -11566,7 +10553,9 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, rd_cost->rate = rate2; rd_cost->dist = distortion2; rd_cost->rdcost = this_rd; - +#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) rd_cost->dist_y = distortion2; +#endif if (this_rd >= best_rd_so_far) { rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; @@ -11589,791 +10578,6 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, store_coding_context(x, ctx, THR_ZEROMV, best_pred_diff, 0); } -void av1_rd_pick_inter_mode_sub8x8(const struct AV1_COMP *cpi, - TileDataEnc *tile_data, struct macroblock *x, - int mi_row, int mi_col, - struct RD_STATS *rd_cost, -#if CONFIG_SUPERTX - int *returnrate_nocoef, -#endif // CONFIG_SUPERTX - BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, - int64_t best_rd_so_far) { - const AV1_COMMON *const cm = &cpi->common; - const RD_OPT *const rd_opt = &cpi->rd; - const SPEED_FEATURES *const sf = &cpi->sf; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const struct segmentation *const seg = &cm->seg; - MV_REFERENCE_FRAME ref_frame, second_ref_frame; - unsigned char segment_id = mbmi->segment_id; - int comp_pred, i; - int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; - struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]; - static const int 
flag_list[TOTAL_REFS_PER_FRAME] = { - 0, - AOM_LAST_FLAG, -#if CONFIG_EXT_REFS - AOM_LAST2_FLAG, - AOM_LAST3_FLAG, -#endif // CONFIG_EXT_REFS - AOM_GOLD_FLAG, -#if CONFIG_EXT_REFS - AOM_BWD_FLAG, -#endif // CONFIG_EXT_REFS - AOM_ALT_FLAG - }; - int64_t best_rd = best_rd_so_far; - int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise - int64_t best_pred_diff[REFERENCE_MODES]; - int64_t best_pred_rd[REFERENCE_MODES]; - MB_MODE_INFO best_mbmode; - int ref_index, best_ref_index = 0; - unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME]; - unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME]; - aom_prob comp_mode_p; -#if CONFIG_DUAL_FILTER - InterpFilter tmp_best_filter[4] = { 0 }; -#else - InterpFilter tmp_best_filter = SWITCHABLE; -#endif // CONFIG_DUAL_FILTER - int rate_uv_intra, rate_uv_tokenonly = INT_MAX; - int64_t dist_uv = INT64_MAX; - int skip_uv; - PREDICTION_MODE mode_uv = DC_PRED; - const int intra_cost_penalty = av1_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); - int_mv seg_mvs[4][TOTAL_REFS_PER_FRAME]; - b_mode_info best_bmodes[4]; - int best_skip2 = 0; - int ref_frame_skip_mask[2] = { 0 }; - int internal_active_edge = - av1_active_edge_sb(cpi, mi_row, mi_col) && av1_internal_image_edge(cpi); -#if CONFIG_PVQ - od_rollback_buffer pre_buf; - - od_encode_checkpoint(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ - -#if CONFIG_SUPERTX - best_rd_so_far = INT64_MAX; - best_rd = best_rd_so_far; - best_yrd = best_rd_so_far; -#endif // CONFIG_SUPERTX - av1_zero(best_mbmode); - -#if CONFIG_FILTER_INTRA - mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; - mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; -#endif // CONFIG_FILTER_INTRA - mbmi->motion_mode = SIMPLE_TRANSLATION; -#if CONFIG_EXT_INTER - mbmi->interinter_compound_type = COMPOUND_AVERAGE; - mbmi->use_wedge_interintra = 0; -#endif // CONFIG_EXT_INTER -#if CONFIG_WARPED_MOTION - mbmi->num_proj_ref[0] = 0; - mbmi->num_proj_ref[1] = 0; -#endif // CONFIG_WARPED_MOTION - - for (i = 0; i < 4; i++) { - int j; - for (j = 0; j < TOTAL_REFS_PER_FRAME; j++) - seg_mvs[i][j].as_int = INVALID_MV; - } - - estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, - &comp_mode_p); - - for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX; - rate_uv_intra = INT_MAX; - - rd_cost->rate = INT_MAX; -#if CONFIG_SUPERTX - *returnrate_nocoef = INT_MAX; -#endif // CONFIG_SUPERTX - - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { - x->mbmi_ext->mode_context[ref_frame] = 0; -#if CONFIG_REF_MV && CONFIG_EXT_INTER - x->mbmi_ext->compound_mode_context[ref_frame] = 0; -#endif // CONFIG_REF_MV && CONFIG_EXT_INTER - if (cpi->ref_frame_flags & flag_list[ref_frame]) { - setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, - frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); - } else { - ref_frame_skip_mask[0] |= (1 << ref_frame); - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - } - frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; -#if CONFIG_EXT_INTER -#endif // CONFIG_EXT_INTER - frame_mv[ZEROMV][ref_frame].as_int = 0; - } - -#if CONFIG_PALETTE - mbmi->palette_mode_info.palette_size[0] = 0; - mbmi->palette_mode_info.palette_size[1] = 0; -#endif // CONFIG_PALETTE - - for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) { - int mode_excluded = 0; - int64_t this_rd = INT64_MAX; - int disable_skip = 0; - int compmode_cost = 0; - int rate2 = 0, rate_y = 0, rate_uv = 0; - int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; - int 
skippable = 0; - int this_skip2 = 0; - int64_t total_sse = INT_MAX; - -#if CONFIG_PVQ - od_encode_rollback(&x->daala_enc, &pre_buf); -#endif // CONFIG_PVQ - - ref_frame = av1_ref_order[ref_index].ref_frame[0]; - second_ref_frame = av1_ref_order[ref_index].ref_frame[1]; - -#if CONFIG_REF_MV - mbmi->ref_mv_idx = 0; -#endif // CONFIG_REF_MV - - // Look at the reference frame of the best mode so far and set the - // skip mask to look at a subset of the remaining modes. - if (ref_index > 2 && sf->mode_skip_start < MAX_MODES) { - if (ref_index == 3) { - switch (best_mbmode.ref_frame[0]) { - case INTRA_FRAME: break; - case LAST_FRAME: - ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | -#if CONFIG_EXT_REFS - (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | - (1 << BWDREF_FRAME) | -#endif // CONFIG_EXT_REFS - (1 << ALTREF_FRAME); - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; -#if CONFIG_EXT_REFS - case LAST2_FRAME: - ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << LAST3_FRAME) | - (1 << GOLDEN_FRAME) | - (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME); - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; - case LAST3_FRAME: - ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << LAST2_FRAME) | - (1 << GOLDEN_FRAME) | - (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME); - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; -#endif // CONFIG_EXT_REFS - case GOLDEN_FRAME: - ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | -#if CONFIG_EXT_REFS - (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | - (1 << BWDREF_FRAME) | -#endif // CONFIG_EXT_REFS - (1 << ALTREF_FRAME); - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; -#if CONFIG_EXT_REFS - case BWDREF_FRAME: - ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << LAST2_FRAME) | - (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | - (1 << ALTREF_FRAME); - ref_frame_skip_mask[1] |= (1 << ALTREF_FRAME) | 0x01; - break; -#endif // CONFIG_EXT_REFS - case ALTREF_FRAME: - ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | -#if CONFIG_EXT_REFS - (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | - (1 << BWDREF_FRAME) | -#endif // CONFIG_EXT_REFS - (1 << GOLDEN_FRAME); -#if CONFIG_EXT_REFS - ref_frame_skip_mask[1] |= (1 << BWDREF_FRAME) | 0x01; -#endif // CONFIG_EXT_REFS - break; - case NONE_FRAME: - case TOTAL_REFS_PER_FRAME: - assert(0 && "Invalid Reference frame"); - break; - } - } - } - - if ((ref_frame_skip_mask[0] & (1 << ref_frame)) && - (ref_frame_skip_mask[1] & (1 << AOMMAX(0, second_ref_frame)))) - continue; - - // Test best rd so far against threshold for trying this mode. - if (!internal_active_edge && - rd_less_than_thresh(best_rd, - rd_opt->threshes[segment_id][bsize][ref_index], - tile_data->thresh_freq_fact[bsize][ref_index])) - continue; - - // This is only used in motion vector unit test. - if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue; - -#if CONFIG_LOWDELAY_COMPOUND // Changes LL bitstream -#if CONFIG_EXT_REFS - if (cpi->oxcf.pass == 0) { - // Complexity-compression trade-offs - // if (ref_frame == ALTREF_FRAME) continue; - // if (ref_frame == BWDREF_FRAME) continue; - if (second_ref_frame == ALTREF_FRAME) continue; - // if (second_ref_frame == BWDREF_FRAME) continue; - } -#endif -#endif - comp_pred = second_ref_frame > INTRA_FRAME; - if (comp_pred) { - if (!cpi->allow_comp_inter_inter) continue; - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; - // Do not allow compound prediction if the segment level reference frame - // feature is in use as in this case there can only be one reference. 
- if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; - - if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && - best_mbmode.ref_frame[0] == INTRA_FRAME) - continue; - } - - // TODO(jingning, jkoleszar): scaling reference frame not supported for - // sub8x8 blocks. - if (ref_frame > INTRA_FRAME && - av1_is_scaled(&cm->frame_refs[ref_frame - 1].sf)) - continue; - - if (second_ref_frame > INTRA_FRAME && - av1_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf)) - continue; - - if (comp_pred) - mode_excluded = cm->reference_mode == SINGLE_REFERENCE; - else if (ref_frame != INTRA_FRAME) - mode_excluded = cm->reference_mode == COMPOUND_REFERENCE; - - // If the segment reference frame feature is enabled.... - // then do nothing if the current ref frame is not allowed.. - if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && - get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { - continue; - // Disable this drop out case if the ref frame - // segment level feature is enabled for this segment. This is to - // prevent the possibility that we end up unable to pick any mode. - } else if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { - // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, - // unless ARNR filtering is enabled in which case we want - // an unfiltered alternative. We allow near/nearest as well - // because they may result in zero-zero MVs but be cheaper. - if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) - continue; - } - - mbmi->tx_size = TX_4X4; - mbmi->uv_mode = DC_PRED; - mbmi->ref_frame[0] = ref_frame; - mbmi->ref_frame[1] = second_ref_frame; -// Evaluate all sub-pel filters irrespective of whether we can use -// them for this frame. -#if CONFIG_DUAL_FILTER - for (i = 0; i < 4; ++i) - mbmi->interp_filter[i] = cm->interp_filter == SWITCHABLE - ? EIGHTTAP_REGULAR - : cm->interp_filter; -#else - mbmi->interp_filter = - cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR : cm->interp_filter; -#endif // CONFIG_DUAL_FILTER - x->skip = 0; - set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); - - // Select prediction reference frames. - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; - if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; - } - -#if CONFIG_VAR_TX - mbmi->inter_tx_size[0][0] = mbmi->tx_size; - mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size); -#endif // CONFIG_VAR_TX - - if (ref_frame == INTRA_FRAME) { - int rate; - if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y, &distortion_y, - NULL, best_rd) >= best_rd) - continue; - rate2 += rate; - rate2 += intra_cost_penalty; - distortion2 += distortion_y; - - if (rate_uv_intra == INT_MAX) { - choose_intra_uv_mode(cpi, x, ctx, bsize, TX_4X4, &rate_uv_intra, - &rate_uv_tokenonly, &dist_uv, &skip_uv, &mode_uv); - } - rate2 += rate_uv_intra; - rate_uv = rate_uv_tokenonly; - distortion2 += dist_uv; - distortion_uv = dist_uv; - mbmi->uv_mode = mode_uv; - } else { - int rate; - int64_t distortion; - int64_t this_rd_thresh; - int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX; - int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX; - int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse; - int tmp_best_skippable = 0; - int switchable_filter_index; - int_mv *second_ref = - comp_pred ? &x->mbmi_ext->ref_mvs[second_ref_frame][0] : NULL; - b_mode_info tmp_best_bmodes[16]; // Should this be 4 ? 
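/*
 * Illustrative sketch (not part of the patch) of the switchable
 * interpolation-filter selection pattern used by the (removed) sub-8x8
 * search that follows: every filter combination is rate-distortion costed,
 * the rate of signaling the filter choice is added when the frame-level
 * filter is SWITCHABLE, and the cheapest total wins.  rd_for_filter() and
 * signaling_rd() are hypothetical stand-ins for
 * rd_pick_inter_best_sub8x8_mode() and av1_get_switchable_rate().
 */
#include <stdint.h>

static int sketch_pick_filter(int num_filters, int filter_is_switchable,
                              int64_t (*rd_for_filter)(int filter),
                              int64_t (*signaling_rd)(int filter)) {
  int best_filter = 0;
  int64_t best_rd = INT64_MAX;
  for (int f = 0; f < num_filters; ++f) {
    int64_t rd = rd_for_filter(f);
    if (rd == INT64_MAX) continue;   /* this filter could not be evaluated */
    if (filter_is_switchable) rd += signaling_rd(f); /* filter coding cost */
    if (rd < best_rd) {
      best_rd = rd;
      best_filter = f;
    }
  }
  return best_filter;
}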
- MB_MODE_INFO tmp_best_mbmode; -#if CONFIG_DUAL_FILTER - BEST_SEG_INFO bsi[DUAL_FILTER_SET_SIZE]; -#else - BEST_SEG_INFO bsi[SWITCHABLE_FILTERS]; -#endif // CONFIG_DUAL_FILTER - int pred_exists = 0; - int uv_skippable; -#if CONFIG_EXT_INTER - int_mv compound_seg_newmvs[4][2]; - for (i = 0; i < 4; i++) { - compound_seg_newmvs[i][0].as_int = INVALID_MV; - compound_seg_newmvs[i][1].as_int = INVALID_MV; - } -#endif // CONFIG_EXT_INTER - - this_rd_thresh = (ref_frame == LAST_FRAME) - ? rd_opt->threshes[segment_id][bsize][THR_LAST] - : rd_opt->threshes[segment_id][bsize][THR_ALTR]; -#if CONFIG_EXT_REFS - this_rd_thresh = (ref_frame == LAST2_FRAME) - ? rd_opt->threshes[segment_id][bsize][THR_LAST2] - : this_rd_thresh; - this_rd_thresh = (ref_frame == LAST3_FRAME) - ? rd_opt->threshes[segment_id][bsize][THR_LAST3] - : this_rd_thresh; - this_rd_thresh = (ref_frame == BWDREF_FRAME) - ? rd_opt->threshes[segment_id][bsize][THR_BWDR] - : this_rd_thresh; -#endif // CONFIG_EXT_REFS - this_rd_thresh = (ref_frame == GOLDEN_FRAME) - ? rd_opt->threshes[segment_id][bsize][THR_GOLD] - : this_rd_thresh; - - // TODO(any): Add search of the tx_type to improve rd performance at the - // expense of speed. - mbmi->tx_type = DCT_DCT; - - if (cm->interp_filter != BILINEAR) { -#if CONFIG_DUAL_FILTER - tmp_best_filter[0] = EIGHTTAP_REGULAR; - tmp_best_filter[1] = EIGHTTAP_REGULAR; - tmp_best_filter[2] = EIGHTTAP_REGULAR; - tmp_best_filter[3] = EIGHTTAP_REGULAR; -#else - tmp_best_filter = EIGHTTAP_REGULAR; -#endif // CONFIG_DUAL_FILTER - if (x->source_variance < sf->disable_filter_search_var_thresh) { -#if CONFIG_DUAL_FILTER - tmp_best_filter[0] = EIGHTTAP_REGULAR; -#else - tmp_best_filter = EIGHTTAP_REGULAR; -#endif // CONFIG_DUAL_FILTER - } else if (sf->adaptive_pred_interp_filter == 1 && - ctx->pred_interp_filter < SWITCHABLE) { -#if CONFIG_DUAL_FILTER - tmp_best_filter[0] = ctx->pred_interp_filter; -#else - tmp_best_filter = ctx->pred_interp_filter; -#endif // CONFIG_DUAL_FILTER - } else if (sf->adaptive_pred_interp_filter == 2) { -#if CONFIG_DUAL_FILTER - tmp_best_filter[0] = ctx->pred_interp_filter < SWITCHABLE - ? ctx->pred_interp_filter - : 0; -#else - tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE - ? 
ctx->pred_interp_filter - : 0; -#endif // CONFIG_DUAL_FILTER - } else { -#if CONFIG_DUAL_FILTER - const int filter_set_size = DUAL_FILTER_SET_SIZE; -#else - const int filter_set_size = SWITCHABLE_FILTERS; -#endif // CONFIG_DUAL_FILTER - for (switchable_filter_index = 0; - switchable_filter_index < filter_set_size; - ++switchable_filter_index) { - int newbest, rs; - int64_t rs_rd; - MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext; -#if CONFIG_DUAL_FILTER - mbmi->interp_filter[0] = filter_sets[switchable_filter_index][0]; - mbmi->interp_filter[1] = filter_sets[switchable_filter_index][1]; - mbmi->interp_filter[2] = filter_sets[switchable_filter_index][0]; - mbmi->interp_filter[3] = filter_sets[switchable_filter_index][1]; -#else - mbmi->interp_filter = switchable_filter_index; -#endif // CONFIG_DUAL_FILTER - tmp_rd = rd_pick_inter_best_sub8x8_mode( - cpi, x, &mbmi_ext->ref_mvs[ref_frame][0], second_ref, best_yrd, - &rate, &rate_y, &distortion, &skippable, &total_sse, - (int)this_rd_thresh, seg_mvs, -#if CONFIG_EXT_INTER - compound_seg_newmvs, -#endif // CONFIG_EXT_INTER - bsi, switchable_filter_index, mi_row, mi_col); - if (tmp_rd == INT64_MAX) continue; - rs = av1_get_switchable_rate(cpi, xd); - rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); - if (cm->interp_filter == SWITCHABLE) tmp_rd += rs_rd; - - newbest = (tmp_rd < tmp_best_rd); - if (newbest) { -#if CONFIG_DUAL_FILTER - tmp_best_filter[0] = mbmi->interp_filter[0]; - tmp_best_filter[1] = mbmi->interp_filter[1]; - tmp_best_filter[2] = mbmi->interp_filter[2]; - tmp_best_filter[3] = mbmi->interp_filter[3]; -#else - tmp_best_filter = mbmi->interp_filter; -#endif // CONFIG_DUAL_FILTER - tmp_best_rd = tmp_rd; - } - if ((newbest && cm->interp_filter == SWITCHABLE) || - ( -#if CONFIG_DUAL_FILTER - mbmi->interp_filter[0] == cm->interp_filter -#else - mbmi->interp_filter == cm->interp_filter -#endif // CONFIG_DUAL_FILTER - && cm->interp_filter != SWITCHABLE)) { - tmp_best_rdu = tmp_rd; - tmp_best_rate = rate; - tmp_best_ratey = rate_y; - tmp_best_distortion = distortion; - tmp_best_sse = total_sse; - tmp_best_skippable = skippable; - tmp_best_mbmode = *mbmi; - for (i = 0; i < 4; i++) { - tmp_best_bmodes[i] = xd->mi[0]->bmi[i]; - } - pred_exists = 1; - } - } // switchable_filter_index loop - } - } - - if (tmp_best_rdu == INT64_MAX && pred_exists) continue; - -#if CONFIG_DUAL_FILTER - mbmi->interp_filter[0] = - (cm->interp_filter == SWITCHABLE ? tmp_best_filter[0] - : cm->interp_filter); - mbmi->interp_filter[1] = - (cm->interp_filter == SWITCHABLE ? tmp_best_filter[1] - : cm->interp_filter); - mbmi->interp_filter[2] = - (cm->interp_filter == SWITCHABLE ? tmp_best_filter[2] - : cm->interp_filter); - mbmi->interp_filter[3] = - (cm->interp_filter == SWITCHABLE ? tmp_best_filter[3] - : cm->interp_filter); -#else - mbmi->interp_filter = - (cm->interp_filter == SWITCHABLE ? 
tmp_best_filter - : cm->interp_filter); -#endif // CONFIG_DUAL_FILTER - - if (!pred_exists) { - // Handles the special case when a filter that is not in the - // switchable list (bilinear) is indicated at the frame level - tmp_rd = rd_pick_inter_best_sub8x8_mode( - cpi, x, &x->mbmi_ext->ref_mvs[ref_frame][0], second_ref, best_yrd, - &rate, &rate_y, &distortion, &skippable, &total_sse, - (int)this_rd_thresh, seg_mvs, -#if CONFIG_EXT_INTER - compound_seg_newmvs, -#endif // CONFIG_EXT_INTER - bsi, 0, mi_row, mi_col); - if (tmp_rd == INT64_MAX) continue; - } else { - total_sse = tmp_best_sse; - rate = tmp_best_rate; - rate_y = tmp_best_ratey; - distortion = tmp_best_distortion; - skippable = tmp_best_skippable; - *mbmi = tmp_best_mbmode; - for (i = 0; i < 4; i++) xd->mi[0]->bmi[i] = tmp_best_bmodes[i]; - } - // Add in the cost of the transform type - if (!xd->lossless[mbmi->segment_id]) { - int rate_tx_type = 0; -#if CONFIG_EXT_TX - if (get_ext_tx_types(mbmi->tx_size, bsize, 1, cm->reduced_tx_set_used) > - 1) { - const int eset = - get_ext_tx_set(mbmi->tx_size, bsize, 1, cm->reduced_tx_set_used); - rate_tx_type = - cpi->inter_tx_type_costs[eset][mbmi->tx_size][mbmi->tx_type]; - } -#else - if (mbmi->tx_size < TX_32X32) { - rate_tx_type = cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type]; - } -#endif // CONFIG_EXT_TX - rate += rate_tx_type; - rate_y += rate_tx_type; - } - - rate2 += rate; - distortion2 += distortion; - - if (cm->interp_filter == SWITCHABLE) - rate2 += av1_get_switchable_rate(cpi, xd); - - if (!mode_excluded) - mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE - : cm->reference_mode == COMPOUND_REFERENCE; - - compmode_cost = av1_cost_bit(comp_mode_p, comp_pred); - - tmp_best_rdu = - best_rd - AOMMIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2), - RDCOST(x->rdmult, x->rddiv, 0, total_sse)); - - if (tmp_best_rdu > 0) { - // If even the 'Y' rd value of split is higher than best so far - // then dont bother looking at UV - int is_cost_valid_uv; - RD_STATS rd_stats_uv; - av1_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, NULL, - BLOCK_8X8); -#if CONFIG_VAR_TX - is_cost_valid_uv = - inter_block_uvrd(cpi, x, &rd_stats_uv, BLOCK_8X8, tmp_best_rdu); -#else - is_cost_valid_uv = - super_block_uvrd(cpi, x, &rd_stats_uv, BLOCK_8X8, tmp_best_rdu); -#endif // CONFIG_VAR_TX - rate_uv = rd_stats_uv.rate; - distortion_uv = rd_stats_uv.dist; - uv_skippable = rd_stats_uv.skip; - uv_sse = rd_stats_uv.sse; - - if (!is_cost_valid_uv) continue; - rate2 += rate_uv; - distortion2 += distortion_uv; - skippable = skippable && uv_skippable; - total_sse += uv_sse; - } else { - continue; - } - } - - if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost; - - // Estimate the reference frame signaling cost and add it - // to the rolling cost variable. - if (second_ref_frame > INTRA_FRAME) { - rate2 += ref_costs_comp[ref_frame]; -#if CONFIG_EXT_REFS - rate2 += ref_costs_comp[second_ref_frame]; -#endif // CONFIG_EXT_REFS - } else { - rate2 += ref_costs_single[ref_frame]; - } - - if (!disable_skip) { - // Skip is never coded at the segment level for sub8x8 blocks and instead - // always coded in the bitstream at the mode info level. - - if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) { - if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < - RDCOST(x->rdmult, x->rddiv, 0, total_sse)) { - // Add in the cost of the no skip flag. 
- rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - } else { - // FIXME(rbultje) make this work for splitmv also - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); - distortion2 = total_sse; - assert(total_sse >= 0); - rate2 -= (rate_y + rate_uv); - rate_y = 0; - rate_uv = 0; - this_skip2 = 1; - } - } else { - // Add in the cost of the no skip flag. - rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); - } - - // Calculate the final RD estimate for this mode. - this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); - } - - if (!disable_skip && ref_frame == INTRA_FRAME) { - for (i = 0; i < REFERENCE_MODES; ++i) - best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd); - } - - // Did this mode help.. i.e. is it the new best mode - if (this_rd < best_rd || x->skip) { - if (!mode_excluded) { - // Note index of best mode so far - best_ref_index = ref_index; - - if (ref_frame == INTRA_FRAME) { - /* required for left and above block mv */ - mbmi->mv[0].as_int = 0; - } - - rd_cost->rate = rate2; -#if CONFIG_SUPERTX - *returnrate_nocoef = rate2 - rate_y - rate_uv; - if (!disable_skip) - *returnrate_nocoef -= - av1_cost_bit(av1_get_skip_prob(cm, xd), this_skip2); - *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd), - mbmi->ref_frame[0] != INTRA_FRAME); - assert(*returnrate_nocoef > 0); -#endif // CONFIG_SUPERTX - rd_cost->dist = distortion2; - rd_cost->rdcost = this_rd; - best_rd = this_rd; - best_yrd = - best_rd - RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv); - best_mbmode = *mbmi; - best_skip2 = this_skip2; - -#if CONFIG_VAR_TX - for (i = 0; i < MAX_MB_PLANE; ++i) - memset(ctx->blk_skip[i], 0, sizeof(uint8_t) * ctx->num_4x4_blk); -#endif // CONFIG_VAR_TX - - for (i = 0; i < 4; i++) best_bmodes[i] = xd->mi[0]->bmi[i]; - } - } - - /* keep record of best compound/single-only prediction */ - if (!disable_skip && ref_frame != INTRA_FRAME) { - int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; - - if (cm->reference_mode == REFERENCE_MODE_SELECT) { - single_rate = rate2 - compmode_cost; - hybrid_rate = rate2; - } else { - single_rate = rate2; - hybrid_rate = rate2 + compmode_cost; - } - - single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2); - hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2); - - if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE]) - best_pred_rd[SINGLE_REFERENCE] = single_rd; - else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE]) - best_pred_rd[COMPOUND_REFERENCE] = single_rd; - - if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT]) - best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; - } - - if (x->skip && !comp_pred) break; - } - - if (best_rd >= best_rd_so_far) { - rd_cost->rate = INT_MAX; - rd_cost->rdcost = INT64_MAX; -#if CONFIG_SUPERTX - *returnrate_nocoef = INT_MAX; -#endif // CONFIG_SUPERTX - return; - } - - if (best_rd == INT64_MAX) { - rd_cost->rate = INT_MAX; - rd_cost->dist = INT64_MAX; - rd_cost->rdcost = INT64_MAX; -#if CONFIG_SUPERTX - *returnrate_nocoef = INT_MAX; -#endif // CONFIG_SUPERTX - return; - } - -#if CONFIG_DUAL_FILTER - assert((cm->interp_filter == SWITCHABLE) || - (cm->interp_filter == best_mbmode.interp_filter[0]) || - !is_inter_block(&best_mbmode)); -#else - assert((cm->interp_filter == SWITCHABLE) || - (cm->interp_filter == best_mbmode.interp_filter) || - !is_inter_block(&best_mbmode)); -#endif // CONFIG_DUAL_FILTER - - av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact, - sf->adaptive_rd_thresh, bsize, best_ref_index); - - // macroblock modes - *mbmi = 
best_mbmode; -#if CONFIG_VAR_TX - mbmi->inter_tx_size[0][0] = mbmi->tx_size; -#endif // CONFIG_VAR_TX - - x->skip |= best_skip2; - if (!is_inter_block(&best_mbmode)) { - for (i = 0; i < 4; i++) xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode; - } else { - for (i = 0; i < 4; ++i) - memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info)); - -#if CONFIG_REF_MV - mbmi->pred_mv[0].as_int = xd->mi[0]->bmi[3].pred_mv[0].as_int; - mbmi->pred_mv[1].as_int = xd->mi[0]->bmi[3].pred_mv[1].as_int; -#endif // CONFIG_REF_MV - mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int; - mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int; - } - -// Note: this section is needed since the mode may have been forced to ZEROMV -#if CONFIG_GLOBAL_MOTION - if (mbmi->mode == ZEROMV -#if CONFIG_EXT_INTER - || mbmi->mode == ZERO_ZEROMV -#endif // CONFIG_EXT_INTER - ) { - if (is_nontrans_global_motion(xd)) { -#if CONFIG_DUAL_FILTER - mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE - ? EIGHTTAP_REGULAR - : cm->interp_filter; - mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE - ? EIGHTTAP_REGULAR - : cm->interp_filter; -#else - mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR - : cm->interp_filter; -#endif // CONFIG_DUAL_FILTER - } - } -#endif // CONFIG_GLOBAL_MOTION - - for (i = 0; i < REFERENCE_MODES; ++i) { - if (best_pred_rd[i] == INT64_MAX) - best_pred_diff[i] = INT_MIN; - else - best_pred_diff[i] = best_rd - best_pred_rd[i]; - } - - store_coding_context(x, ctx, best_ref_index, best_pred_diff, 0); -} - #if CONFIG_MOTION_VAR // This function has a structure similar to av1_build_obmc_inter_prediction // @@ -12454,9 +10658,14 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, i = 0; do { // for each mi in the above row const int mi_col_offset = i; - const MB_MODE_INFO *const above_mbmi = + const MB_MODE_INFO *above_mbmi = &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi; - const BLOCK_SIZE a_bsize = above_mbmi->sb_type; +#if CONFIG_CHROMA_SUB8X8 + if (above_mbmi->sb_type < BLOCK_8X8) + above_mbmi = + &xd->mi[mi_col_offset + 1 + mi_row_offset * xd->mi_stride]->mbmi; +#endif + const BLOCK_SIZE a_bsize = AOMMAX(above_mbmi->sb_type, BLOCK_8X8); const int mi_step = AOMMIN(xd->n8_w, mi_size_wide[a_bsize]); const int neighbor_bw = mi_step * MI_SIZE; @@ -12528,9 +10737,15 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, i = 0; do { // for each mi in the left column const int mi_row_offset = i; - const MB_MODE_INFO *const left_mbmi = + MB_MODE_INFO *left_mbmi = &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi; - const BLOCK_SIZE l_bsize = left_mbmi->sb_type; + +#if CONFIG_CHROMA_SUB8X8 + if (left_mbmi->sb_type < BLOCK_8X8) + left_mbmi = + &xd->mi[mi_col_offset + (mi_row_offset + 1) * xd->mi_stride]->mbmi; +#endif + const BLOCK_SIZE l_bsize = AOMMAX(left_mbmi->sb_type, BLOCK_8X8); const int mi_step = AOMMIN(xd->n8_h, mi_size_high[l_bsize]); const int neighbor_bh = mi_step * MI_SIZE; @@ -12636,7 +10851,7 @@ void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, av1_setup_dst_planes(x->e_mbd.plane, bsize, get_frame_new_buffer(&cpi->common), mi_row, mi_col); - av1_build_inter_predictors_sb(xd, mi_row, mi_col, NULL, bsize); + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); av1_subtract_plane(x, bsize, 0); super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h 
index a7053b289..e5d778fe5 100644 --- a/third_party/aom/av1/encoder/rdopt.h +++ b/third_party/aom/av1/encoder/rdopt.h @@ -62,6 +62,12 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse, OUTPUT_STATUS output_status); +#if CONFIG_DAALA_DIST +int64_t av1_daala_dist(const uint8_t *src, int src_stride, const uint8_t *dst, + int dst_stride, int bsw, int bsh, int qm, + int use_activity_masking, int qindex); +#endif + #if !CONFIG_PVQ || CONFIG_VAR_TX int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order, @@ -101,16 +107,6 @@ int av1_active_h_edge(const struct AV1_COMP *cpi, int mi_row, int mi_step); int av1_active_v_edge(const struct AV1_COMP *cpi, int mi_col, int mi_step); int av1_active_edge_sb(const struct AV1_COMP *cpi, int mi_row, int mi_col); -void av1_rd_pick_inter_mode_sub8x8(const struct AV1_COMP *cpi, - struct TileDataEnc *tile_data, - struct macroblock *x, int mi_row, int mi_col, - struct RD_STATS *rd_cost, -#if CONFIG_SUPERTX - int *returnrate_nocoef, -#endif // CONFIG_SUPERTX - BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, - int64_t best_rd_so_far); - #if CONFIG_MOTION_VAR && CONFIG_NCOBMC void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, int mi_row, int mi_col); diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c index 20c96761b..e2275a54f 100644 --- a/third_party/aom/av1/encoder/speed_features.c +++ b/third_party/aom/av1/encoder/speed_features.c @@ -139,8 +139,10 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, } } -static void set_good_speed_feature(AV1_COMP *cpi, AV1_COMMON *cm, - SPEED_FEATURES *sf, int speed) { +static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, + SPEED_FEATURES *sf, + int speed) { + AV1_COMMON *const cm = &cpi->common; const int boosted = frame_is_boosted(cpi); if (speed >= 1) { @@ -205,6 +207,9 @@ static void set_good_speed_feature(AV1_COMP *cpi, AV1_COMMON *cm, #if CONFIG_EXT_TX sf->tx_type_search.prune_mode = PRUNE_TWO; #endif +#if CONFIG_GLOBAL_MOTION + sf->gm_search_type = GM_DISABLE_SEARCH; +#endif // CONFIG_GLOBAL_MOTION } if (speed >= 4) { @@ -286,6 +291,12 @@ static void set_good_speed_feature(AV1_COMP *cpi, AV1_COMMON *cm, sf->coeff_prob_appx_step = 4; sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; } + if (speed >= 8) { + sf->mv.search_method = FAST_DIAMOND; + sf->mv.fullpel_search_step_param = 10; + sf->mv.subpel_force_stop = 2; + sf->lpf_pick = LPF_PICK_MINIMAL_LPF; + } } void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) { @@ -339,12 +350,13 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) { } void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { - SPEED_FEATURES *const sf = &cpi->sf; AV1_COMMON *const cm = &cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCK *const x = &cpi->td.mb; const AV1EncoderConfig *const oxcf = &cpi->oxcf; int i; + (void)cm; // best quality defaults sf->frame_parameter_update = 1; sf->mv.search_method = NSTEP; @@ -418,13 +430,16 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { // Set this at the appropriate speed levels sf->use_transform_domain_distortion = 0; +#if CONFIG_GLOBAL_MOTION + sf->gm_search_type = GM_FULL_SEARCH; +#endif // CONFIG_GLOBAL_MOTION if (oxcf->mode == GOOD #if CONFIG_XIPHRC || oxcf->pass == 1 #endif ) - set_good_speed_feature(cpi, cm, sf, oxcf->speed); + 
set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed); // sf->partition_search_breakout_dist_thr is set assuming max 64x64 // blocks. Normalise this if the blocks are bigger. diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h index af54a1a9a..5710d77c7 100644 --- a/third_party/aom/av1/encoder/speed_features.h +++ b/third_party/aom/av1/encoder/speed_features.h @@ -24,6 +24,9 @@ enum { (1 << D207_PRED) | (1 << D63_PRED) | #if CONFIG_ALT_INTRA (1 << SMOOTH_PRED) | +#if CONFIG_SMOOTH_HV + (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) | +#endif // CONFIG_SMOOTH_HV #endif // CONFIG_ALT_INTRA (1 << TM_PRED), INTRA_DC = (1 << DC_PRED), @@ -36,37 +39,33 @@ enum { #if CONFIG_EXT_INTER enum { INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV) | - (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | - (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) | (1 << NEW_NEWMV) | + (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << ZERO_ZEROMV), INTER_NEAREST = (1 << NEARESTMV) | (1 << NEAREST_NEARESTMV) | - (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) | (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV), INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) | - (1 << NEAR_NEARESTMV) | (1 << NEAREST_NEARMV) | (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) | (1 << NEAR_NEWMV), INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | - (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV), - INTER_NEAREST_NEW_ZERO = - (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV) | - (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | (1 << NEW_NEWMV) | - (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) | (1 << NEW_NEARESTMV) | - (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) | (1 << NEAR_NEWMV), - INTER_NEAREST_NEAR_NEW = - (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) | - (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) | (1 << NEAREST_NEARMV) | - (1 << NEAR_NEARESTMV) | (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) | - (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV), - INTER_NEAREST_NEAR_ZERO = - (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | - (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | (1 << NEAREST_NEARMV) | - (1 << NEAR_NEARESTMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) | - (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV), + INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV) | + (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | + (1 << NEW_NEWMV) | (1 << NEW_NEARESTMV) | + (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) | + (1 << NEAR_NEWMV), + INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) | + (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) | + (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) | + (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | + (1 << NEAR_NEARMV), + INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | + (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | + (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) | + (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | + (1 << NEAR_NEARMV), }; #else enum { @@ -196,14 +195,7 @@ typedef enum { // Always use a fixed size partition FIXED_PARTITION, - REFERENCE_PARTITION, - - // Use an arbitrary partitioning scheme based on source variance within - // a 64X64 SB - VAR_BASED_PARTITION, - - // Use 
non-fixed partitions based on source variance - SOURCE_VAR_BASED_PARTITION + REFERENCE_PARTITION } PARTITION_SEARCH_TYPE; typedef enum { @@ -251,6 +243,14 @@ typedef struct MESH_PATTERN { int interval; } MESH_PATTERN; +#if CONFIG_GLOBAL_MOTION +typedef enum { + GM_FULL_SEARCH, + GM_REDUCED_REF_SEARCH, + GM_DISABLE_SEARCH +} GM_SEARCH_TYPE; +#endif // CONFIG_GLOBAL_MOTION + typedef struct SPEED_FEATURES { MV_SPEED_FEATURES mv; @@ -432,7 +432,7 @@ typedef struct SPEED_FEATURES { // TODO(aconverse): Fold this into one of the other many mode skips BLOCK_SIZE max_intra_bsize; - // The frequency that we check if SOURCE_VAR_BASED_PARTITION or + // The frequency that we check if // FIXED_PARTITION search type should be used. int search_type_check_frequency; @@ -470,6 +470,10 @@ typedef struct SPEED_FEATURES { // Whether to compute distortion in the image domain (slower but // more accurate), or in the transform domain (faster but less acurate). int use_transform_domain_distortion; + +#if CONFIG_GLOBAL_MOTION + GM_SEARCH_TYPE gm_search_type; +#endif // CONFIG_GLOBAL_MOTION } SPEED_FEATURES; struct AV1_COMP; diff --git a/third_party/aom/av1/encoder/subexp.c b/third_party/aom/av1/encoder/subexp.c index 8960d3341..6a8ba12d8 100644 --- a/third_party/aom/av1/encoder/subexp.c +++ b/third_party/aom/av1/encoder/subexp.c @@ -179,83 +179,6 @@ int av1_prob_diff_update_savings_search_model(const unsigned int *ct, return bestsavings; } -#if CONFIG_SUBFRAME_PROB_UPDATE -static int get_cost(unsigned int ct[][2], aom_prob p, int n) { - int i, p0 = p; - unsigned int total_ct[2] = { 0, 0 }; - int cost = 0; - - for (i = 0; i <= n; ++i) { - cost += cost_branch256(ct[i], p); - total_ct[0] += ct[i][0]; - total_ct[1] += ct[i][1]; - if (i < n) - p = av1_merge_probs(p0, total_ct, COEF_COUNT_SAT, COEF_MAX_UPDATE_FACTOR); - } - return cost; -} - -int av1_prob_update_search_subframe(unsigned int ct[][2], aom_prob oldp, - aom_prob *bestp, aom_prob upd, int n) { - const int old_b = get_cost(ct, oldp, n); - int bestsavings = 0; - const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd); - aom_prob newp, bestnewp = oldp; - const int step = *bestp > oldp ? -1 : 1; - - for (newp = *bestp; newp != oldp; newp += step) { - const int new_b = get_cost(ct, newp, n); - const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost; - const int savings = old_b - new_b - update_b; - if (savings > bestsavings) { - bestsavings = savings; - bestnewp = newp; - } - } - *bestp = bestnewp; - return bestsavings; -} - -int av1_prob_update_search_model_subframe( - unsigned int ct[ENTROPY_NODES][COEF_PROBS_BUFS][2], const aom_prob *oldp, - aom_prob *bestp, aom_prob upd, int stepsize, int n) { - int i, old_b, new_b, update_b, savings, bestsavings; - int newp; - const int step_sign = *bestp > oldp[PIVOT_NODE] ? 
-1 : 1; - const int step = stepsize * step_sign; - const int upd_cost = av1_cost_one(upd) - av1_cost_zero(upd); - aom_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES]; - av1_model_to_full_probs(oldp, oldplist); - memcpy(newplist, oldp, sizeof(aom_prob) * UNCONSTRAINED_NODES); - for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i) - old_b += get_cost(ct[i], oldplist[i], n); - old_b += get_cost(ct[PIVOT_NODE], oldplist[PIVOT_NODE], n); - - bestsavings = 0; - bestnewp = oldp[PIVOT_NODE]; - - assert(stepsize > 0); - - for (newp = *bestp; (newp - oldp[PIVOT_NODE]) * step_sign < 0; newp += step) { - if (newp < 1 || newp > 255) continue; - newplist[PIVOT_NODE] = newp; - av1_model_to_full_probs(newplist, newplist); - for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i) - new_b += get_cost(ct[i], newplist[i], n); - new_b += get_cost(ct[PIVOT_NODE], newplist[PIVOT_NODE], n); - update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) + upd_cost; - savings = old_b - new_b - update_b; - if (savings > bestsavings) { - bestsavings = savings; - bestnewp = newp; - } - } - - *bestp = bestnewp; - return bestsavings; -} -#endif // CONFIG_SUBFRAME_PROB_UPDATE - void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp, const unsigned int ct[2], int probwt) { const aom_prob upd = DIFF_UPDATE_PROB; diff --git a/third_party/aom/av1/encoder/subexp.h b/third_party/aom/av1/encoder/subexp.h index 049265cb8..580edabdb 100644 --- a/third_party/aom/av1/encoder/subexp.h +++ b/third_party/aom/av1/encoder/subexp.h @@ -35,13 +35,6 @@ int av1_prob_diff_update_savings_search_model(const unsigned int *ct, int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2], int probwt); -#if CONFIG_SUBFRAME_PROB_UPDATE -int av1_prob_update_search_subframe(unsigned int ct[][2], aom_prob oldp, - aom_prob *bestp, aom_prob upd, int n); -int av1_prob_update_search_model_subframe( - unsigned int ct[ENTROPY_NODES][COEF_PROBS_BUFS][2], const aom_prob *oldp, - aom_prob *bestp, aom_prob upd, int stepsize, int n); -#endif // CONFIG_SUBFRAME_PROB_UPDATE #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c index de962fe84..1ed1ebdb2 100644 --- a/third_party/aom/av1/encoder/temporal_filter.c +++ b/third_party/aom/av1/encoder/temporal_filter.c @@ -281,14 +281,10 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1); -#if CONFIG_REF_MV x->mvcost = x->mv_cost_stack[0]; x->nmvjointcost = x->nmv_vec_cost[0]; - x->mvsadcost = x->mvcost; - x->nmvjointsadcost = x->nmvjointcost; -#endif - // Ignore mv costing by sending NULL pointer instead of cost arrays + // Use mv costing from x->mvcost directly av1_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1, cond_cost_list(cpi, cost_list), &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1); @@ -299,8 +295,11 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, bestsme = cpi->find_fractional_mv_step( x, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step, - cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, - 0); + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, +#if CONFIG_EXT_INTER + NULL, 0, 0, +#endif + 0, 0, 0); x->e_mbd.mi[0]->bmi[0].as_mv[0] = x->best_mv; diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c index 
f48493bf8..18d2cd958 100644 --- a/third_party/aom/av1/encoder/tokenize.c +++ b/third_party/aom/av1/encoder/tokenize.c @@ -23,6 +23,9 @@ #include "av1/encoder/cost.h" #include "av1/encoder/encoder.h" +#if CONFIG_LV_MAP +#include "av1/encoder/encodetxb.c" +#endif #include "av1/encoder/rdopt.h" #include "av1/encoder/tokenize.h" @@ -261,20 +264,6 @@ const av1_extra_bit av1_extra_bits[ENTROPY_TOKENS] = { }; #endif -#if !CONFIG_EC_MULTISYMBOL -const struct av1_token av1_coef_encodings[ENTROPY_TOKENS] = { - { 2, 2 }, { 6, 3 }, { 28, 5 }, { 58, 6 }, { 59, 6 }, { 60, 6 }, - { 61, 6 }, { 124, 7 }, { 125, 7 }, { 126, 7 }, { 127, 7 }, { 0, 1 } -}; -#endif // !CONFIG_EC_MULTISYMBOL - -struct tokenize_b_args { - const AV1_COMP *cpi; - ThreadData *td; - TOKENEXTRA **tp; - int this_rate; -}; - #if !CONFIG_PVQ || CONFIG_VAR_TX static void cost_coeffs_b(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { @@ -314,7 +303,6 @@ static void set_entropy_context_b(int plane, int block, int blk_row, blk_row); } -#if CONFIG_NEW_TOKENSET static INLINE void add_token(TOKENEXTRA **t, aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)], aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)], @@ -328,25 +316,6 @@ static INLINE void add_token(TOKENEXTRA **t, (*t)->first_val = first_val; (*t)++; } - -#else // CONFIG_NEW_TOKENSET -static INLINE void add_token( - TOKENEXTRA **t, const aom_prob *context_tree, -#if CONFIG_EC_MULTISYMBOL - aom_cdf_prob (*token_cdf)[CDF_SIZE(ENTROPY_TOKENS)], -#endif // CONFIG_EC_MULTISYMBOL - int32_t extra, uint8_t token, uint8_t skip_eob_node, unsigned int *counts) { - (*t)->token = token; - (*t)->extra = extra; - (*t)->context_tree = context_tree; -#if CONFIG_EC_MULTISYMBOL - (*t)->token_cdf = token_cdf; -#endif // CONFIG_EC_MULTISYMBOL - (*t)->skip_eob_node = skip_eob_node; - (*t)++; - ++counts[token]; -} -#endif // CONFIG_NEW_TOKENSET #endif // !CONFIG_PVQ || CONFIG_VAR_TX #if CONFIG_PALETTE @@ -471,22 +440,11 @@ static void tokenize_b(int plane, int block, int blk_row, int blk_col, const int ref = is_inter_block(mbmi); unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] = td->rd_counts.coef_counts[txsize_sqr_map[tx_size]][type][ref]; -#if !CONFIG_NEW_TOKENSET -#if CONFIG_SUBFRAME_PROB_UPDATE - const aom_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = - cpi->subframe_stats.coef_probs_buf[cpi->common.coef_probs_update_idx] - [txsize_sqr_map[tx_size]][type][ref]; -#else - aom_prob(*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = - cpi->common.fc->coef_probs[txsize_sqr_map[tx_size]][type][ref]; -#endif // CONFIG_SUBFRAME_PROB_UPDATE -#endif // !CONFIG_NEW_TOKENSET #if CONFIG_EC_ADAPT FRAME_CONTEXT *ec_ctx = xd->tile_ctx; -#elif CONFIG_EC_MULTISYMBOL +#else FRAME_CONTEXT *ec_ctx = cpi->common.fc; #endif -#if CONFIG_NEW_TOKENSET aom_cdf_prob( *const coef_head_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] = ec_ctx->coef_head_cdfs[txsize_sqr_map[tx_size]][type][ref]; @@ -497,13 +455,6 @@ static void tokenize_b(int plane, int block, int blk_row, int blk_col, td->counts->blockz_count[txsize_sqr_map[tx_size]][type][ref]; int eob_val; int first_val = 1; -#else -#if CONFIG_EC_MULTISYMBOL - aom_cdf_prob(*const coef_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] = - ec_ctx->coef_cdfs[txsize_sqr_map[tx_size]][type][ref]; -#endif - int skip_eob = 0; -#endif const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size); unsigned int(*const eob_branch)[COEFF_CONTEXTS] = 
td->counts->eob_branch[txsize_sqr_map[tx_size]][type][ref]; @@ -517,7 +468,6 @@ static void tokenize_b(int plane, int block, int blk_row, int blk_col, nb = scan_order->neighbors; c = 0; -#if CONFIG_NEW_TOKENSET if (eob == 0) add_token(&t, &coef_tail_cdfs[band[c]][pt], &coef_head_cdfs[band[c]][pt], 1, 1, 0, BLOCK_Z_TOKEN); @@ -553,33 +503,6 @@ static void tokenize_b(int plane, int block, int blk_row, int blk_col, ++c; pt = get_coef_context(nb, token_cache, AOMMIN(c, eob - 1)); } -#else - while (c < eob) { - const int v = qcoeff[scan[c]]; - eob_branch[band[c]][pt] += !skip_eob; - - av1_get_token_extra(v, &token, &extra); - - add_token(&t, coef_probs[band[c]][pt], -#if CONFIG_EC_MULTISYMBOL - &coef_cdfs[band[c]][pt], -#endif - extra, (uint8_t)token, (uint8_t)skip_eob, counts[band[c]][pt]); - - token_cache[scan[c]] = av1_pt_energy_class[token]; - ++c; - pt = get_coef_context(nb, token_cache, c); - skip_eob = (token == ZERO_TOKEN); - } - if (c < seg_eob) { - add_token(&t, coef_probs[band[c]][pt], -#if CONFIG_EC_MULTISYMBOL - NULL, -#endif - 0, EOB_TOKEN, 0, counts[band[c]][pt]); - ++eob_branch[band[c]][pt]; - } -#endif // CONFIG_NEW_TOKENSET #if CONFIG_COEF_INTERLEAVE t->token = EOSB_TOKEN; @@ -651,6 +574,18 @@ void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, if (tx_size == plane_tx_size) { plane_bsize = get_plane_block_size(mbmi->sb_type, pd); +#if CONFIG_LV_MAP + if (!dry_run) { + av1_update_and_record_txb_context(plane, block, blk_row, blk_col, + plane_bsize, tx_size, arg); + } else if (dry_run == DRY_RUN_NORMAL) { + av1_update_txb_context_b(plane, block, blk_row, blk_col, plane_bsize, + tx_size, arg); + } else { + printf("DRY_RUN_COSTCOEFFS is not supported yet\n"); + assert(0); + } +#else if (!dry_run) tokenize_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); else if (dry_run == DRY_RUN_NORMAL) @@ -658,6 +593,7 @@ void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, tx_size, arg); else if (dry_run == DRY_RUN_COSTCOEFFS) cost_coeffs_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); +#endif } else { // Half the block size in transform block unit. 
const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; @@ -688,7 +624,11 @@ void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; +#if CONFIG_LV_MAP + (void)t; +#else TOKENEXTRA *t_backup = *t; +#endif const int ctx = av1_get_skip_context(xd); const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); @@ -698,22 +638,25 @@ void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, if (mbmi->skip) { if (!dry_run) td->counts->skip[ctx][1] += skip_inc; - reset_skip_context(xd, bsize); + av1_reset_skip_context(xd, mi_row, mi_col, bsize); +#if !CONFIG_LV_MAP if (dry_run) *t = t_backup; +#endif return; } - if (!dry_run) - td->counts->skip[ctx][0] += skip_inc; + if (!dry_run) td->counts->skip[ctx][0] += skip_inc; +#if !CONFIG_LV_MAP else *t = t_backup; +#endif for (plane = 0; plane < MAX_MB_PLANE; ++plane) { #if CONFIG_CB4X4 if (!is_chroma_reference(mi_row, mi_col, bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y)) { -#if !CONFIG_PVQ +#if !CONFIG_PVQ || !CONFIG_LV_MAP if (!dry_run) { (*t)->token = EOSB_TOKEN; (*t)++; @@ -746,10 +689,12 @@ void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, } } +#if !CONFIG_LV_MAP if (!dry_run) { (*t)->token = EOSB_TOKEN; (*t)++; } +#endif } if (rate) *rate += arg.this_rate; } @@ -768,7 +713,7 @@ void av1_tokenize_sb(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, struct tokenize_b_args arg = { cpi, td, t, 0 }; if (mbmi->skip) { if (!dry_run) td->counts->skip[ctx][1] += skip_inc; - reset_skip_context(xd, bsize); + av1_reset_skip_context(xd, mi_row, mi_col, bsize); return; } @@ -843,8 +788,8 @@ void av1_tokenize_sb(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, #if CONFIG_SUPERTX void av1_tokenize_sb_supertx(const AV1_COMP *cpi, ThreadData *td, - TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize, - int *rate) { + TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *rate) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &td->mb.e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; @@ -855,7 +800,7 @@ void av1_tokenize_sb_supertx(const AV1_COMP *cpi, ThreadData *td, struct tokenize_b_args arg = { cpi, td, t, 0 }; if (mbmi->skip) { if (!dry_run) td->counts->skip[ctx][1] += skip_inc; - reset_skip_context(xd, bsize); + av1_reset_skip_context(xd, mi_row, mi_col, bsize); if (dry_run) *t = t_backup; return; } diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h index 3928111d6..cbfa3cd91 100644 --- a/third_party/aom/av1/encoder/tokenize.h +++ b/third_party/aom/av1/encoder/tokenize.h @@ -35,14 +35,10 @@ typedef struct { } TOKENVALUE; typedef struct { -#if CONFIG_NEW_TOKENSET aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)]; aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)]; int eob_val; int first_val; -#elif CONFIG_EC_MULTISYMBOL - aom_cdf_prob (*token_cdf)[CDF_SIZE(ENTROPY_TOKENS)]; -#endif const aom_prob *context_tree; EXTRABIT extra; uint8_t token; @@ -51,15 +47,19 @@ typedef struct { extern const aom_tree_index av1_coef_tree[]; extern const aom_tree_index av1_coef_con_tree[]; -#if !CONFIG_EC_MULTISYMBOL -extern const struct av1_token av1_coef_encodings[]; -#endif // !CONFIG_EC_MULTISYMBOL int av1_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); struct AV1_COMP; struct ThreadData; +struct tokenize_b_args { + const struct AV1_COMP 
*cpi; + struct ThreadData *td; + TOKENEXTRA **tp; + int this_rate; +}; + typedef enum { OUTPUT_ENABLED = 0, DRY_RUN_NORMAL, @@ -85,8 +85,8 @@ void av1_tokenize_sb(const struct AV1_COMP *cpi, struct ThreadData *td, int *rate, const int mi_row, const int mi_col); #if CONFIG_SUPERTX void av1_tokenize_sb_supertx(const struct AV1_COMP *cpi, struct ThreadData *td, - TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize, - int *rate); + TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *rate); #endif extern const int16_t *av1_dct_value_cost_ptr; diff --git a/third_party/aom/av1/encoder/variance_tree.c b/third_party/aom/av1/encoder/variance_tree.c deleted file mode 100644 index 9384cd78e..000000000 --- a/third_party/aom/av1/encoder/variance_tree.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "av1/encoder/variance_tree.h" -#include "av1/encoder/encoder.h" - -void av1_setup_var_tree(struct AV1Common *cm, ThreadData *td) { - int i, j; -#if CONFIG_EXT_PARTITION - const int leaf_nodes = 1024; - const int tree_nodes = 1024 + 256 + 64 + 16 + 4 + 1; -#else - const int leaf_nodes = 256; - const int tree_nodes = 256 + 64 + 16 + 4 + 1; -#endif // CONFIG_EXT_PARTITION - int index = 0; - VAR_TREE *this_var; - int nodes; - - aom_free(td->var_tree); - CHECK_MEM_ERROR(cm, td->var_tree, - aom_calloc(tree_nodes, sizeof(*td->var_tree))); - - this_var = &td->var_tree[0]; - - // Sets up all the leaf nodes in the tree. - for (index = 0; index < leaf_nodes; ++index) { - VAR_TREE *const leaf = &td->var_tree[index]; - leaf->split[0] = NULL; - } - - // Each node has 4 leaf nodes, fill in the child pointers - // from leafs to the root. - for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) { - for (i = 0; i < nodes; ++i, ++index) { - VAR_TREE *const node = &td->var_tree[index]; - for (j = 0; j < 4; j++) node->split[j] = this_var++; - } - } - - // Set up the root node for the largest superblock size - i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2; - td->var_root[i] = &td->var_tree[tree_nodes - 1]; - // Set up the root nodes for the rest of the possible superblock sizes - while (--i >= 0) { - td->var_root[i] = td->var_root[i + 1]->split[0]; - } -} - -void av1_free_var_tree(ThreadData *td) { - aom_free(td->var_tree); - td->var_tree = NULL; -} diff --git a/third_party/aom/av1/encoder/variance_tree.h b/third_party/aom/av1/encoder/variance_tree.h deleted file mode 100644 index a9f27302e..000000000 --- a/third_party/aom/av1/encoder/variance_tree.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AV1_ENCODER_VARIANCE_TREE_H_ -#define AV1_ENCODER_VARIANCE_TREE_H_ - -#include <assert.h> - -#include "./aom_config.h" - -#include "aom/aom_integer.h" - -#include "av1/common/enums.h" - -#ifdef __cplusplus -extern "C" { -#endif - -struct AV1Common; -struct ThreadData; - -typedef struct { - int64_t sum_square_error; - int64_t sum_error; - int log2_count; - int variance; -} VAR; - -typedef struct { - VAR none; - VAR horz[2]; - VAR vert[2]; -} partition_variance; - -typedef struct VAR_TREE { - int force_split; - partition_variance variances; - struct VAR_TREE *split[4]; - BLOCK_SIZE bsize; - const uint8_t *src; - const uint8_t *ref; - int src_stride; - int ref_stride; - int width; - int height; -#if CONFIG_HIGHBITDEPTH - int highbd; -#endif // CONFIG_HIGHBITDEPTH -} VAR_TREE; - -void av1_setup_var_tree(struct AV1Common *cm, struct ThreadData *td); -void av1_free_var_tree(struct ThreadData *td); - -// Set variance values given sum square error, sum error, count. -static INLINE void fill_variance(int64_t s2, int64_t s, int c, VAR *v) { - v->sum_square_error = s2; - v->sum_error = s; - v->log2_count = c; - v->variance = - (int)(256 * (v->sum_square_error - - ((v->sum_error * v->sum_error) >> v->log2_count)) >> - v->log2_count); -} - -static INLINE void sum_2_variances(const VAR *a, const VAR *b, VAR *r) { - assert(a->log2_count == b->log2_count); - fill_variance(a->sum_square_error + b->sum_square_error, - a->sum_error + b->sum_error, a->log2_count + 1, r); -} - -static INLINE void fill_variance_node(VAR_TREE *vt) { - sum_2_variances(&vt->split[0]->variances.none, &vt->split[1]->variances.none, - &vt->variances.horz[0]); - sum_2_variances(&vt->split[2]->variances.none, &vt->split[3]->variances.none, - &vt->variances.horz[1]); - sum_2_variances(&vt->split[0]->variances.none, &vt->split[2]->variances.none, - &vt->variances.vert[0]); - sum_2_variances(&vt->split[1]->variances.none, &vt->split[3]->variances.none, - &vt->variances.vert[1]); - sum_2_variances(&vt->variances.vert[0], &vt->variances.vert[1], - &vt->variances.none); -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif /* AV1_ENCODER_VARIANCE_TREE_H_ */ diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c index f9c95b6cb..190317389 100644 --- a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c +++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c @@ -15,13 +15,65 @@ #include "./av1_rtcd.h" #include "aom/aom_integer.h" -void av1_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, +static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, + __m128i *c0, __m128i *c1) { + const tran_low_t *addr = coeff + offset; +#if CONFIG_HIGHBITDEPTH + const __m128i x0 = _mm_load_si128((const __m128i *)addr); + const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1); + const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2); + const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3); + *c0 = _mm_packs_epi32(x0, x1); + *c1 = _mm_packs_epi32(x2, x3); +#else + *c0 = _mm_load_si128((const __m128i *)addr); + *c1 = _mm_load_si128((const __m128i *)addr + 1); +#endif +} + +static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1, + tran_low_t *qcoeff, intptr_t offset) { + tran_low_t *addr = qcoeff + offset; +#if CONFIG_HIGHBITDEPTH + const __m128i 
zero = _mm_setzero_si128(); + __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero); + __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits); + __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits); + _mm_store_si128((__m128i *)addr, y0); + _mm_store_si128((__m128i *)addr + 1, y1); + + sign_bits = _mm_cmplt_epi16(*qc1, zero); + y0 = _mm_unpacklo_epi16(*qc1, sign_bits); + y1 = _mm_unpackhi_epi16(*qc1, sign_bits); + _mm_store_si128((__m128i *)addr + 2, y0); + _mm_store_si128((__m128i *)addr + 3, y1); +#else + _mm_store_si128((__m128i *)addr, *qc0); + _mm_store_si128((__m128i *)addr + 1, *qc1); +#endif +} + +static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) { + const __m128i zero = _mm_setzero_si128(); + tran_low_t *addr = qcoeff + offset; +#if CONFIG_HIGHBITDEPTH + _mm_store_si128((__m128i *)addr, zero); + _mm_store_si128((__m128i *)addr + 1, zero); + _mm_store_si128((__m128i *)addr + 2, zero); + _mm_store_si128((__m128i *)addr + 3, zero); +#else + _mm_store_si128((__m128i *)addr, zero); + _mm_store_si128((__m128i *)addr + 1, zero); +#endif +} + +void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { __m128i zero; __m128i thr; int16_t nzflag; @@ -54,8 +106,7 @@ void av1_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, __m128i qcoeff0, qcoeff1; __m128i qtmp0, qtmp1; // Do DC and first 15 AC - coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); + read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -78,15 +129,13 @@ void av1_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); } { @@ -121,8 +170,7 @@ void av1_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, __m128i qcoeff0, qcoeff1; __m128i qtmp0, qtmp1; - coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); + read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -147,20 +195,15 @@ void av1_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); 
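[Editor's note] The new read_coeff()/write_qcoeff()/write_zero() helpers above exist so the 16-bit SSE2 kernel keeps working when CONFIG_HIGHBITDEPTH makes tran_low_t 32 bits wide: coefficients are saturated down to 16-bit lanes on load and sign-extended back on store. A minimal scalar sketch of that load/store contract follows; the helper names and the int32_t-tran_low_t assumption are illustrative, not part of the patch.

#include <stdint.h>

/* Load: saturate 32-bit coefficients down to the 16-bit lanes the kernel
   operates on (this mirrors the _mm_packs_epi32 pairs in read_coeff). */
static void read_coeff_scalar(const int32_t *coeff, int16_t *lanes, int n) {
  for (int i = 0; i < n; ++i) {
    int32_t v = coeff[i];
    if (v > INT16_MAX) v = INT16_MAX;
    if (v < INT16_MIN) v = INT16_MIN;
    lanes[i] = (int16_t)v;
  }
}

/* Store: sign-extend the 16-bit results back to 32-bit storage (the
   _mm_cmplt_epi16 / unpack pairs in write_qcoeff build the high halves). */
static void write_qcoeff_scalar(const int16_t *lanes, int32_t *qcoeff, int n) {
  for (int i = 0; i < n; ++i) qcoeff[i] = lanes[i];
}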
coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); } else { - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); - - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); + write_zero(qcoeff_ptr, n_coeffs); + write_zero(dqcoeff_ptr, n_coeffs); } } @@ -200,10 +243,8 @@ void av1_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, } } else { do { - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + write_zero(dqcoeff_ptr, n_coeffs); + write_zero(qcoeff_ptr, n_coeffs); n_coeffs += 8 * 2; } while (n_coeffs < 0); *eob_ptr = 0; diff --git a/third_party/aom/av1/encoder/x86/corner_match_sse4.c b/third_party/aom/av1/encoder/x86/corner_match_sse4.c new file mode 100644 index 000000000..179da0d28 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/corner_match_sse4.c @@ -0,0 +1,91 @@ +#include <stdlib.h> +#include <memory.h> +#include <math.h> +#include <assert.h> + +#include <smmintrin.h> + +#include "./av1_rtcd.h" +#include "aom_ports/mem.h" +#include "av1/encoder/corner_match.h" + +DECLARE_ALIGNED(16, static const uint8_t, byte_mask[16]) = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0 +}; +#if MATCH_SZ != 13 +#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13" +#endif + +/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the + correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows + of each image, centered at (x1, y1) and (x2, y2) respectively. 
+*/ +double compute_cross_correlation_sse4_1(unsigned char *im1, int stride1, int x1, + int y1, unsigned char *im2, int stride2, + int x2, int y2) { + int i; + // 2 16-bit partial sums in lanes 0, 4 (== 2 32-bit partial sums in lanes 0, + // 2) + __m128i sum1_vec = _mm_setzero_si128(); + __m128i sum2_vec = _mm_setzero_si128(); + // 4 32-bit partial sums of squares + __m128i sumsq2_vec = _mm_setzero_si128(); + __m128i cross_vec = _mm_setzero_si128(); + + const __m128i mask = _mm_load_si128((__m128i *)byte_mask); + const __m128i zero = _mm_setzero_si128(); + + im1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); + im2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); + + for (i = 0; i < MATCH_SZ; ++i) { + const __m128i v1 = + _mm_and_si128(_mm_loadu_si128((__m128i *)&im1[i * stride1]), mask); + const __m128i v2 = + _mm_and_si128(_mm_loadu_si128((__m128i *)&im2[i * stride2]), mask); + + // Using the 'sad' intrinsic here is a bit faster than adding + // v1_l + v1_r and v2_l + v2_r, plus it avoids the need for a 16->32 bit + // conversion step later, for a net speedup of ~10% + sum1_vec = _mm_add_epi16(sum1_vec, _mm_sad_epu8(v1, zero)); + sum2_vec = _mm_add_epi16(sum2_vec, _mm_sad_epu8(v2, zero)); + + const __m128i v1_l = _mm_cvtepu8_epi16(v1); + const __m128i v1_r = _mm_cvtepu8_epi16(_mm_srli_si128(v1, 8)); + const __m128i v2_l = _mm_cvtepu8_epi16(v2); + const __m128i v2_r = _mm_cvtepu8_epi16(_mm_srli_si128(v2, 8)); + + sumsq2_vec = _mm_add_epi32( + sumsq2_vec, + _mm_add_epi32(_mm_madd_epi16(v2_l, v2_l), _mm_madd_epi16(v2_r, v2_r))); + cross_vec = _mm_add_epi32( + cross_vec, + _mm_add_epi32(_mm_madd_epi16(v1_l, v2_l), _mm_madd_epi16(v1_r, v2_r))); + } + + // Now we can treat the four registers (sum1_vec, sum2_vec, sumsq2_vec, + // cross_vec) + // as holding 4 32-bit elements each, which we want to sum horizontally. + // We do this by transposing and then summing vertically. 
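[Editor's note] For reference, a plain scalar version of the window sums this loop accumulates and of the reduction that follows it. This is an illustrative sketch only: it assumes the MATCH_SZ / MATCH_SZ_BY2 / MATCH_SZ_SQ macros from av1/encoder/corner_match.h (already included by this file), and the function name is made up.

#include <math.h>

static double cross_correlation_scalar(const unsigned char *im1, int stride1,
                                       int x1, int y1,
                                       const unsigned char *im2, int stride2,
                                       int x2, int y2) {
  int sum1 = 0, sum2 = 0, sumsq2 = 0, cross = 0;
  for (int i = 0; i < MATCH_SZ; ++i) {
    for (int j = 0; j < MATCH_SZ; ++j) {
      const int v1 =
          im1[(y1 - MATCH_SZ_BY2 + i) * stride1 + (x1 - MATCH_SZ_BY2 + j)];
      const int v2 =
          im2[(y2 - MATCH_SZ_BY2 + i) * stride2 + (x2 - MATCH_SZ_BY2 + j)];
      sum1 += v1;        /* window sum of im1   (sum1_vec)   */
      sum2 += v2;        /* window sum of im2   (sum2_vec)   */
      sumsq2 += v2 * v2; /* sum of im2 squares  (sumsq2_vec) */
      cross += v1 * v2;  /* cross term          (cross_vec)  */
    }
  }
  /* Same final reduction as the SIMD path below. */
  const int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
  const int cov = cross * MATCH_SZ_SQ - sum1 * sum2;
  return cov / sqrt((double)var2);
}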
+ __m128i tmp_0 = _mm_unpacklo_epi32(sum1_vec, sum2_vec); + __m128i tmp_1 = _mm_unpackhi_epi32(sum1_vec, sum2_vec); + __m128i tmp_2 = _mm_unpacklo_epi32(sumsq2_vec, cross_vec); + __m128i tmp_3 = _mm_unpackhi_epi32(sumsq2_vec, cross_vec); + + __m128i tmp_4 = _mm_unpacklo_epi64(tmp_0, tmp_2); + __m128i tmp_5 = _mm_unpackhi_epi64(tmp_0, tmp_2); + __m128i tmp_6 = _mm_unpacklo_epi64(tmp_1, tmp_3); + __m128i tmp_7 = _mm_unpackhi_epi64(tmp_1, tmp_3); + + __m128i res = + _mm_add_epi32(_mm_add_epi32(tmp_4, tmp_5), _mm_add_epi32(tmp_6, tmp_7)); + + int sum1 = _mm_extract_epi32(res, 0); + int sum2 = _mm_extract_epi32(res, 1); + int sumsq2 = _mm_extract_epi32(res, 2); + int cross = _mm_extract_epi32(res, 3); + + int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; + int cov = cross * MATCH_SZ_SQ - sum1 * sum2; + return cov / sqrt((double)var2); +} diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c index f201a29aa..b56eed518 100644 --- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c +++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c @@ -13,7 +13,7 @@ #include "./av1_rtcd.h" #include "./aom_config.h" -#include "av1/common/av1_fwd_txfm2d_cfg.h" +#include "av1/common/av1_fwd_txfm1d_cfg.h" #include "av1/common/av1_txfm.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" #include "aom_dsp/txfm_common.h" @@ -58,7 +58,7 @@ static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, // shift[1] is used in txfm_func_col() // shift[2] is used in txfm_func_row() static void fdct4x4_sse4_1(__m128i *in, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); @@ -133,7 +133,7 @@ void av1_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output, } static void fadst4x4_sse4_1(__m128i *in, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); @@ -209,71 +209,81 @@ static void fadst4x4_sse4_1(__m128i *in, int bit) { void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff, int input_stride, int tx_type, int bd) { __m128i in[4]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG *row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &fwd_txfm_2d_cfg_dct_dct_4; - load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]); - fdct4x4_sse4_1(in, cfg->cos_bit_col[2]); - fdct4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_4; + col_cfg = &fwd_txfm_1d_col_cfg_dct_4; + load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); + fdct4x4_sse4_1(in, col_cfg->cos_bit[2]); + fdct4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; case ADST_DCT: - cfg = &fwd_txfm_2d_cfg_adst_dct_4; - load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]); - fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - fdct4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_4; + col_cfg = &fwd_txfm_1d_col_cfg_adst_4; + load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); + fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + fdct4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; case DCT_ADST: - cfg = 
&fwd_txfm_2d_cfg_dct_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]); - fdct4x4_sse4_1(in, cfg->cos_bit_col[2]); - fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_4; + col_cfg = &fwd_txfm_1d_col_cfg_dct_4; + load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); + fdct4x4_sse4_1(in, col_cfg->cos_bit[2]); + fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; case ADST_ADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]); - fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_4; + col_cfg = &fwd_txfm_1d_col_cfg_adst_4; + load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]); + fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; #if CONFIG_EXT_TX case FLIPADST_DCT: - cfg = &fwd_txfm_2d_cfg_adst_dct_4; - load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]); - fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - fdct4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_4; + col_cfg = &fwd_txfm_1d_col_cfg_adst_4; + load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]); + fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + fdct4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; case DCT_FLIPADST: - cfg = &fwd_txfm_2d_cfg_dct_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]); - fdct4x4_sse4_1(in, cfg->cos_bit_col[2]); - fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_4; + col_cfg = &fwd_txfm_1d_col_cfg_dct_4; + load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]); + fdct4x4_sse4_1(in, col_cfg->cos_bit[2]); + fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; case FLIPADST_FLIPADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_4; - load_buffer_4x4(input, in, input_stride, 1, 1, cfg->shift[0]); - fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_4; + col_cfg = &fwd_txfm_1d_col_cfg_adst_4; + load_buffer_4x4(input, in, input_stride, 1, 1, row_cfg->shift[0]); + fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; case ADST_FLIPADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_4; - load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]); - fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_4; + col_cfg = &fwd_txfm_1d_col_cfg_adst_4; + load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]); + fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; case FLIPADST_ADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_4; - load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]); - fadst4x4_sse4_1(in, cfg->cos_bit_col[2]); - fadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_4; + col_cfg = &fwd_txfm_1d_col_cfg_adst_4; + load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]); + fadst4x4_sse4_1(in, col_cfg->cos_bit[2]); + fadst4x4_sse4_1(in, row_cfg->cos_bit[2]); write_buffer_4x4(in, coeff); break; #endif @@ -429,7 +439,7 @@ static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) { } static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { - 
const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); @@ -625,7 +635,7 @@ static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) { } static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); @@ -930,97 +940,107 @@ static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) { void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, int tx_type, int bd) { __m128i in[16], out[16]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG *row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &fwd_txfm_2d_cfg_dct_dct_8; - load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); - fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_8; + col_cfg = &fwd_txfm_1d_col_cfg_dct_8; + load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); + fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_DCT: - cfg = &fwd_txfm_2d_cfg_adst_dct_8; - load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); - fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_8; + col_cfg = &fwd_txfm_1d_col_cfg_adst_8; + load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); + fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case DCT_ADST: - cfg = &fwd_txfm_2d_cfg_dct_adst_8; - load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); - fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_8; + col_cfg = &fwd_txfm_1d_col_cfg_dct_8; + load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); + fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_ADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_8; - load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]); - fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_8; + col_cfg = &fwd_txfm_1d_col_cfg_adst_8; + load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]); + fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; #if CONFIG_EXT_TX case FLIPADST_DCT: - cfg = 
&fwd_txfm_2d_cfg_adst_dct_8; - load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]); - fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_8; + col_cfg = &fwd_txfm_1d_col_cfg_adst_8; + load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]); + fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case DCT_FLIPADST: - cfg = &fwd_txfm_2d_cfg_dct_adst_8; - load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]); - fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_8; + col_cfg = &fwd_txfm_1d_col_cfg_dct_8; + load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]); + fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_FLIPADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_8; - load_buffer_8x8(input, in, stride, 1, 1, cfg->shift[0]); - fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_8; + col_cfg = &fwd_txfm_1d_col_cfg_adst_8; + load_buffer_8x8(input, in, stride, 1, 1, row_cfg->shift[0]); + fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case ADST_FLIPADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_8; - load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]); - fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_8; + col_cfg = &fwd_txfm_1d_col_cfg_adst_8; + load_buffer_8x8(input, in, stride, 0, 1, row_cfg->shift[0]); + fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; case FLIPADST_ADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_8; - load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]); - fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]); - col_txfm_8x8_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_8; + col_cfg = &fwd_txfm_1d_col_cfg_adst_8; + load_buffer_8x8(input, in, stride, 1, 0, row_cfg->shift[0]); + fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]); + col_txfm_8x8_rounding(out, -row_cfg->shift[1]); transpose_8x8(out, in); - fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]); + fadst8x8_sse4_1(in, out, row_cfg->cos_bit[2]); transpose_8x8(out, in); write_buffer_8x8(in, coeff); break; @@ -1107,7 +1127,7 @@ static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out, } static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); const __m128i cospi48 = 
_mm_set1_epi32(cospi[48]); @@ -1393,7 +1413,7 @@ static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) { } static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) { - const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const int32_t *cospi = cospi_arr(bit); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); const __m128i cospi10 = _mm_set1_epi32(cospi[10]); @@ -1794,97 +1814,107 @@ static void write_buffer_16x16(const __m128i *in, tran_low_t *output) { void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, int stride, int tx_type, int bd) { __m128i in[64], out[64]; - const TXFM_2D_CFG *cfg = NULL; + const TXFM_1D_CFG *row_cfg = NULL; + const TXFM_1D_CFG *col_cfg = NULL; switch (tx_type) { case DCT_DCT: - cfg = &fwd_txfm_2d_cfg_dct_dct_16; - load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]); - fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_16; + col_cfg = &fwd_txfm_1d_col_cfg_dct_16; + load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); + fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_DCT: - cfg = &fwd_txfm_2d_cfg_adst_dct_16; - load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]); - fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_16; + col_cfg = &fwd_txfm_1d_col_cfg_adst_16; + load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); + fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case DCT_ADST: - cfg = &fwd_txfm_2d_cfg_dct_adst_16; - load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]); - fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_16; + col_cfg = &fwd_txfm_1d_col_cfg_dct_16; + load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); + fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_ADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_16; - load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]); - fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_16; + col_cfg = &fwd_txfm_1d_col_cfg_adst_16; + load_buffer_16x16(input, in, stride, 0, 0, row_cfg->shift[0]); + fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; #if CONFIG_EXT_TX case FLIPADST_DCT: - cfg = &fwd_txfm_2d_cfg_adst_dct_16; - load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]); - 
fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_dct_16; + col_cfg = &fwd_txfm_1d_col_cfg_adst_16; + load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]); + fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fdct16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case DCT_FLIPADST: - cfg = &fwd_txfm_2d_cfg_dct_adst_16; - load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]); - fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_16; + col_cfg = &fwd_txfm_1d_col_cfg_dct_16; + load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]); + fdct16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_FLIPADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_16; - load_buffer_16x16(input, in, stride, 1, 1, cfg->shift[0]); - fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_16; + col_cfg = &fwd_txfm_1d_col_cfg_adst_16; + load_buffer_16x16(input, in, stride, 1, 1, row_cfg->shift[0]); + fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case ADST_FLIPADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_16; - load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]); - fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_16; + col_cfg = &fwd_txfm_1d_col_cfg_adst_16; + load_buffer_16x16(input, in, stride, 0, 1, row_cfg->shift[0]); + fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; case FLIPADST_ADST: - cfg = &fwd_txfm_2d_cfg_adst_adst_16; - load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]); - fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]); - col_txfm_16x16_rounding(out, -cfg->shift[1]); + row_cfg = &fwd_txfm_1d_row_cfg_adst_16; + col_cfg = &fwd_txfm_1d_col_cfg_adst_16; + load_buffer_16x16(input, in, stride, 1, 0, row_cfg->shift[0]); + fadst16x16_sse4_1(in, out, col_cfg->cos_bit[0]); + col_txfm_16x16_rounding(out, -row_cfg->shift[1]); transpose_16x16(out, in); - fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]); + fadst16x16_sse4_1(in, out, row_cfg->cos_bit[0]); transpose_16x16(out, in); write_buffer_16x16(in, coeff); break; diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c index 198e4e4c4..8495ad1aa 100644 --- a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c +++ b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c @@ -269,8 +269,8 @@ static void fdct16_avx2(__m256i *in) { x0 = 
_mm256_unpacklo_epi16(v0, v1); x1 = _mm256_unpackhi_epi16(v0, v1); - t0 = butter_fly(x0, x1, cospi_p16_p16); - t1 = butter_fly(x0, x1, cospi_p16_m16); + t0 = butter_fly(&x0, &x1, &cospi_p16_p16); + t1 = butter_fly(&x0, &x1, &cospi_p16_m16); // 4, 12 v0 = _mm256_sub_epi16(s1, s2); @@ -279,8 +279,8 @@ static void fdct16_avx2(__m256i *in) { x0 = _mm256_unpacklo_epi16(v0, v1); x1 = _mm256_unpackhi_epi16(v0, v1); - t2 = butter_fly(x0, x1, cospi_p24_p08); - t3 = butter_fly(x0, x1, cospi_m08_p24); + t2 = butter_fly(&x0, &x1, &cospi_p24_p08); + t3 = butter_fly(&x0, &x1, &cospi_m08_p24); // 2, 6, 10, 14 s0 = _mm256_sub_epi16(u3, u4); @@ -294,8 +294,8 @@ static void fdct16_avx2(__m256i *in) { x0 = _mm256_unpacklo_epi16(s2, s1); x1 = _mm256_unpackhi_epi16(s2, s1); - v2 = butter_fly(x0, x1, cospi_p16_p16); // output[5] - v1 = butter_fly(x0, x1, cospi_p16_m16); // output[6] + v2 = butter_fly(&x0, &x1, &cospi_p16_p16); // output[5] + v1 = butter_fly(&x0, &x1, &cospi_p16_m16); // output[6] s0 = _mm256_add_epi16(v0, v1); // step[4] s1 = _mm256_sub_epi16(v0, v1); // step[5] @@ -306,14 +306,14 @@ static void fdct16_avx2(__m256i *in) { x0 = _mm256_unpacklo_epi16(s0, s3); x1 = _mm256_unpackhi_epi16(s0, s3); - t4 = butter_fly(x0, x1, cospi_p28_p04); - t5 = butter_fly(x0, x1, cospi_m04_p28); + t4 = butter_fly(&x0, &x1, &cospi_p28_p04); + t5 = butter_fly(&x0, &x1, &cospi_m04_p28); // 10, 6 x0 = _mm256_unpacklo_epi16(s1, s2); x1 = _mm256_unpackhi_epi16(s1, s2); - t6 = butter_fly(x0, x1, cospi_p12_p20); - t7 = butter_fly(x0, x1, cospi_m20_p12); + t6 = butter_fly(&x0, &x1, &cospi_p12_p20); + t7 = butter_fly(&x0, &x1, &cospi_m20_p12); // 1, 3, 5, 7, 9, 11, 13, 15 s0 = _mm256_sub_epi16(in[7], in[8]); // step[8] @@ -337,14 +337,14 @@ static void fdct16_avx2(__m256i *in) { x0 = _mm256_unpacklo_epi16(u5, u2); x1 = _mm256_unpackhi_epi16(u5, u2); - s2 = butter_fly(x0, x1, cospi_p16_p16); // step[13] - s5 = butter_fly(x0, x1, cospi_p16_m16); // step[10] + s2 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[13] + s5 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[10] x0 = _mm256_unpacklo_epi16(u4, u3); x1 = _mm256_unpackhi_epi16(u4, u3); - s3 = butter_fly(x0, x1, cospi_p16_p16); // step[12] - s4 = butter_fly(x0, x1, cospi_p16_m16); // step[11] + s3 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[12] + s4 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[11] u0 = _mm256_add_epi16(s0, s4); // output[8] u1 = _mm256_add_epi16(s1, s5); @@ -364,14 +364,14 @@ static void fdct16_avx2(__m256i *in) { x0 = _mm256_unpacklo_epi16(u1, u6); x1 = _mm256_unpackhi_epi16(u1, u6); - s1 = butter_fly(x0, x1, cospi_m08_p24); - s6 = butter_fly(x0, x1, cospi_p24_p08); + s1 = butter_fly(&x0, &x1, &cospi_m08_p24); + s6 = butter_fly(&x0, &x1, &cospi_p24_p08); x0 = _mm256_unpacklo_epi16(u2, u5); x1 = _mm256_unpackhi_epi16(u2, u5); - s2 = butter_fly(x0, x1, cospi_m24_m08); - s5 = butter_fly(x0, x1, cospi_m08_p24); + s2 = butter_fly(&x0, &x1, &cospi_m24_m08); + s5 = butter_fly(&x0, &x1, &cospi_m08_p24); // stage 5 u0 = _mm256_add_epi16(s0, s1); @@ -386,23 +386,23 @@ static void fdct16_avx2(__m256i *in) { // stage 6 x0 = _mm256_unpacklo_epi16(u0, u7); x1 = _mm256_unpackhi_epi16(u0, u7); - in[1] = butter_fly(x0, x1, cospi_p30_p02); - in[15] = butter_fly(x0, x1, cospi_m02_p30); + in[1] = butter_fly(&x0, &x1, &cospi_p30_p02); + in[15] = butter_fly(&x0, &x1, &cospi_m02_p30); x0 = _mm256_unpacklo_epi16(u1, u6); x1 = _mm256_unpackhi_epi16(u1, u6); - in[9] = butter_fly(x0, x1, cospi_p14_p18); - in[7] = butter_fly(x0, x1, cospi_m18_p14); + in[9] = butter_fly(&x0, 
&x1, &cospi_p14_p18); + in[7] = butter_fly(&x0, &x1, &cospi_m18_p14); x0 = _mm256_unpacklo_epi16(u2, u5); x1 = _mm256_unpackhi_epi16(u2, u5); - in[5] = butter_fly(x0, x1, cospi_p22_p10); - in[11] = butter_fly(x0, x1, cospi_m10_p22); + in[5] = butter_fly(&x0, &x1, &cospi_p22_p10); + in[11] = butter_fly(&x0, &x1, &cospi_m10_p22); x0 = _mm256_unpacklo_epi16(u3, u4); x1 = _mm256_unpackhi_epi16(u3, u4); - in[13] = butter_fly(x0, x1, cospi_p06_p26); - in[3] = butter_fly(x0, x1, cospi_m26_p06); + in[13] = butter_fly(&x0, &x1, &cospi_p06_p26); + in[3] = butter_fly(&x0, &x1, &cospi_m26_p06); } void fadst16_avx2(__m256i *in) { @@ -953,7 +953,9 @@ void fadst16_avx2(__m256i *in) { } #if CONFIG_EXT_TX -static void fidtx16_avx2(__m256i *in) { txfm_scaling16_avx2(Sqrt2, in); } +static void fidtx16_avx2(__m256i *in) { + txfm_scaling16_avx2((int16_t)Sqrt2, in); +} #endif void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, @@ -964,28 +966,28 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, case DCT_DCT: load_buffer_16x16(input, stride, 0, 0, in); fdct16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fdct16_avx2(in); break; case ADST_DCT: load_buffer_16x16(input, stride, 0, 0, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fdct16_avx2(in); break; case DCT_ADST: load_buffer_16x16(input, stride, 0, 0, in); fdct16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; case ADST_ADST: load_buffer_16x16(input, stride, 0, 0, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; @@ -993,91 +995,91 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, case FLIPADST_DCT: load_buffer_16x16(input, stride, 1, 0, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fdct16_avx2(in); break; case DCT_FLIPADST: load_buffer_16x16(input, stride, 0, 1, in); fdct16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; case FLIPADST_FLIPADST: load_buffer_16x16(input, stride, 1, 1, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; case ADST_FLIPADST: load_buffer_16x16(input, stride, 0, 1, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; case FLIPADST_ADST: load_buffer_16x16(input, stride, 1, 0, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; case IDTX: load_buffer_16x16(input, stride, 0, 0, in); fidtx16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fidtx16_avx2(in); break; case V_DCT: load_buffer_16x16(input, stride, 0, 0, in); fdct16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fidtx16_avx2(in); break; case H_DCT: load_buffer_16x16(input, stride, 0, 0, in); fidtx16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fdct16_avx2(in); break; case V_ADST: load_buffer_16x16(input, stride, 0, 0, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); 
fidtx16_avx2(in); break; case H_ADST: load_buffer_16x16(input, stride, 0, 0, in); fidtx16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; case V_FLIPADST: load_buffer_16x16(input, stride, 1, 0, in); fadst16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fidtx16_avx2(in); break; case H_FLIPADST: load_buffer_16x16(input, stride, 0, 1, in); fidtx16_avx2(in); - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); right_shift_16x16(in); fadst16_avx2(in); break; #endif // CONFIG_EXT_TX default: assert(0); break; } - mm256_transpose_16x16(in); + mm256_transpose_16x16(in, in); write_buffer_16x16(in, output); _mm256_zeroupper(); } @@ -1110,10 +1112,10 @@ static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) { } static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) { - mm256_transpose_16x16(in0); - mm256_transpose_16x16(&in0[16]); - mm256_transpose_16x16(in1); - mm256_transpose_16x16(&in1[16]); + mm256_transpose_16x16(in0, in0); + mm256_transpose_16x16(&in0[16], &in0[16]); + mm256_transpose_16x16(in1, in1); + mm256_transpose_16x16(&in1[16], &in1[16]); mm256_vectors_swap(&in0[16], in1, 16); } @@ -1247,23 +1249,23 @@ static void fdct16_odd_avx2(__m256i *in) { u0 = _mm256_unpacklo_epi16(in[4], in[11]); u1 = _mm256_unpackhi_epi16(in[4], in[11]); - y4 = butter_fly(u0, u1, cospi_m16_p16); - y11 = butter_fly(u0, u1, cospi_p16_p16); + y4 = butter_fly(&u0, &u1, &cospi_m16_p16); + y11 = butter_fly(&u0, &u1, &cospi_p16_p16); u0 = _mm256_unpacklo_epi16(in[5], in[10]); u1 = _mm256_unpackhi_epi16(in[5], in[10]); - y5 = butter_fly(u0, u1, cospi_m16_p16); - y10 = butter_fly(u0, u1, cospi_p16_p16); + y5 = butter_fly(&u0, &u1, &cospi_m16_p16); + y10 = butter_fly(&u0, &u1, &cospi_p16_p16); u0 = _mm256_unpacklo_epi16(in[6], in[9]); u1 = _mm256_unpackhi_epi16(in[6], in[9]); - y6 = butter_fly(u0, u1, cospi_m16_p16); - y9 = butter_fly(u0, u1, cospi_p16_p16); + y6 = butter_fly(&u0, &u1, &cospi_m16_p16); + y9 = butter_fly(&u0, &u1, &cospi_p16_p16); u0 = _mm256_unpacklo_epi16(in[7], in[8]); u1 = _mm256_unpackhi_epi16(in[7], in[8]); - y7 = butter_fly(u0, u1, cospi_m16_p16); - y8 = butter_fly(u0, u1, cospi_p16_p16); + y7 = butter_fly(&u0, &u1, &cospi_m16_p16); + y8 = butter_fly(&u0, &u1, &cospi_p16_p16); y12 = in[12]; y13 = in[13]; @@ -1300,23 +1302,23 @@ static void fdct16_odd_avx2(__m256i *in) { u0 = _mm256_unpacklo_epi16(x2, x13); u1 = _mm256_unpackhi_epi16(x2, x13); - y2 = butter_fly(u0, u1, cospi_m08_p24); - y13 = butter_fly(u0, u1, cospi_p24_p08); + y2 = butter_fly(&u0, &u1, &cospi_m08_p24); + y13 = butter_fly(&u0, &u1, &cospi_p24_p08); u0 = _mm256_unpacklo_epi16(x3, x12); u1 = _mm256_unpackhi_epi16(x3, x12); - y3 = butter_fly(u0, u1, cospi_m08_p24); - y12 = butter_fly(u0, u1, cospi_p24_p08); + y3 = butter_fly(&u0, &u1, &cospi_m08_p24); + y12 = butter_fly(&u0, &u1, &cospi_p24_p08); u0 = _mm256_unpacklo_epi16(x4, x11); u1 = _mm256_unpackhi_epi16(x4, x11); - y4 = butter_fly(u0, u1, cospi_m24_m08); - y11 = butter_fly(u0, u1, cospi_m08_p24); + y4 = butter_fly(&u0, &u1, &cospi_m24_m08); + y11 = butter_fly(&u0, &u1, &cospi_m08_p24); u0 = _mm256_unpacklo_epi16(x5, x10); u1 = _mm256_unpackhi_epi16(x5, x10); - y5 = butter_fly(u0, u1, cospi_m24_m08); - y10 = butter_fly(u0, u1, cospi_m08_p24); + y5 = butter_fly(&u0, &u1, &cospi_m24_m08); + y10 = butter_fly(&u0, &u1, &cospi_m08_p24); // stage 5 x0 = _mm256_add_epi16(y0, y3); @@ -1349,23 +1351,23 @@ static void 
fdct16_odd_avx2(__m256i *in) { u0 = _mm256_unpacklo_epi16(x1, x14); u1 = _mm256_unpackhi_epi16(x1, x14); - y1 = butter_fly(u0, u1, cospi_m04_p28); - y14 = butter_fly(u0, u1, cospi_p28_p04); + y1 = butter_fly(&u0, &u1, &cospi_m04_p28); + y14 = butter_fly(&u0, &u1, &cospi_p28_p04); u0 = _mm256_unpacklo_epi16(x2, x13); u1 = _mm256_unpackhi_epi16(x2, x13); - y2 = butter_fly(u0, u1, cospi_m28_m04); - y13 = butter_fly(u0, u1, cospi_m04_p28); + y2 = butter_fly(&u0, &u1, &cospi_m28_m04); + y13 = butter_fly(&u0, &u1, &cospi_m04_p28); u0 = _mm256_unpacklo_epi16(x5, x10); u1 = _mm256_unpackhi_epi16(x5, x10); - y5 = butter_fly(u0, u1, cospi_m20_p12); - y10 = butter_fly(u0, u1, cospi_p12_p20); + y5 = butter_fly(&u0, &u1, &cospi_m20_p12); + y10 = butter_fly(&u0, &u1, &cospi_p12_p20); u0 = _mm256_unpacklo_epi16(x6, x9); u1 = _mm256_unpackhi_epi16(x6, x9); - y6 = butter_fly(u0, u1, cospi_m12_m20); - y9 = butter_fly(u0, u1, cospi_m20_p12); + y6 = butter_fly(&u0, &u1, &cospi_m12_m20); + y9 = butter_fly(&u0, &u1, &cospi_m20_p12); // stage 7 x0 = _mm256_add_epi16(y0, y1); @@ -1389,43 +1391,43 @@ static void fdct16_odd_avx2(__m256i *in) { // stage 8 u0 = _mm256_unpacklo_epi16(x0, x15); u1 = _mm256_unpackhi_epi16(x0, x15); - in[0] = butter_fly(u0, u1, cospi_p31_p01); - in[15] = butter_fly(u0, u1, cospi_m01_p31); + in[0] = butter_fly(&u0, &u1, &cospi_p31_p01); + in[15] = butter_fly(&u0, &u1, &cospi_m01_p31); u0 = _mm256_unpacklo_epi16(x1, x14); u1 = _mm256_unpackhi_epi16(x1, x14); - in[1] = butter_fly(u0, u1, cospi_p15_p17); - in[14] = butter_fly(u0, u1, cospi_m17_p15); + in[1] = butter_fly(&u0, &u1, &cospi_p15_p17); + in[14] = butter_fly(&u0, &u1, &cospi_m17_p15); u0 = _mm256_unpacklo_epi16(x2, x13); u1 = _mm256_unpackhi_epi16(x2, x13); - in[2] = butter_fly(u0, u1, cospi_p23_p09); - in[13] = butter_fly(u0, u1, cospi_m09_p23); + in[2] = butter_fly(&u0, &u1, &cospi_p23_p09); + in[13] = butter_fly(&u0, &u1, &cospi_m09_p23); u0 = _mm256_unpacklo_epi16(x3, x12); u1 = _mm256_unpackhi_epi16(x3, x12); - in[3] = butter_fly(u0, u1, cospi_p07_p25); - in[12] = butter_fly(u0, u1, cospi_m25_p07); + in[3] = butter_fly(&u0, &u1, &cospi_p07_p25); + in[12] = butter_fly(&u0, &u1, &cospi_m25_p07); u0 = _mm256_unpacklo_epi16(x4, x11); u1 = _mm256_unpackhi_epi16(x4, x11); - in[4] = butter_fly(u0, u1, cospi_p27_p05); - in[11] = butter_fly(u0, u1, cospi_m05_p27); + in[4] = butter_fly(&u0, &u1, &cospi_p27_p05); + in[11] = butter_fly(&u0, &u1, &cospi_m05_p27); u0 = _mm256_unpacklo_epi16(x5, x10); u1 = _mm256_unpackhi_epi16(x5, x10); - in[5] = butter_fly(u0, u1, cospi_p11_p21); - in[10] = butter_fly(u0, u1, cospi_m21_p11); + in[5] = butter_fly(&u0, &u1, &cospi_p11_p21); + in[10] = butter_fly(&u0, &u1, &cospi_m21_p11); u0 = _mm256_unpacklo_epi16(x6, x9); u1 = _mm256_unpackhi_epi16(x6, x9); - in[6] = butter_fly(u0, u1, cospi_p19_p13); - in[9] = butter_fly(u0, u1, cospi_m13_p19); + in[6] = butter_fly(&u0, &u1, &cospi_p19_p13); + in[9] = butter_fly(&u0, &u1, &cospi_m13_p19); u0 = _mm256_unpacklo_epi16(x7, x8); u1 = _mm256_unpackhi_epi16(x7, x8); - in[7] = butter_fly(u0, u1, cospi_p03_p29); - in[8] = butter_fly(u0, u1, cospi_m29_p03); + in[7] = butter_fly(&u0, &u1, &cospi_p03_p29); + in[8] = butter_fly(&u0, &u1, &cospi_m29_p03); } static void fdct32_avx2(__m256i *in0, __m256i *in1) { @@ -1464,7 +1466,7 @@ static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1, static void fhalfright32_16col_avx2(__m256i *in) { int i = 0; const __m256i zero = _mm256_setzero_si256(); - const __m256i sqrt2 = _mm256_set1_epi16(Sqrt2); + 
const __m256i sqrt2 = _mm256_set1_epi16((int16_t)Sqrt2); const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); __m256i x0, x1;
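Note on the SSE4.1 hunks above: each transform case now selects a row config and a column config independently (the fwd_txfm_1d_row_cfg_* / fwd_txfm_1d_col_cfg_* pairs) instead of a single fwd_txfm_2d_cfg_* table, and the raw lookup cospi_arr[bit - cos_bit_min] becomes the accessor call cospi_arr(bit). The sketch below only illustrates that row/column dispatch pattern; the TXFM_1D_CFG layout, the short config names, and every numeric value in it are placeholders, not aom's definitions.

#include <stdio.h>

/* Stand-in config: field names mirror the diff (shift, cos_bit), but the
 * layout and the values below are made up for illustration. */
typedef struct {
  const char *name;
  int shift[2];   /* shift[0]: load/pre-scale shift, shift[1]: inter-pass rounding */
  int cos_bit[4]; /* per-stage cosine bit width */
} TXFM_1D_CFG;

static const TXFM_1D_CFG row_cfg_dct_8 = { "row_dct_8", { 2, -1 }, { 13, 13, 13, 13 } };
static const TXFM_1D_CFG col_cfg_adst_8 = { "col_adst_8", { 2, -1 }, { 13, 13, 13, 13 } };

/* The column pass takes its cos_bit from col_cfg, while the load shift and the
 * inter-pass rounding come from row_cfg, matching the row_cfg->shift[0],
 * col_cfg->cos_bit[...], -row_cfg->shift[1] usage in the cases above. */
static void fwd_txfm_8x8_sketch(const TXFM_1D_CFG *row_cfg,
                                const TXFM_1D_CFG *col_cfg) {
  printf("load shift  : %d (%s)\n", row_cfg->shift[0], row_cfg->name);
  printf("col cos_bit : %d (%s)\n", col_cfg->cos_bit[2], col_cfg->name);
  printf("round shift : %d (%s)\n", -row_cfg->shift[1], row_cfg->name);
  printf("row cos_bit : %d (%s)\n", row_cfg->cos_bit[2], row_cfg->name);
}

int main(void) {
  fwd_txfm_8x8_sketch(&row_cfg_dct_8, &col_cfg_adst_8); /* ADST columns, DCT rows */
  return 0;
}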
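In hybrid_fwd_txfm_avx2.c, butter_fly now takes its two data vectors and the packed cospi pair by const pointer rather than by value; passing __m256i by value can hit argument-passing and alignment restrictions on some 32-bit MSVC targets, which is presumably the motivation. The stand-alone version below shows the butterfly the helper performs (multiply-add against a packed cosine pair, round, shift by DCT_CONST_BITS, repack to 16 bits); it is a sketch of the pattern with locally defined constants, not a copy of the aom helper, and butter_fly_sketch / pair_set_epi16_sketch are hypothetical names.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define DCT_CONST_BITS 14 /* assumed to match the value used by the library */
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* a0/a1 hold interleaved 16-bit pairs (as produced by unpacklo/unpackhi),
 * cospi holds the packed cosine pair; const pointers avoid passing 256-bit
 * vectors by value. */
static __m256i butter_fly_sketch(const __m256i *a0, const __m256i *a1,
                                 const __m256i *cospi) {
  const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
  __m256i lo = _mm256_madd_epi16(*a0, *cospi);
  __m256i hi = _mm256_madd_epi16(*a1, *cospi);
  lo = _mm256_srai_epi32(_mm256_add_epi32(lo, rounding), DCT_CONST_BITS);
  hi = _mm256_srai_epi32(_mm256_add_epi32(hi, rounding), DCT_CONST_BITS);
  return _mm256_packs_epi32(lo, hi);
}

/* Pack the 16-bit pair (a, b) into every 32-bit lane. */
static __m256i pair_set_epi16_sketch(int16_t a, int16_t b) {
  return _mm256_set1_epi32((int32_t)((uint16_t)a | ((uint32_t)(uint16_t)b << 16)));
}

int main(void) {
  const __m256i x0 = _mm256_set1_epi32(3u | (1u << 16)); /* 16-bit pairs (3, 1) */
  const __m256i x1 = x0;
  const __m256i c = pair_set_epi16_sketch(11585, 11585); /* ~cos(pi/4) in Q14 */
  int16_t out[16];
  _mm256_storeu_si256((__m256i *)out, butter_fly_sketch(&x0, &x1, &c));
  printf("%d\n", out[0]); /* (3 * 11585 + 1 * 11585 + 8192) >> 14 = 3 */
  return 0;
}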
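The same hunks show mm256_transpose_16x16 gaining an explicit destination argument (the call sites pass the same buffer twice to keep the old in-place behaviour), and mm256_transpose_32x32 building the larger transpose from four 16x16 tile transposes plus one swap of the two off-diagonal tiles. A plain-C sketch of that tile composition, using ordinary int16_t arrays instead of __m256i registers:

#include <stdint.h>
#include <stdio.h>

/* Transpose one 16x16 tile in place; stride is the row pitch of the parent. */
static void transpose_16x16_tile(int16_t *t, int stride) {
  for (int r = 0; r < 16; ++r) {
    for (int c = r + 1; c < 16; ++c) {
      const int16_t tmp = t[r * stride + c];
      t[r * stride + c] = t[c * stride + r];
      t[c * stride + r] = tmp;
    }
  }
}

/* 32x32 transpose from four 16x16 tile transposes plus a swap of the
 * off-diagonal tiles, mirroring the structure of mm256_transpose_32x32. */
static void transpose_32x32_sketch(int16_t m[32][32]) {
  transpose_16x16_tile(&m[0][0], 32);   /* top-left */
  transpose_16x16_tile(&m[0][16], 32);  /* top-right */
  transpose_16x16_tile(&m[16][0], 32);  /* bottom-left */
  transpose_16x16_tile(&m[16][16], 32); /* bottom-right */
  for (int r = 0; r < 16; ++r) {        /* swap the off-diagonal tiles */
    for (int c = 0; c < 16; ++c) {
      const int16_t tmp = m[r][16 + c];
      m[r][16 + c] = m[16 + r][c];
      m[16 + r][c] = tmp;
    }
  }
}

int main(void) {
  static int16_t m[32][32];
  for (int r = 0; r < 32; ++r)
    for (int c = 0; c < 32; ++c) m[r][c] = (int16_t)(r * 32 + c);
  transpose_32x32_sketch(m);
  printf("%d\n", m[3][20]); /* original m[20][3] = 20 * 32 + 3 = 643 */
  return 0;
}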
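The remaining change in these hunks is the explicit (int16_t) cast on Sqrt2 in fidtx16_avx2 and fhalfright32_16col_avx2. Sqrt2 is declared as a wider integer, but its value is a small fixed-point encoding of sqrt(2), so the cast only silences the implicit-narrowing warning from _mm256_set1_epi16 / txfm_scaling16_avx2 and does not change the value. The toy below uses a hypothetical Q12 constant for illustration; it is not aom's actual Sqrt2 definition or scaling path.

#include <stdint.h>
#include <stdio.h>

#define SQRT2_Q12 5793 /* round(sqrt(2) * 2^12); small enough to cast to int16_t losslessly */

static int16_t scale_by_sqrt2(int16_t v) {
  /* widen, multiply by the Q12 constant, round, shift back to the input scale */
  return (int16_t)(((int32_t)v * (int16_t)SQRT2_Q12 + (1 << 11)) >> 12);
}

int main(void) {
  printf("%d\n", scale_by_sqrt2(100)); /* ~141, i.e. 100 * sqrt(2) */
  return 0;
}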