1 files changed, 978 insertions, 384 deletions
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
index 36d09c02a..d13eb42fb 100644
--- a/third_party/aom/av1/encoder/encodeframe.c
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -84,7 +84,7 @@ static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td,
 #if CONFIG_EXT_INTER
                                int mi_row_ori, int mi_col_ori,
 #endif  // CONFIG_EXT_INTER
-                               int mi_row_pred, int mi_col_pred,
+                               int mi_row_pred, int mi_col_pred, int plane,
                                BLOCK_SIZE bsize_pred, int b_sub8x8, int block);
 static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
                             PC_TREE *pc_tree);
@@ -308,7 +308,6 @@ static void set_offsets_without_segment_id(const AV1_COMP *const cpi,
   av1_setup_src_planes(x, cpi->source, mi_row, mi_col);
 
   // R/D setup.
-  x->rddiv = cpi->rd.RDDIV;
   x->rdmult = cpi->rd.RDMULT;
 
   // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
@@ -326,6 +325,10 @@ static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
   set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
 
   mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_CFL
+  xd->cfl->mi_row = mi_row;
+  xd->cfl->mi_col = mi_col;
+#endif
 
   // Setup segment ID.
   if (seg->enabled) {
@@ -413,7 +416,6 @@ static void set_offsets_extend(const AV1_COMP *const cpi, ThreadData *td,
   xd->left_available = (mi_col_ori > tile->mi_col_start);
 
   // R/D setup.
-  x->rddiv = cpi->rd.RDDIV;
   x->rdmult = cpi->rd.RDMULT;
 }
 
@@ -539,6 +541,21 @@ static void set_ref_and_pred_mvs(MACROBLOCK *const x, int_mv *const mi_pred_mv,
       mbmi->pred_mv[1] = this_mv;
       mi_pred_mv[1] = this_mv;
     }
+#if CONFIG_COMPOUND_SINGLEREF
+  } else if (is_inter_singleref_comp_mode(mbmi->mode)) {
+    // Special case: SR_NEAR_NEWMV uses 1 + mbmi->ref_mv_idx
+    // (like NEARMV) instead
+    if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx += 1;
+
+    if (compound_ref0_mode(mbmi->mode) == NEWMV ||
+        compound_ref1_mode(mbmi->mode) == NEWMV) {
+      int_mv this_mv = curr_ref_mv_stack[ref_mv_idx].this_mv;
+      clamp_mv_ref(&this_mv.as_mv, bw, bh, xd);
+      mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
+      mbmi->pred_mv[0] = this_mv;
+      mi_pred_mv[0] = this_mv;
+    }
+#endif  // CONFIG_COMPOUND_SINGLEREF
   } else {
 #endif  // CONFIG_EXT_INTER
     if (mbmi->mode == NEWMV) {
@@ -635,7 +652,6 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td,
 #if CONFIG_PALETTE
   for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
 #endif  // CONFIG_PALETTE
-
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
   for (y = 0; y < mi_height; y++)
@@ -814,7 +830,6 @@ static void update_state_supertx(const AV1_COMP *const cpi, ThreadData *td,
     }
     mi_addr->mbmi.segment_id_supertx = MAX_SEGMENTS;
   }
-
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
   for (y = 0; y < mi_height; y++)
@@ -1147,7 +1162,7 @@ static void update_supertx_param_sb(const AV1_COMP *const cpi, ThreadData *td,
 }
 #endif  // CONFIG_SUPERTX
 
-#if CONFIG_MOTION_VAR && CONFIG_NCOBMC
+#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT)
 static void set_mode_info_b(const AV1_COMP *const cpi,
                             const TileInfo *const tile, ThreadData *td,
                             int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -1167,6 +1182,7 @@ static void set_mode_info_sb(const AV1_COMP *const cpi, ThreadData *td,
   BLOCK_SIZE subsize = get_subsize(bsize, partition);
 #if CONFIG_EXT_PARTITION_TYPES
   const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+  const int quarter_step = mi_size_wide[bsize] / 4;
 #endif
 #if CONFIG_CB4X4
   const int unify_bsize = 1;
@@ -1245,6 +1261,24 @@ static void set_mode_info_sb(const AV1_COMP *const cpi, ThreadData *td,
       set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col + hbs, bsize2,
                       &pc_tree->verticalb[2]);
       break;
+    case PARTITION_HORZ_4:
+      for (int i = 0; i < 4; ++i) {
+        int this_mi_row = mi_row + i * quarter_step;
+        if (i > 0 && this_mi_row >= cm->mi_rows) break;
+
+        set_mode_info_b(cpi, tile, td, this_mi_row, mi_col, subsize,
+                        &pc_tree->horizontal4[i]);
+      }
+      break;
+    case PARTITION_VERT_4:
+      for (int i = 0; i < 4; ++i) {
+        int this_mi_col = mi_col + i * quarter_step;
+        if (i > 0 && this_mi_col >= cm->mi_cols) break;
+
+        set_mode_info_b(cpi, tile, td, mi_row, this_mi_col, subsize,
+                        &pc_tree->vertical4[i]);
+      }
+      break;
 #endif  // CONFIG_EXT_PARTITION_TYPES
     default: assert(0 && "Invalid partition type."); break;
   }
@@ -1281,10 +1315,10 @@ static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
   return av1_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
 }
 
-#if CONFIG_DAALA_DIST && CONFIG_CB4X4
-static void daala_dist_set_sub8x8_dst(MACROBLOCK *const x, uint8_t *dst8x8,
-                                      BLOCK_SIZE bsize, int bw, int bh,
-                                      int mi_row, int mi_col) {
+#if CONFIG_DIST_8X8 && CONFIG_CB4X4
+static void dist_8x8_set_sub8x8_dst(MACROBLOCK *const x, uint8_t *dst8x8,
+                                    BLOCK_SIZE bsize, int bw, int bh,
+                                    int mi_row, int mi_col) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[0];
   const int dst_stride = pd->dst.stride;
@@ -1294,12 +1328,24 @@ static void daala_dist_set_sub8x8_dst(MACROBLOCK *const x, uint8_t *dst8x8,
 
   if (bsize < BLOCK_8X8) {
     int i, j;
-    uint8_t *dst_sub8x8 = &dst8x8[((mi_row & 1) * 8 + (mi_col & 1)) << 2];
+#if CONFIG_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      uint16_t *dst8x8_16 = (uint16_t *)dst8x8;
+      uint16_t *dst_sub8x8 = &dst8x8_16[((mi_row & 1) * 8 + (mi_col & 1)) << 2];
 
-    for (j = 0; j < bh; ++j)
-      for (i = 0; i < bw; ++i) {
-        dst_sub8x8[j * 8 + i] = dst[j * dst_stride + i];
-      }
+      for (j = 0; j < bh; ++j)
+        for (i = 0; i < bw; ++i)
+          dst_sub8x8[j * 8 + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
+    } else {
+#endif
+      uint8_t *dst_sub8x8 = &dst8x8[((mi_row & 1) * 8 + (mi_col & 1)) << 2];
+
+      for (j = 0; j < bh; ++j)
+        for (i = 0; i < bw; ++i)
+          dst_sub8x8[j * 8 + i] = dst[j * dst_stride + i];
+#if CONFIG_HIGHBITDEPTH
+    }
+#endif
   }
 }
 #endif
@@ -1330,10 +1376,6 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
   x->pvq_speed = 1;
   x->pvq_coded = 0;
 #endif
-#if CONFIG_CFL
-  // Don't store luma during RDO (we will store the best mode later).
-  x->cfl_store_y = 0;
-#endif
 
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
   mbmi = &xd->mi[0]->mbmi;
@@ -1342,6 +1384,10 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
   mbmi->mi_row = mi_row;
   mbmi->mi_col = mi_col;
 #endif
+#if CONFIG_CFL
+  // Don't store luma during RDO. Only store luma when best luma is known
+  x->cfl_store_y = 0;
+#endif
 #if CONFIG_SUPERTX
   // We set tx_size here as skip blocks would otherwise not set it.
   // tx_size needs to be set at this point as supertx_enable in
@@ -1542,6 +1588,7 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
 #endif
   if (!frame_is_intra_only(cm)) {
     FRAME_COUNTS *const counts = td->counts;
+    RD_COUNTS *rdc = &td->rd_counts;
     const int inter_block = is_inter_block(mbmi);
     const int seg_ref_active =
         segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
@@ -1560,6 +1607,12 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
 #endif  // CONFIG_EXT_REFS
 
         if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+          if (has_second_ref(mbmi))
+            // This flag is also updated for 4x4 blocks
+            rdc->compound_ref_used_flag = 1;
+          else
+            // This flag is also updated for 4x4 blocks
+            rdc->single_ref_used_flag = 1;
 #if !SUB8X8_COMP_REF
           if (mbmi->sb_type != BLOCK_4X4)
             counts->comp_inter[av1_get_reference_mode_context(cm, xd)]
@@ -1571,24 +1624,53 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
         }
 
         if (has_second_ref(mbmi)) {
+#if CONFIG_EXT_COMP_REFS
+          const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
+                                                        ? UNIDIR_COMP_REFERENCE
+                                                        : BIDIR_COMP_REFERENCE;
+#if !USE_UNI_COMP_REFS
+          // TODO(zoeliu): Temporarily turn off uni-directional comp refs
+          assert(comp_ref_type == BIDIR_COMP_REFERENCE);
+#endif  // !USE_UNI_COMP_REFS
+          counts->comp_ref_type[av1_get_comp_reference_type_context(xd)]
+                               [comp_ref_type]++;
+
+          if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+            const int bit = (ref0 == BWDREF_FRAME);
+            counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0]
+                                [bit]++;
+            if (!bit) {
+              const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME);
+              counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1]
+                                  [bit1]++;
+              if (bit1) {
+                counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)]
+                                    [2][ref1 == GOLDEN_FRAME]++;
+              }
+            }
+          } else {
+#endif  // CONFIG_EXT_COMP_REFS
 #if CONFIG_EXT_REFS
-          const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
+            const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
 
-          counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0][bit]++;
-          if (!bit) {
-            counts->comp_ref[av1_get_pred_context_comp_ref_p1(cm, xd)][1]
-                            [ref0 == LAST_FRAME]++;
-          } else {
-            counts->comp_ref[av1_get_pred_context_comp_ref_p2(cm, xd)][2]
-                            [ref0 == GOLDEN_FRAME]++;
-          }
+            counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0][bit]++;
+            if (!bit) {
+              counts->comp_ref[av1_get_pred_context_comp_ref_p1(cm, xd)][1]
+                              [ref0 == LAST_FRAME]++;
+            } else {
+              counts->comp_ref[av1_get_pred_context_comp_ref_p2(cm, xd)][2]
+                              [ref0 == GOLDEN_FRAME]++;
+            }
 
-          counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(cm, xd)][0]
-                             [ref1 == ALTREF_FRAME]++;
-#else
+            counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(cm, xd)][0]
+                               [ref1 == ALTREF_FRAME]++;
+#else   // !CONFIG_EXT_REFS
           counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0]
                           [ref0 == GOLDEN_FRAME]++;
 #endif  // CONFIG_EXT_REFS
+#if CONFIG_EXT_COMP_REFS
+          }
+#endif  // CONFIG_EXT_COMP_REFS
         } else {
 #if CONFIG_EXT_REFS
           const int bit = (ref0 == ALTREF_FRAME || ref0 == BWDREF_FRAME);
@@ -1609,7 +1691,7 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
                                 [ref0 != LAST3_FRAME]++;
             }
           }
-#else
+#else   // !CONFIG_EXT_REFS
           counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0]
                             [ref0 != LAST_FRAME]++;
           if (ref0 != LAST_FRAME) {
@@ -1619,7 +1701,14 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
 #endif  // CONFIG_EXT_REFS
         }
 
-#if CONFIG_EXT_INTER && CONFIG_INTERINTRA
+#if CONFIG_EXT_INTER
+#if CONFIG_COMPOUND_SINGLEREF
+        if (!has_second_ref(mbmi))
+          counts->comp_inter_mode[av1_get_inter_mode_context(xd)]
+                                 [is_inter_singleref_comp_mode(mbmi->mode)]++;
+#endif  // CONFIG_COMPOUND_SINGLEREF
+
+#if CONFIG_INTERINTRA
         if (cm->reference_mode != COMPOUND_REFERENCE &&
 #if CONFIG_SUPERTX
             !supertx_enabled &&
@@ -1635,14 +1724,33 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
             counts->interintra[bsize_group][0]++;
           }
         }
-#endif  // CONFIG_EXT_INTER && CONFIG_INTERINTRA
+#endif  // CONFIG_INTERINTRA
+#endif  // CONFIG_EXT_INTER
 
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_WARPED_MOTION
+        set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+#endif
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+        const MOTION_MODE motion_allowed =
+            motion_mode_allowed_wrapper(0,
+#if CONFIG_GLOBAL_MOTION
+                                        0, xd->global_motion,
+#endif  // CONFIG_GLOBAL_MOTION
+#if CONFIG_WARPED_MOTION
+                                        xd,
+#endif
+                                        mi);
+#else
         const MOTION_MODE motion_allowed = motion_mode_allowed(
-#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+#if CONFIG_GLOBAL_MOTION
             0, xd->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+#endif  // CONFIG_GLOBAL_MOTION
+#if CONFIG_WARPED_MOTION
+            xd,
+#endif
             mi);
+#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
 #if CONFIG_SUPERTX
         if (!supertx_enabled)
 #endif  // CONFIG_SUPERTX
@@ -1660,11 +1768,28 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
         if (motion_allowed > SIMPLE_TRANSLATION)
           counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++;
 #endif  // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+
+#if CONFIG_NCOBMC_ADAPT_WEIGHT
+        if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT) {
+          ADAPT_OVERLAP_BLOCK ao_block =
+              adapt_overlap_block_lookup[mbmi->sb_type];
+          ++counts->ncobmc_mode[ao_block][mbmi->ncobmc_mode[0]];
+          if (mi_size_wide[mbmi->sb_type] != mi_size_high[mbmi->sb_type]) {
+            ++counts->ncobmc_mode[ao_block][mbmi->ncobmc_mode[1]];
+          }
+        }
+#endif
+
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 
 #if CONFIG_EXT_INTER
-        if (cm->reference_mode != SINGLE_REFERENCE &&
+        if (
+#if CONFIG_COMPOUND_SINGLEREF
+            is_inter_anyref_comp_mode(mbmi->mode)
+#else   // !CONFIG_COMPOUND_SINGLEREF
+            cm->reference_mode != SINGLE_REFERENCE &&
             is_inter_compound_mode(mbmi->mode)
+#endif  // CONFIG_COMPOUND_SINGLEREF
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
             && mbmi->motion_mode == SIMPLE_TRANSLATION
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
@@ -1683,6 +1808,12 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
       if (has_second_ref(mbmi)) {
         mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
         ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+#if CONFIG_COMPOUND_SINGLEREF
+      } else if (is_inter_singleref_comp_mode(mode)) {
+        mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+        ++counts->inter_singleref_comp_mode[mode_ctx]
+                                           [INTER_SINGLEREF_COMP_OFFSET(mode)];
+#endif  // CONFIG_COMPOUND_SINGLEREF
       } else {
 #endif  // CONFIG_EXT_INTER
         mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
@@ -1693,10 +1824,15 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
 #endif  // CONFIG_EXT_INTER
 
 #if CONFIG_EXT_INTER
+#if CONFIG_COMPOUND_SINGLEREF
+      if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV ||
+          mbmi->mode == SR_NEW_NEWMV) {
+#else   // !CONFIG_COMPOUND_SINGLEREF
       if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
-#else
+#endif  // CONFIG_COMPOUND_SINGLEREF
+#else   // !CONFIG_EXT_INTER
       if (mbmi->mode == NEWMV) {
-#endif
+#endif  // CONFIG_EXT_INTER
         uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
         int idx;
 
@@ -1871,10 +2007,16 @@ static void encode_b(const AV1_COMP *const cpi, const TileInfo *const tile,
   update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
 #if CONFIG_MOTION_VAR && CONFIG_NCOBMC
   mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_WARPED_MOTION
+  set_ref_ptrs(&cpi->common, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+#endif
   const MOTION_MODE motion_allowed = motion_mode_allowed(
-#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+#if CONFIG_GLOBAL_MOTION
       0, xd->global_motion,
-#endif  // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
+#endif  // CONFIG_GLOBAL_MOTION
+#if CONFIG_WARPED_MOTION
+      xd,
+#endif
       xd->mi[0]);
   check_ncobmc = is_inter_block(mbmi) && motion_allowed >= OBMC_CAUSAL;
   if (!dry_run && check_ncobmc) {
@@ -1922,6 +2064,8 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
   const BLOCK_SIZE subsize = get_subsize(bsize, partition);
 #if CONFIG_EXT_PARTITION_TYPES
   const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+  int quarter_step = mi_size_wide[bsize] / 4;
+  int i;
 #endif
 
 #if CONFIG_CB4X4
@@ -1933,6 +2077,11 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
+#if CONFIG_SPEED_REFS
+  // First scanning pass of an SB is dry run only.
+  if (cpi->sb_scanning_pass_idx == 0) assert(dry_run == DRY_RUN_NORMAL);
+#endif  // CONFIG_SPEED_REFS
+
   if (!dry_run && ctx >= 0) td->counts->partition[ctx][partition]++;
 
 #if CONFIG_SUPERTX
@@ -2115,6 +2264,24 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
       encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
                partition, &pc_tree->verticalb[2], rate);
       break;
+    case PARTITION_HORZ_4:
+      for (i = 0; i < 4; ++i) {
+        int this_mi_row = mi_row + i * quarter_step;
+        if (i > 0 && this_mi_row >= cm->mi_rows) break;
+
+        encode_b(cpi, tile, td, tp, this_mi_row, mi_col, dry_run, subsize,
+                 partition, &pc_tree->horizontal4[i], rate);
+      }
+      break;
+    case PARTITION_VERT_4:
+      for (i = 0; i < 4; ++i) {
+        int this_mi_col = mi_col + i * quarter_step;
+        if (i > 0 && this_mi_col >= cm->mi_cols) break;
+
+        encode_b(cpi, tile, td, tp, mi_row, this_mi_col, dry_run, subsize,
+                 partition, &pc_tree->vertical4[i], rate);
+      }
+      break;
 #endif  // CONFIG_EXT_PARTITION_TYPES
     default: assert(0 && "Invalid partition type."); break;
   }
@@ -2302,8 +2469,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
 
       if (none_rdc.rate < INT_MAX) {
         none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
-        none_rdc.rdcost =
-            RDCOST(x->rdmult, x->rddiv, none_rdc.rate, none_rdc.dist);
+        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
 #if CONFIG_SUPERTX
         none_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE];
 #endif
@@ -2473,7 +2639,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
     case PARTITION_VERT_A:
     case PARTITION_VERT_B:
     case PARTITION_HORZ_A:
-    case PARTITION_HORZ_B: assert(0 && "Cannot handle extended partiton types");
+    case PARTITION_HORZ_B:
+    case PARTITION_HORZ_4:
+    case PARTITION_VERT_4: assert(0 && "Cannot handle extended partiton types");
 #endif  //  CONFIG_EXT_PARTITION_TYPES
     default: assert(0); break;
   }
@@ -2481,7 +2649,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
   if (last_part_rdc.rate < INT_MAX) {
     last_part_rdc.rate += cpi->partition_cost[pl][partition];
     last_part_rdc.rdcost =
-        RDCOST(x->rdmult, x->rddiv, last_part_rdc.rate, last_part_rdc.dist);
+        RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
 #if CONFIG_SUPERTX
     last_part_rate_nocoef += cpi->partition_cost[pl][partition];
 #endif
@@ -2565,8 +2733,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
     }
     if (chosen_rdc.rate < INT_MAX) {
       chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
-      chosen_rdc.rdcost =
-          RDCOST(x->rdmult, x->rddiv, chosen_rdc.rate, chosen_rdc.dist);
+      chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist);
 #if CONFIG_SUPERTX
       chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE];
 #endif
@@ -2624,8 +2791,8 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
 }
 
 /* clang-format off */
-static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
-#if CONFIG_CB4X4
+static const BLOCK_SIZE min_partition_size[BLOCK_SIZES_ALL] = {
+#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
   BLOCK_2X2,   BLOCK_2X2,   BLOCK_2X2,    //    2x2,    2x4,     4x2
 #endif
                             BLOCK_4X4,    //                     4x4
@@ -2634,12 +2801,14 @@ static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
   BLOCK_8X8,   BLOCK_8X8,   BLOCK_16X16,  //  16x32,  32x16,   32x32
   BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,  //  32x64,  64x32,   64x64
 #if CONFIG_EXT_PARTITION
-  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16   // 64x128, 128x64, 128x128
+  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,  // 64x128, 128x64, 128x128
 #endif  // CONFIG_EXT_PARTITION
+  BLOCK_4X4,   BLOCK_4X4,   BLOCK_8X8,      //   4x16,   16x4,    8x32
+  BLOCK_8X8                                 //   32x8
 };
 
-static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
-#if CONFIG_CB4X4
+static const BLOCK_SIZE max_partition_size[BLOCK_SIZES_ALL] = {
+#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
   BLOCK_4X4,     BLOCK_4X4,       BLOCK_4X4,    //    2x2,    2x4,     4x2
 #endif
                                   BLOCK_8X8,    //                     4x4
@@ -2648,13 +2817,15 @@ static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
   BLOCK_64X64,   BLOCK_64X64,   BLOCK_64X64,    //  16x32,  32x16,   32x32
   BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST,  //  32x64,  64x32,   64x64
 #if CONFIG_EXT_PARTITION
-  BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST   // 64x128, 128x64, 128x128
+  BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST,  // 64x128, 128x64, 128x128
 #endif  // CONFIG_EXT_PARTITION
+  BLOCK_16X16,   BLOCK_16X16,   BLOCK_32X32,    //   4x16,   16x4,    8x32
+  BLOCK_32X32                                   //   32x8
 };
 
 // Next square block size less or equal than current block size.
-static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = {
-#if CONFIG_CB4X4
+static const BLOCK_SIZE next_square_size[BLOCK_SIZES_ALL] = {
+#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
   BLOCK_2X2,   BLOCK_2X2,     BLOCK_2X2,    //    2x2,    2x4,     4x2
 #endif
                               BLOCK_4X4,    //                     4x4
@@ -2663,8 +2834,10 @@ static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = {
   BLOCK_16X16, BLOCK_16X16,   BLOCK_32X32,  //  16x32,  32x16,   32x32
   BLOCK_32X32, BLOCK_32X32,   BLOCK_64X64,  //  32x64,  64x32,   64x64
 #if CONFIG_EXT_PARTITION
-  BLOCK_64X64, BLOCK_64X64, BLOCK_128X128   // 64x128, 128x64, 128x128
+  BLOCK_64X64, BLOCK_64X64, BLOCK_128X128,  // 64x128, 128x64, 128x128
 #endif  // CONFIG_EXT_PARTITION
+  BLOCK_4X4,   BLOCK_4X4,   BLOCK_8X8,      //   4x16,   16x4,    8x32
+  BLOCK_8X8                                 //   32x8
 };
 /* clang-format on */
 
@@ -3055,8 +3228,7 @@ static void rd_test_partition3(
             cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
                                 [supertx_size],
             0);
-        sum_rdc.rdcost =
-            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
 
         if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
           TX_TYPE best_tx = DCT_DCT;
@@ -3071,8 +3243,7 @@ static void rd_test_partition3(
               cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
                                   [supertx_size],
               1);
-          tmp_rdc.rdcost =
-              RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+          tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
           if (tmp_rdc.rdcost < sum_rdc.rdcost) {
             sum_rdc = tmp_rdc;
             update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
@@ -3091,8 +3262,7 @@ static void rd_test_partition3(
 #endif
                                          bsize);
         sum_rdc.rate += cpi->partition_cost[pl][partition];
-        sum_rdc.rdcost =
-            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
 #if CONFIG_SUPERTX
         sum_rate_nocoef += cpi->partition_cost[pl][partition];
 #endif
@@ -3161,7 +3331,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #if CONFIG_SUPERTX
   int this_rate_nocoef, sum_rate_nocoef = 0, best_rate_nocoef = INT_MAX;
   int abort_flag;
-  const int supertx_allowed = !frame_is_intra_only(cm) &&
+  const int supertx_allowed = !frame_is_intra_only(cm) && bsize >= BLOCK_8X8 &&
                               bsize <= MAX_SUPERTX_BLOCK_SIZE &&
                               !xd->lossless[0];
 #endif  // CONFIG_SUPERTX
@@ -3341,6 +3511,17 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
   }
 #endif
 
+#if CONFIG_SPEED_REFS
+  if (cpi->sb_scanning_pass_idx == 0) {
+    // NOTE: For the 1st pass of scanning, check all the subblocks of equal size
+    //       only.
+    partition_none_allowed = (bsize == MIN_SPEED_REFS_BLKSIZE);
+    partition_horz_allowed = 0;
+    partition_vert_allowed = 0;
+    do_square_split = (bsize > MIN_SPEED_REFS_BLKSIZE);
+  }
+#endif  // CONFIG_SPEED_REFS
+
   // PARTITION_NONE
   if (partition_none_allowed) {
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
@@ -3354,8 +3535,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
     if (this_rdc.rate != INT_MAX) {
       if (bsize_at_least_8x8) {
         this_rdc.rate += partition_cost[PARTITION_NONE];
-        this_rdc.rdcost =
-            RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+        this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
 #if CONFIG_SUPERTX
         this_rate_nocoef += partition_cost[PARTITION_NONE];
 #endif
@@ -3494,8 +3674,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
             cm->fc->supertx_prob[partition_supertx_context_lookup
                                      [PARTITION_SPLIT]][supertx_size],
             0);
-        sum_rdc.rdcost =
-            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
 
         if (is_inter_mode(pc_tree->leaf_split[0]->mic.mbmi.mode)) {
           TX_TYPE best_tx = DCT_DCT;
@@ -3512,8 +3691,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
               cm->fc->supertx_prob[partition_supertx_context_lookup
                                        [PARTITION_SPLIT]][supertx_size],
               1);
-          tmp_rdc.rdcost =
-              RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+          tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
           if (tmp_rdc.rdcost < sum_rdc.rdcost) {
             sum_rdc = tmp_rdc;
             update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
@@ -3551,6 +3729,11 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
             &this_rdc, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[idx]);
 #endif  // CONFIG_SUPERTX
 
+#if CONFIG_DIST_8X8 && CONFIG_CB4X4
+        if (bsize == BLOCK_8X8 && this_rdc.rate != INT_MAX) {
+          assert(this_rdc.dist_y < INT64_MAX);
+        }
+#endif
         if (this_rdc.rate == INT_MAX) {
           sum_rdc.rdcost = INT64_MAX;
 #if CONFIG_SUPERTX
@@ -3564,28 +3747,40 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #if CONFIG_SUPERTX
           sum_rate_nocoef += this_rate_nocoef;
 #endif  // CONFIG_SUPERTX
-#if CONFIG_DAALA_DIST && CONFIG_CB4X4
-          sum_rdc.dist_y += this_rdc.dist_y;
+#if CONFIG_DIST_8X8 && CONFIG_CB4X4
+          if (bsize == BLOCK_8X8) {
+            assert(this_rdc.dist_y < INT64_MAX);
+            sum_rdc.dist_y += this_rdc.dist_y;
+          }
 #endif
         }
       }
       reached_last_index = (idx == 4);
 
-#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+#if CONFIG_DIST_8X8 && CONFIG_CB4X4
       if (reached_last_index && sum_rdc.rdcost != INT64_MAX &&
           bsize == BLOCK_8X8) {
-        int use_activity_masking = 0;
-        int64_t daala_dist;
+        int64_t dist_8x8;
         const int src_stride = x->plane[0].src.stride;
-        daala_dist = av1_daala_dist(x->plane[0].src.buf - 4 * src_stride - 4,
-                                    src_stride, x->decoded_8x8, 8, 8, 8, 1,
-                                    use_activity_masking, x->qindex)
-                     << 4;
-        sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + daala_dist;
-        sum_rdc.rdcost =
-            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+        uint8_t *decoded_8x8;
+
+#if CONFIG_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+          decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8);
+        else
+#endif
+          decoded_8x8 = (uint8_t *)x->decoded_8x8;
+
+        dist_8x8 =
+            av1_dist_8x8(cpi, xd, x->plane[0].src.buf - 4 * src_stride - 4,
+                         src_stride, decoded_8x8, 8, BLOCK_8X8, 8, 8, 8, 8,
+                         x->qindex)
+            << 4;
+        assert(sum_rdc.dist_y < INT64_MAX);
+        sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + dist_8x8;
+        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       }
-#endif  // CONFIG_DAALA_DIST && CONFIG_CB4X4
+#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
 
 #if CONFIG_SUPERTX
       if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && reached_last_index) {
@@ -3598,8 +3793,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
             cm->fc->supertx_prob[partition_supertx_context_lookup
                                      [PARTITION_SPLIT]][supertx_size],
             0);
-        sum_rdc.rdcost =
-            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
 
         if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
           TX_TYPE best_tx = DCT_DCT;
@@ -3616,8 +3810,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
               cm->fc->supertx_prob[partition_supertx_context_lookup
                                        [PARTITION_SPLIT]][supertx_size],
               1);
-          tmp_rdc.rdcost =
-              RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+          tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
           if (tmp_rdc.rdcost < sum_rdc.rdcost) {
             sum_rdc = tmp_rdc;
             update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
@@ -3632,7 +3825,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 
     if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_SPLIT];
-      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
 #if CONFIG_SUPERTX
       sum_rate_nocoef += partition_cost[PARTITION_SPLIT];
 #endif  // CONFIG_SUPERTX
@@ -3725,14 +3918,14 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
                        best_rdc.rdcost - sum_rdc.rdcost);
 #endif  // CONFIG_SUPERTX
 
-#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+#if CONFIG_DIST_8X8 && CONFIG_CB4X4
       if (this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
         update_state(cpi, td, &pc_tree->horizontal[1], mi_row + mi_step, mi_col,
                      subsize, DRY_RUN_NORMAL);
         encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row + mi_step, mi_col,
                           subsize, NULL);
       }
-#endif  // CONFIG_DAALA_DIST && CONFIG_CB4X4
+#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
 
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
@@ -3746,24 +3939,31 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #if CONFIG_SUPERTX
         sum_rate_nocoef += this_rate_nocoef;
 #endif  // CONFIG_SUPERTX
-#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+#if CONFIG_DIST_8X8 && CONFIG_CB4X4
         sum_rdc.dist_y += this_rdc.dist_y;
 #endif
       }
-#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+#if CONFIG_DIST_8X8 && CONFIG_CB4X4
       if (sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
-        int use_activity_masking = 0;
-        int64_t daala_dist;
+        int64_t dist_8x8;
         const int src_stride = x->plane[0].src.stride;
-        daala_dist = av1_daala_dist(x->plane[0].src.buf - 4 * src_stride,
-                                    src_stride, x->decoded_8x8, 8, 8, 8, 1,
-                                    use_activity_masking, x->qindex)
-                     << 4;
-        sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + daala_dist;
-        sum_rdc.rdcost =
-            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+        uint8_t *decoded_8x8;
+
+#if CONFIG_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+          decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8);
+        else
+#endif
+          decoded_8x8 = (uint8_t *)x->decoded_8x8;
+
+        dist_8x8 = av1_dist_8x8(cpi, xd, x->plane[0].src.buf - 4 * src_stride,
+                                src_stride, decoded_8x8, 8, BLOCK_8X8, 8, 8, 8,
+                                8, x->qindex)
+                   << 4;
+        sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + dist_8x8;
+        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       }
-#endif  // CONFIG_DAALA_DIST && CONFIG_CB4X4
+#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
     }
 
 #if CONFIG_SUPERTX
@@ -3777,7 +3977,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
           cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]]
                               [supertx_size],
           0);
-      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
 
       if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
         TX_TYPE best_tx = DCT_DCT;
@@ -3795,8 +3995,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
                 ->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]]
                               [supertx_size],
             1);
-        tmp_rdc.rdcost =
-            RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+        tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
         if (tmp_rdc.rdcost < sum_rdc.rdcost) {
           sum_rdc = tmp_rdc;
           update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
@@ -3810,7 +4009,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_HORZ];
-      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
 #if CONFIG_SUPERTX
       sum_rate_nocoef += partition_cost[PARTITION_HORZ];
 #endif  // CONFIG_SUPERTX
@@ -3899,14 +4098,14 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
                        best_rdc.rdcost - sum_rdc.rdcost);
 #endif  // CONFIG_SUPERTX
 
-#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+#if CONFIG_DIST_8X8 && CONFIG_CB4X4
       if (this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
         update_state(cpi, td, &pc_tree->vertical[1], mi_row, mi_col + mi_step,
                      subsize, DRY_RUN_NORMAL);
         encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col + mi_step,
                           subsize, NULL);
       }
-#endif  // CONFIG_DAALA_DIST && CONFIG_CB4X4
+#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
 
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
@@ -3920,24 +4119,31 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #if CONFIG_SUPERTX
         sum_rate_nocoef += this_rate_nocoef;
 #endif  // CONFIG_SUPERTX
-#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+#if CONFIG_DIST_8X8 && CONFIG_CB4X4
         sum_rdc.dist_y += this_rdc.dist_y;
 #endif
       }
-#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+#if CONFIG_DIST_8X8 && CONFIG_CB4X4
       if (sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
-        int use_activity_masking = 0;
-        int64_t daala_dist;
+        int64_t dist_8x8;
         const int src_stride = x->plane[0].src.stride;
-        daala_dist =
-            av1_daala_dist(x->plane[0].src.buf - 4, src_stride, x->decoded_8x8,
-                           8, 8, 8, 1, use_activity_masking, x->qindex)
+        uint8_t *decoded_8x8;
+
+#if CONFIG_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+          decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8);
+        else
+#endif
+          decoded_8x8 = (uint8_t *)x->decoded_8x8;
+
+        dist_8x8 =
+            av1_dist_8x8(cpi, xd, x->plane[0].src.buf - 4, src_stride,
+                         decoded_8x8, 8, BLOCK_8X8, 8, 8, 8, 8, x->qindex)
             << 4;
-        sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + daala_dist;
-        sum_rdc.rdcost =
-            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+        sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + dist_8x8;
+        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
       }
-#endif  // CONFIG_DAALA_DIST && CONFIG_CB4X4
+#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
     }
 #if CONFIG_SUPERTX
     if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) {
@@ -3950,7 +4156,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
           cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]]
                               [supertx_size],
           0);
-      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
 
       if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
         TX_TYPE best_tx = DCT_DCT;
@@ -3968,8 +4174,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
                 ->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]]
                               [supertx_size],
             1);
-        tmp_rdc.rdcost =
-            RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+        tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
         if (tmp_rdc.rdcost < sum_rdc.rdcost) {
           sum_rdc = tmp_rdc;
           update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
@@ -3983,7 +4188,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_VERT];
-      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
 #if CONFIG_SUPERTX
       sum_rate_nocoef += partition_cost[PARTITION_VERT];
 #endif  // CONFIG_SUPERTX
@@ -4060,14 +4265,139 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
                        bsize2, mi_row + mi_step, mi_col + mi_step, bsize2);
     restore_context(x, &x_ctx, mi_row, mi_col, bsize);
   }
+
+  // PARTITION_HORZ_4
+  // TODO(david.barker): For this and PARTITION_VERT_4,
+  // * Add support for BLOCK_16X16 once we support 2x8 and 8x2 blocks for the
+  //   chroma plane
+  // * Add support for supertx
+  if (bsize == BLOCK_32X32 && partition_horz_allowed && !force_horz_split &&
+      (do_rectangular_split || av1_active_h_edge(cpi, mi_row, mi_step))) {
+    int i;
+    const int quarter_step = mi_size_high[bsize] / 4;
+    PICK_MODE_CONTEXT *ctx_prev = ctx_none;
+
+    subsize = get_subsize(bsize, PARTITION_HORZ_4);
+    av1_zero(sum_rdc);
+
+    for (i = 0; i < 4; ++i) {
+      int this_mi_row = mi_row + i * quarter_step;
+
+      if (i > 0 && this_mi_row >= cm->mi_rows) break;
+
+      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_prev);
+
+      ctx_prev = &pc_tree->horizontal4[i];
+
+      rd_pick_sb_modes(cpi, tile_data, x, this_mi_row, mi_col, &this_rdc,
+                       PARTITION_HORZ_4, subsize, ctx_prev,
+                       best_rdc.rdcost - sum_rdc.rdcost);
+
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+        break;
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
+      }
+
+      if (sum_rdc.rdcost >= best_rdc.rdcost) break;
+
+      if (i < 3) {
+        update_state(cpi, td, ctx_prev, this_mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, this_mi_row, mi_col,
+                          subsize, NULL);
+      }
+    }
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      sum_rdc.rate += partition_cost[PARTITION_HORZ_4];
+      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
+        pc_tree->partitioning = PARTITION_HORZ_4;
+      }
+    }
+#if !CONFIG_PVQ
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+  }
+  // PARTITION_VERT_4
+  if (bsize == BLOCK_32X32 && partition_vert_allowed && !force_vert_split &&
+      (do_rectangular_split || av1_active_v_edge(cpi, mi_row, mi_step))) {
+    int i;
+    const int quarter_step = mi_size_wide[bsize] / 4;
+    PICK_MODE_CONTEXT *ctx_prev = ctx_none;
+
+    subsize = get_subsize(bsize, PARTITION_VERT_4);
+    av1_zero(sum_rdc);
+
+    for (i = 0; i < 4; ++i) {
+      int this_mi_col = mi_col + i * quarter_step;
+
+      if (i > 0 && this_mi_col >= cm->mi_cols) break;
+
+      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_prev);
+
+      ctx_prev = &pc_tree->vertical4[i];
+
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, this_mi_col, &this_rdc,
+                       PARTITION_VERT_4, subsize, ctx_prev,
+                       best_rdc.rdcost - sum_rdc.rdcost);
+
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
+      }
+
+      if (sum_rdc.rdcost >= best_rdc.rdcost) break;
+
+      if (i < 3) {
+        update_state(cpi, td, ctx_prev, mi_row, this_mi_col, subsize, 1);
+        encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, this_mi_col,
+                          subsize, NULL);
+      }
+    }
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      sum_rdc.rate += partition_cost[PARTITION_VERT_4];
+      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
+        pc_tree->partitioning = PARTITION_VERT_4;
+      }
+    }
+#if !CONFIG_PVQ
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+#else
+    restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
+#endif
+  }
 #endif  // CONFIG_EXT_PARTITION_TYPES
 
+#if CONFIG_SPEED_REFS
+  // First scanning is done.
+  if (cpi->sb_scanning_pass_idx == 0 && bsize == cm->sb_size) return;
+#endif  // CONFIG_SPEED_REFS
+
   // TODO(jbb): This code added so that we avoid static analysis
   // warning related to the fact that best_rd isn't used after this
   // point.  This code should be refactored so that the duplicate
   // checks occur in some sub function and thus are used...
   (void)best_rd;
   *rd_cost = best_rdc;
+
+#if CONFIG_DIST_8X8 && CONFIG_CB4X4
+  if (bsize <= BLOCK_8X8 && rd_cost->rate != INT_MAX) {
+    assert(rd_cost->dist_y < INT64_MAX);
+  }
+#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
 #if CONFIG_SUPERTX
   *rate_nocoef = best_rate_nocoef;
 #endif  // CONFIG_SUPERTX
@@ -4093,13 +4423,13 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
   x->cfl_store_y = 0;
 #endif
 
-#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+#if CONFIG_DIST_8X8 && CONFIG_CB4X4
   if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
       bsize == BLOCK_4X4 && pc_tree->index == 3) {
     encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
               pc_tree, NULL);
   }
-#endif  // CONFIG_DAALA_DIST && CONFIG_CB4X4
+#endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
 
   if (bsize == cm->sb_size) {
 #if !CONFIG_PVQ && !CONFIG_LV_MAP
@@ -4112,6 +4442,22 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
   }
 }
 
+#if CONFIG_SPEED_REFS
+static void restore_mi(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                       int mi_row, int mi_col) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+  int x_idx, y;
+  for (y = 0; y < mi_size_high[cm->sb_size]; y++)
+    for (x_idx = 0; x_idx < mi_size_wide[cm->sb_size]; x_idx++)
+      if (mi_col + x_idx < cm->mi_cols && mi_row + y < cm->mi_rows) {
+        memset(xd->mi + y * cm->mi_stride + x_idx, 0, sizeof(*xd->mi));
+        memset(x->mbmi_ext + y * cm->mi_cols + x_idx, 0, sizeof(*x->mbmi_ext));
+      }
+}
+#endif  // CONFIG_SPEED_REFS
+
 static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
                              TileDataEnc *tile_data, int mi_row,
                              TOKENEXTRA **tp) {
@@ -4157,8 +4503,6 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
     MODE_INFO **mi = cm->mi_grid_visible + idx_str;
     PC_TREE *const pc_root = td->pc_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
 
-    av1_update_boundary_info(cm, tile_info, mi_row, mi_col);
-
     if (sf->adaptive_pred_interp_filter) {
       for (i = 0; i < leaf_nodes; ++i)
         td->leaf_tree[i].pred_interp_filter = SWITCHABLE;
@@ -4258,12 +4602,35 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
         rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
                                 &x->min_partition_size, &x->max_partition_size);
       }
+#if CONFIG_SPEED_REFS
+      // NOTE: Two scanning passes for the current superblock - the first pass
+      //       is only targeted to collect stats.
+      int m_search_count_backup = *(x->m_search_count_ptr);
+      for (int sb_pass_idx = 0; sb_pass_idx < 2; ++sb_pass_idx) {
+        cpi->sb_scanning_pass_idx = sb_pass_idx;
+        if (frame_is_intra_only(cm) && sb_pass_idx == 0) continue;
+
+        rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size,
+                          &dummy_rdc,
+#if CONFIG_SUPERTX
+                          &dummy_rate_nocoef,
+#endif  // CONFIG_SUPERTX
+                          INT64_MAX, pc_root);
+        if (sb_pass_idx == 0) {
+          av1_zero(x->pred_mv);
+          pc_root->index = 0;
+          restore_mi(cpi, x, mi_row, mi_col);
+          *(x->m_search_count_ptr) = m_search_count_backup;
+        }
+      }
+#else  // !CONFIG_SPEED_REFS
       rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size,
                         &dummy_rdc,
 #if CONFIG_SUPERTX
                         &dummy_rate_nocoef,
 #endif  // CONFIG_SUPERTX
                         INT64_MAX, pc_root);
+#endif  // CONFIG_SPEED_REFS
     }
   }
 }
@@ -4329,20 +4696,11 @@ static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) {
     return LAST_FRAME;
 }
 
-static TX_MODE select_tx_mode(const AV1_COMP *cpi, MACROBLOCKD *const xd) {
-  int i, all_lossless = 1;
-
-  if (cpi->common.seg.enabled) {
-    for (i = 0; i < MAX_SEGMENTS; ++i) {
-      if (!xd->lossless[i]) {
-        all_lossless = 0;
-        break;
-      }
-    }
-  } else {
-    all_lossless = xd->lossless[0];
-  }
-  if (all_lossless) return ONLY_4X4;
+static TX_MODE select_tx_mode(const AV1_COMP *cpi) {
+  if (cpi->common.all_lossless) return ONLY_4X4;
+#if CONFIG_VAR_TX_NO_TX_MODE
+  return TX_MODE_SELECT;
+#else
   if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
     return ALLOW_32X32 + CONFIG_TX64X64;
   else if (cpi->sf.tx_size_search_method == USE_FULL_RD ||
@@ -4350,6 +4708,7 @@ static TX_MODE select_tx_mode(const AV1_COMP *cpi, MACROBLOCKD *const xd) {
     return TX_MODE_SELECT;
   else
     return cpi->common.tx_mode;
+#endif  // CONFIG_VAR_TX_NO_TX_MODE
 }
 
 void av1_init_tile_data(AV1_COMP *cpi) {
@@ -4372,7 +4731,7 @@ void av1_init_tile_data(AV1_COMP *cpi) {
         TileDataEnc *const tile_data =
             &cpi->tile_data[tile_row * tile_cols + tile_col];
         int i, j;
-        for (i = 0; i < BLOCK_SIZES; ++i) {
+        for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
           for (j = 0; j < MAX_MODES; ++j) {
             tile_data->thresh_freq_fact[i][j] = 32;
             tile_data->mode_map[i][j] = j;
@@ -4415,12 +4774,8 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
   int mi_row;
 
 #if CONFIG_DEPENDENT_HORZTILES
-#if CONFIG_TILE_GROUPS
   if ((!cm->dependent_horz_tiles) || (tile_row == 0) ||
       tile_info->tg_horz_boundary) {
-#else
-  if ((!cm->dependent_horz_tiles) || (tile_row == 0)) {
-#endif
     av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end);
   }
 #else
@@ -4504,22 +4859,21 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
 #endif
 #endif  // #if CONFIG_PVQ
 
-#if CONFIG_EC_ADAPT
   this_tile->tctx = *cm->fc;
   td->mb.e_mbd.tile_ctx = &this_tile->tctx;
-#endif  // #if CONFIG_EC_ADAPT
 
 #if CONFIG_CFL
   MACROBLOCKD *const xd = &td->mb.e_mbd;
   xd->cfl = &this_tile->cfl;
-  cfl_init(xd->cfl, cm, xd->plane[AOM_PLANE_U].subsampling_x,
-           xd->plane[AOM_PLANE_U].subsampling_y);
+  cfl_init(xd->cfl, cm);
 #endif
 
 #if CONFIG_PVQ
   td->mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context;
 #endif  // CONFIG_PVQ
 
+  av1_setup_across_tile_boundary_info(cm, tile_info);
+
   for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
        mi_row += cm->mib_size) {
     encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
@@ -4656,6 +5010,36 @@ static int do_gm_search_logic(SPEED_FEATURES *const sf, int num_refs_using_gm,
 }
 #endif  // CONFIG_GLOBAL_MOTION
 
+#if CONFIG_PALETTE
+// Estimate if the source frame is screen content, based on the portion of
+// blocks that have no more than 4 (experimentally selected) luma colors.
+static int is_screen_content(const uint8_t *src,
+#if CONFIG_HIGHBITDEPTH
+                             int use_hbd, int bd,
+#endif  // CONFIG_HIGHBITDEPTH
+                             int stride, int width, int height) {
+  assert(src != NULL);
+  int counts = 0;
+  const int blk_w = 16;
+  const int blk_h = 16;
+  const int limit = 4;
+  for (int r = 0; r + blk_h <= height; r += blk_h) {
+    for (int c = 0; c + blk_w <= width; c += blk_w) {
+      const int n_colors =
+#if CONFIG_HIGHBITDEPTH
+          use_hbd ? av1_count_colors_highbd(src + r * stride + c, stride, blk_w,
+                                            blk_h, bd)
+                  :
+#endif  // CONFIG_HIGHBITDEPTH
+                  av1_count_colors(src + r * stride + c, stride, blk_w, blk_h);
+      if (n_colors > 1 && n_colors <= limit) counts++;
+    }
+  }
+  // The threshold is 10%.
+  return counts * blk_h * blk_w * 10 > width * height;
+}
+#endif  // CONFIG_PALETTE
+
 static void encode_frame_internal(AV1_COMP *cpi) {
   ThreadData *const td = &cpi->td;
   MACROBLOCK *const x = &td->mb;
@@ -4682,6 +5066,23 @@ static void encode_frame_internal(AV1_COMP *cpi) {
   av1_zero(rdc->coef_counts);
   av1_zero(rdc->comp_pred_diff);
 
+#if CONFIG_PALETTE || CONFIG_INTRABC
+  if (frame_is_intra_only(cm)) {
+#if CONFIG_PALETTE
+    cm->allow_screen_content_tools =
+        cpi->oxcf.content == AOM_CONTENT_SCREEN ||
+        is_screen_content(cpi->source->y_buffer,
+#if CONFIG_HIGHBITDEPTH
+                          cpi->source->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+#endif  // CONFIG_HIGHBITDEPTH
+                          cpi->source->y_stride, cpi->source->y_width,
+                          cpi->source->y_height);
+#else
+    cm->allow_screen_content_tools = cpi->oxcf.content == AOM_CONTENT_SCREEN;
+#endif  // CONFIG_PALETTE
+  }
+#endif  // CONFIG_PALETTE || CONFIG_INTRABC
+
 #if CONFIG_GLOBAL_MOTION
   av1_zero(rdc->global_motion_used);
   av1_zero(cpi->gmparams_cost);
@@ -4709,6 +5110,8 @@ static void encode_frame_internal(AV1_COMP *cpi) {
         memcpy(&cm->global_motion[frame], &cm->global_motion[pframe],
                sizeof(WarpedMotionParams));
       } else if (ref_buf[frame] &&
+                 ref_buf[frame]->y_crop_width == cpi->source->y_crop_width &&
+                 ref_buf[frame]->y_crop_height == cpi->source->y_crop_height &&
                  do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame)) {
         TransformationType model;
         const int64_t ref_frame_error = av1_frame_error(
@@ -4716,8 +5119,8 @@ static void encode_frame_internal(AV1_COMP *cpi) {
             xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
 #endif  // CONFIG_HIGHBITDEPTH
             ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride,
-            cpi->source->y_buffer, 0, 0, cpi->source->y_width,
-            cpi->source->y_height, cpi->source->y_stride);
+            cpi->source->y_buffer, cpi->source->y_width, cpi->source->y_height,
+            cpi->source->y_stride);
 
         if (ref_frame_error == 0) continue;
 
@@ -4752,7 +5155,8 @@ static void encode_frame_internal(AV1_COMP *cpi) {
                   ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
                   ref_buf[frame]->y_height, ref_buf[frame]->y_stride,
                   cpi->source->y_buffer, cpi->source->y_width,
-                  cpi->source->y_height, cpi->source->y_stride, 3);
+                  cpi->source->y_height, cpi->source->y_stride, 5,
+                  best_warp_error);
               if (warp_error < best_warp_error) {
                 best_warp_error = warp_error;
                 // Save the wm_params modified by refine_integerized_param()
@@ -4812,10 +5216,10 @@ static void encode_frame_internal(AV1_COMP *cpi) {
                       cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
     xd->qindex[i] = qindex;
   }
-
+  cm->all_lossless = all_lossless(cm, xd);
   if (!cm->seg.enabled && xd->lossless[0]) x->optimize = 0;
 
-  cm->tx_mode = select_tx_mode(cpi, xd);
+  cm->tx_mode = select_tx_mode(cpi);
 
 #if CONFIG_DELTA_Q
   // Fix delta q resolution for the moment
@@ -4859,18 +5263,32 @@ static void encode_frame_internal(AV1_COMP *cpi) {
 
 #if CONFIG_TEMPMV_SIGNALING
   if (cm->prev_frame) {
-    cm->use_prev_frame_mvs &= !cm->error_resilient_mode &&
-                              cm->width == cm->prev_frame->buf.y_width &&
-                              cm->height == cm->prev_frame->buf.y_height &&
-                              !cm->intra_only && !cm->prev_frame->intra_only;
+    cm->use_prev_frame_mvs &=
+        !cm->error_resilient_mode &&
+#if CONFIG_FRAME_SUPERRES
+        cm->width == cm->last_width && cm->height == cm->last_height &&
+#else
+        cm->width == cm->prev_frame->buf.y_crop_width &&
+        cm->height == cm->prev_frame->buf.y_crop_height &&
+#endif  // CONFIG_FRAME_SUPERRES
+        !cm->intra_only && !cm->prev_frame->intra_only && cm->last_show_frame;
   } else {
     cm->use_prev_frame_mvs = 0;
   }
 #else
-  cm->use_prev_frame_mvs = !cm->error_resilient_mode && cm->prev_frame &&
-                           cm->width == cm->prev_frame->buf.y_crop_width &&
-                           cm->height == cm->prev_frame->buf.y_crop_height &&
-                           !cm->intra_only && cm->last_show_frame;
+  if (cm->prev_frame) {
+    cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
+#if CONFIG_FRAME_SUPERRES
+                             cm->width == cm->last_width &&
+                             cm->height == cm->last_height &&
+#else
+                             cm->width == cm->prev_frame->buf.y_crop_width &&
+                             cm->height == cm->prev_frame->buf.y_crop_height &&
+#endif  // CONFIG_FRAME_SUPERRES
+                             !cm->intra_only && cm->last_show_frame;
+  } else {
+    cm->use_prev_frame_mvs = 0;
+  }
 #endif  // CONFIG_TEMPMV_SIGNALING
 
   // Special case: set prev_mi to NULL when the previous mode info
@@ -4894,6 +5312,8 @@ static void encode_frame_internal(AV1_COMP *cpi) {
     }
 #endif
 
+    av1_setup_frame_boundary_info(cm);
+
     // If allowed, encoding tiles in parallel with one thread handling one tile.
     // TODO(geza.lore): The multi-threaded encoder is not safe with more than
     // 1 tile rows, as it uses the single above_context et al arrays from
@@ -4921,7 +5341,11 @@ static void make_consistent_compound_tools(AV1_COMMON *cm) {
     cm->allow_interintra_compound = 0;
 #endif  // CONFIG_INTERINTRA
 #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
+#if CONFIG_COMPOUND_SINGLEREF
+  if (frame_is_intra_only(cm))
+#else   // !CONFIG_COMPOUND_SINGLEREF
   if (frame_is_intra_only(cm) || cm->reference_mode == SINGLE_REFERENCE)
+#endif  // CONFIG_COMPOUND_SINGLEREF
     cm->allow_masked_compound = 0;
 #endif  // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
 }
@@ -4942,14 +5366,14 @@ void av1_encode_frame(AV1_COMP *cpi) {
   // side behavior is where the ALT ref buffer has opposite sign bias to
   // the other two.
   if (!frame_is_intra_only(cm)) {
-#if !CONFIG_ONE_SIDED_COMPOUND
+#if !(CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS)
     if ((cm->ref_frame_sign_bias[ALTREF_FRAME] ==
          cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
         (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
          cm->ref_frame_sign_bias[LAST_FRAME])) {
       cpi->allow_comp_inter_inter = 0;
     } else {
-#endif
+#endif  // !(CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS)
       cpi->allow_comp_inter_inter = 1;
 #if CONFIG_EXT_REFS
       cm->comp_fwd_ref[0] = LAST_FRAME;
@@ -4962,10 +5386,11 @@ void av1_encode_frame(AV1_COMP *cpi) {
     cm->comp_fixed_ref = ALTREF_FRAME;
     cm->comp_var_ref[0] = LAST_FRAME;
     cm->comp_var_ref[1] = GOLDEN_FRAME;
-#endif                          // CONFIG_EXT_REFS
-#if !CONFIG_ONE_SIDED_COMPOUND  // Normative in encoder
+#endif  // CONFIG_EXT_REFS
+#if !(CONFIG_ONE_SIDED_COMPOUND || \
+      CONFIG_EXT_COMP_REFS)  // Normative in encoder
     }
-#endif
+#endif  // !(CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS)
   } else {
     cpi->allow_comp_inter_inter = 0;
   }
@@ -4998,7 +5423,12 @@ void av1_encode_frame(AV1_COMP *cpi) {
     else
       cm->reference_mode = REFERENCE_MODE_SELECT;
 #else
+#if CONFIG_BGSPRITE
+    (void)is_alt_ref;
+    if (!cpi->allow_comp_inter_inter)
+#else
     if (is_alt_ref || !cpi->allow_comp_inter_inter)
+#endif  // CONFIG_BGSPRITE
       cm->reference_mode = SINGLE_REFERENCE;
     else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
              mode_thrs[COMPOUND_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT] &&
@@ -5017,25 +5447,23 @@ void av1_encode_frame(AV1_COMP *cpi) {
 #if CONFIG_EXT_INTER
     make_consistent_compound_tools(cm);
 #endif  // CONFIG_EXT_INTER
+
+    rdc->single_ref_used_flag = 0;
+    rdc->compound_ref_used_flag = 0;
+
     encode_frame_internal(cpi);
 
     for (i = 0; i < REFERENCE_MODES; ++i)
       mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
 
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
-      int single_count_zero = 0;
-      int comp_count_zero = 0;
-
-      for (i = 0; i < COMP_INTER_CONTEXTS; i++) {
-        single_count_zero += counts->comp_inter[i][0];
-        comp_count_zero += counts->comp_inter[i][1];
-      }
-
-      if (comp_count_zero == 0) {
+      // Use a flag that includes 4x4 blocks
+      if (rdc->compound_ref_used_flag == 0) {
         cm->reference_mode = SINGLE_REFERENCE;
         av1_zero(counts->comp_inter);
 #if !CONFIG_REF_ADAPT
-      } else if (single_count_zero == 0) {
+        // Use a flag that includes 4x4 blocks
+      } else if (rdc->single_ref_used_flag == 0) {
         cm->reference_mode = COMPOUND_REFERENCE;
         av1_zero(counts->comp_inter);
 #endif  // !CONFIG_REF_ADAPT
@@ -5046,10 +5474,15 @@ void av1_encode_frame(AV1_COMP *cpi) {
 #endif  // CONFIG_EXT_INTER
 
 #if CONFIG_VAR_TX
+#if CONFIG_RECT_TX_EXT
+    if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0 &&
+        counts->quarter_tx_size[1] == 0)
+#else
     if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0)
+#endif
       cm->tx_mode = ALLOW_32X32 + CONFIG_TX64X64;
 #else
-#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT
+#if CONFIG_RECT_TX_EXT && CONFIG_EXT_TX
     if (cm->tx_mode == TX_MODE_SELECT && counts->quarter_tx_size[1] == 0) {
 #else
     if (cm->tx_mode == TX_MODE_SELECT) {
@@ -5232,12 +5665,20 @@ static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd,
                             const MODE_INFO *left_mi, const int intraonly,
                             const int mi_row, const int mi_col) {
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
+#if CONFIG_ENTROPY_STATS
   const PREDICTION_MODE y_mode = mbmi->mode;
-  const PREDICTION_MODE uv_mode = mbmi->uv_mode;
+  const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+#else   // CONFIG_ENTROPY_STATS
+  (void)counts;
+  (void)above_mi;
+  (void)left_mi;
+  (void)intraonly;
+#endif  // CONFIG_ENTROPY_STATS
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int unify_bsize = CONFIG_CB4X4;
 
   if (bsize < BLOCK_8X8 && !unify_bsize) {
+#if CONFIG_ENTROPY_STATS
     int idx, idy;
     const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
     const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
@@ -5253,7 +5694,9 @@ static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd,
           ++counts->y_mode[0][bmode];
         }
       }
+#endif  // CONFIG_ENTROPY_STATS
   } else {
+#if CONFIG_ENTROPY_STATS
     if (intraonly) {
       const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, 0);
       const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, 0);
@@ -5261,6 +5704,7 @@ static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd,
     } else {
       ++counts->y_mode[size_group_lookup[bsize]][y_mode];
     }
+#endif  // CONFIG_ENTROPY_STATS
 #if CONFIG_FILTER_INTRA
     if (mbmi->mode == DC_PRED
 #if CONFIG_PALETTE
@@ -5271,7 +5715,7 @@ static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd,
           mbmi->filter_intra_mode_info.use_filter_intra_mode[0];
       ++counts->filter_intra[0][use_filter_intra_mode];
     }
-    if (mbmi->uv_mode == DC_PRED
+    if (mbmi->uv_mode == UV_DC_PRED
 #if CONFIG_CB4X4
         &&
         is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
@@ -5306,7 +5750,9 @@ static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd,
   (void)mi_col;
   (void)xd;
 #endif
+#if CONFIG_ENTROPY_STATS
   ++counts->uv_mode[y_mode][uv_mode];
+#endif  // CONFIG_ENTROPY_STATS
 }
 
 #if CONFIG_VAR_TX
@@ -5325,9 +5771,17 @@ static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
+#if CONFIG_RECT_TX_EXT
+  if (tx_size == plane_tx_size ||
+      mbmi->tx_size == quarter_txsize_lookup[mbmi->sb_type]) {
+#else
   if (tx_size == plane_tx_size) {
+#endif
     ++counts->txfm_partition[ctx][0];
-    mbmi->tx_size = tx_size;
+#if CONFIG_RECT_TX_EXT
+    if (tx_size == plane_tx_size)
+#endif
+      mbmi->tx_size = tx_size;
     txfm_partition_update(xd->above_txfm_context + blk_col,
                           xd->left_txfm_context + blk_row, tx_size, tx_size);
   } else {
@@ -5438,18 +5892,22 @@ static void tx_partition_set_contexts(const AV1_COMMON *const cm,
 
 void av1_update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd,
 #if CONFIG_TXK_SEL
-                              int block, int plane,
+                              int blk_row, int blk_col, int block, int plane,
 #endif
                               BLOCK_SIZE bsize, TX_SIZE tx_size,
                               FRAME_COUNTS *counts) {
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   int is_inter = is_inter_block(mbmi);
+
 #if !CONFIG_TXK_SEL
   TX_TYPE tx_type = mbmi->tx_type;
 #else
+  (void)blk_row;
+  (void)blk_col;
   // Only y plane's tx_type is updated
   if (plane > 0) return;
-  TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size);
+  TX_TYPE tx_type =
+      av1_get_tx_type(PLANE_TYPE_Y, xd, blk_row, blk_col, block, tx_size);
 #endif
 #if CONFIG_EXT_TX
   if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 &&
@@ -5509,7 +5967,7 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
   x->pvq_coded = (dry_run == OUTPUT_ENABLED) ? 1 : 0;
 #endif
 #if CONFIG_CFL
-  x->cfl_store_y = (dry_run == OUTPUT_ENABLED) ? 1 : 0;
+  x->cfl_store_y = 1;
 #endif
 
   if (!is_inter) {
@@ -5526,13 +5984,8 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
 #if CONFIG_PALETTE
     if (bsize >= BLOCK_8X8 && !dry_run) {
       for (plane = 0; plane <= 1; ++plane) {
-        if (mbmi->palette_mode_info.palette_size[plane] > 0) {
-          mbmi->palette_mode_info.palette_first_color_idx[plane] =
-              xd->plane[plane].color_index_map[0];
-          // TODO(huisu): this increases the use of token buffer. Needs stretch
-          // test to verify.
+        if (mbmi->palette_mode_info.palette_size[plane] > 0)
           av1_tokenize_palette_sb(cpi, td, plane, t, dry_run, bsize, rate);
-        }
       }
     }
 #endif  // CONFIG_PALETTE
@@ -5559,9 +6012,21 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
       av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
                            &xd->block_refs[ref]->sf);
     }
-    av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, block_size);
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+    // Single ref compound mode
+    if (!is_compound && is_inter_singleref_comp_mode(mbmi->mode)) {
+      xd->block_refs[1] = xd->block_refs[0];
+      YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[0]);
+#if CONFIG_INTRABC
+      assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
+#else
+      assert(cfg != NULL);
+#endif  // !CONFIG_INTRABC
+      av1_setup_pre_planes(xd, 1, cfg, mi_row, mi_col, &xd->block_refs[1]->sf);
+    }
+#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
 
-    av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, NULL, block_size);
+    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, block_size);
 #if CONFIG_MOTION_VAR
     if (mbmi->motion_mode == OBMC_CAUSAL) {
 #if CONFIG_NCOBMC
@@ -5587,10 +6052,11 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
 #endif
   }
 
-#if CONFIG_DAALA_DIST && CONFIG_CB4X4
+#if CONFIG_DIST_8X8 && CONFIG_CB4X4
   if (bsize < BLOCK_8X8) {
-    daala_dist_set_sub8x8_dst(x, x->decoded_8x8, bsize, block_size_wide[bsize],
-                              block_size_high[bsize], mi_row, mi_col);
+    dist_8x8_set_sub8x8_dst(x, (uint8_t *)x->decoded_8x8, bsize,
+                            block_size_wide[bsize], block_size_high[bsize],
+                            mi_row, mi_col);
   }
 #endif
 
@@ -5629,13 +6095,16 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
 
       ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth];
 #endif
-#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT
+
+#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
       if (is_quarter_tx_allowed(xd, mbmi, is_inter) &&
-          mbmi->tx_size != txsize_sqr_up_map[mbmi->tx_size]) {
-        ++td->counts->quarter_tx_size[mbmi->tx_size ==
-                                      quarter_txsize_lookup[mbmi->sb_type]];
+          quarter_txsize_lookup[bsize] != max_txsize_rect_lookup[bsize] &&
+          (mbmi->tx_size == quarter_txsize_lookup[bsize] ||
+           mbmi->tx_size == max_txsize_rect_lookup[bsize])) {
+        ++td->counts
+              ->quarter_tx_size[mbmi->tx_size == quarter_txsize_lookup[bsize]];
       }
-#endif  // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT
+#endif
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
       assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
@@ -5673,8 +6142,8 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
     }
 
     ++td->counts->tx_size_totals[txsize_sqr_map[tx_size]];
-    ++td->counts
-          ->tx_size_totals[txsize_sqr_map[get_uv_tx_size(mbmi, &xd->plane[1])]];
+    ++td->counts->tx_size_totals[txsize_sqr_map[av1_get_uv_tx_size(
+        mbmi, &xd->plane[1])]];
 #if !CONFIG_TXK_SEL
     av1_update_tx_type_count(cm, xd, bsize, tx_size, td->counts);
 #endif
@@ -5837,7 +6306,7 @@ static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td,
 #if CONFIG_EXT_INTER
                                int mi_row_ori, int mi_col_ori,
 #endif  // CONFIG_EXT_INTER
-                               int mi_row_pred, int mi_col_pred,
+                               int mi_row_pred, int mi_col_pred, int plane,
                                BLOCK_SIZE bsize_pred, int b_sub8x8, int block) {
   // Used in supertx
   // (mi_row_ori, mi_col_ori): location for mv
@@ -5859,28 +6328,39 @@ static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td,
                          &xd->block_refs[ref]->sf);
   }
 
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+  // Single ref compound mode
+  if (!is_compound && is_inter_singleref_comp_mode(mbmi->mode)) {
+    xd->block_refs[1] = xd->block_refs[0];
+    YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[0]);
+    av1_setup_pre_planes(xd, 1, cfg, mi_row_pred, mi_col_pred,
+                         &xd->block_refs[1]->sf);
+  }
+#endif  // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+
   if (!b_sub8x8)
-    av1_build_inter_predictors_sb_extend(cm, xd,
+    av1_build_inter_predictor_sb_extend(cm, xd,
 #if CONFIG_EXT_INTER
-                                         mi_row_ori, mi_col_ori,
+                                        mi_row_ori, mi_col_ori,
 #endif  // CONFIG_EXT_INTER
-                                         mi_row_pred, mi_col_pred, bsize_pred);
+                                        mi_row_pred, mi_col_pred, plane,
+                                        bsize_pred);
   else
-    av1_build_inter_predictors_sb_sub8x8_extend(cm, xd,
+    av1_build_inter_predictor_sb_sub8x8_extend(cm, xd,
 #if CONFIG_EXT_INTER
-                                                mi_row_ori, mi_col_ori,
+                                               mi_row_ori, mi_col_ori,
 #endif  // CONFIG_EXT_INTER
-                                                mi_row_pred, mi_col_pred,
-                                                bsize_pred, block);
+                                               mi_row_pred, mi_col_pred, plane,
+                                               bsize_pred, block);
 }
 
 static void predict_b_extend(const AV1_COMP *const cpi, ThreadData *td,
                              const TileInfo *const tile, int block,
                              int mi_row_ori, int mi_col_ori, int mi_row_pred,
                              int mi_col_pred, int mi_row_top, int mi_col_top,
-                             uint8_t *dst_buf[3], int dst_stride[3],
+                             int plane, uint8_t *dst_buf, int dst_stride,
                              BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred,
-                             RUN_TYPE dry_run, int b_sub8x8, int bextend) {
+                             RUN_TYPE dry_run, int b_sub8x8) {
   // Used in supertx
   // (mi_row_ori, mi_col_ori): location for mv
   // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
@@ -5905,34 +6385,27 @@ static void predict_b_extend(const AV1_COMP *const cpi, ThreadData *td,
 
   set_offsets_extend(cpi, td, tile, mi_row_pred, mi_col_pred, mi_row_ori,
                      mi_col_ori, bsize_pred);
-  xd->plane[0].dst.stride = dst_stride[0];
-  xd->plane[1].dst.stride = dst_stride[1];
-  xd->plane[2].dst.stride = dst_stride[2];
-  xd->plane[0].dst.buf = dst_buf[0] +
-                         (r >> xd->plane[0].subsampling_y) * dst_stride[0] +
-                         (c >> xd->plane[0].subsampling_x);
-  xd->plane[1].dst.buf = dst_buf[1] +
-                         (r >> xd->plane[1].subsampling_y) * dst_stride[1] +
-                         (c >> xd->plane[1].subsampling_x);
-  xd->plane[2].dst.buf = dst_buf[2] +
-                         (r >> xd->plane[2].subsampling_y) * dst_stride[2] +
-                         (c >> xd->plane[2].subsampling_x);
+  xd->plane[plane].dst.stride = dst_stride;
+  xd->plane[plane].dst.buf =
+      dst_buf + (r >> xd->plane[plane].subsampling_y) * dst_stride +
+      (c >> xd->plane[plane].subsampling_x);
 
   predict_superblock(cpi, td,
 #if CONFIG_EXT_INTER
                      mi_row_ori, mi_col_ori,
 #endif  // CONFIG_EXT_INTER
-                     mi_row_pred, mi_col_pred, bsize_pred, b_sub8x8, block);
+                     mi_row_pred, mi_col_pred, plane, bsize_pred, b_sub8x8,
+                     block);
 
-  if (!dry_run && !bextend)
+  if (!dry_run && (plane == 0) && (block == 0 || !b_sub8x8))
     update_stats(&cpi->common, td, mi_row_pred, mi_col_pred, 1);
 }
 
 static void extend_dir(const AV1_COMP *const cpi, ThreadData *td,
                        const TileInfo *const tile, int block, BLOCK_SIZE bsize,
-                       BLOCK_SIZE top_bsize, int mi_row, int mi_col,
-                       int mi_row_top, int mi_col_top, RUN_TYPE dry_run,
-                       uint8_t *dst_buf[3], int dst_stride[3], int dir) {
+                       BLOCK_SIZE top_bsize, int mi_row_ori, int mi_col_ori,
+                       int mi_row, int mi_col, int mi_row_top, int mi_col_top,
+                       int plane, uint8_t *dst_buf, int dst_stride, int dir) {
   // dir: 0-lower, 1-upper, 2-left, 3-right
   //      4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
   MACROBLOCKD *xd = &td->mb.e_mbd;
@@ -5973,10 +6446,10 @@ static void extend_dir(const AV1_COMP *const cpi, ThreadData *td,
 
     for (j = 0; j < mi_height + ext_offset; j += high_unit)
       for (i = 0; i < mi_width + ext_offset; i += wide_unit)
-        predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred + j,
-                         mi_col_pred + i, mi_row_top, mi_col_top, dst_buf,
-                         dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8,
-                         1);
+        predict_b_extend(cpi, td, tile, block, mi_row_ori, mi_col_ori,
+                         mi_row_pred + j, mi_col_pred + i, mi_row_top,
+                         mi_col_top, plane, dst_buf, dst_stride, top_bsize,
+                         extend_bsize, 1, b_sub8x8);
   } else if (dir == 2 || dir == 3) {  // left and right
     extend_bsize =
         (mi_height == mi_size_high[BLOCK_8X8] || bsize < BLOCK_8X8 || yss < xss)
@@ -5996,10 +6469,10 @@ static void extend_dir(const AV1_COMP *const cpi, ThreadData *td,
 
     for (j = 0; j < mi_height + ext_offset; j += high_unit)
       for (i = 0; i < mi_width + ext_offset; i += wide_unit)
-        predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred + j,
-                         mi_col_pred + i, mi_row_top, mi_col_top, dst_buf,
-                         dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8,
-                         1);
+        predict_b_extend(cpi, td, tile, block, mi_row_ori, mi_col_ori,
+                         mi_row_pred + j, mi_col_pred + i, mi_row_top,
+                         mi_col_top, plane, dst_buf, dst_stride, top_bsize,
+                         extend_bsize, 1, b_sub8x8);
   } else {
     extend_bsize = BLOCK_8X8;
 #if CONFIG_CB4X4
@@ -6018,35 +6491,24 @@ static void extend_dir(const AV1_COMP *const cpi, ThreadData *td,
 
     for (j = 0; j < mi_height + ext_offset; j += high_unit)
       for (i = 0; i < mi_width + ext_offset; i += wide_unit)
-        predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred + j,
-                         mi_col_pred + i, mi_row_top, mi_col_top, dst_buf,
-                         dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8,
-                         1);
+        predict_b_extend(cpi, td, tile, block, mi_row_ori, mi_col_ori,
+                         mi_row_pred + j, mi_col_pred + i, mi_row_top,
+                         mi_col_top, plane, dst_buf, dst_stride, top_bsize,
+                         extend_bsize, 1, b_sub8x8);
   }
 }
 
 static void extend_all(const AV1_COMP *const cpi, ThreadData *td,
                        const TileInfo *const tile, int block, BLOCK_SIZE bsize,
-                       BLOCK_SIZE top_bsize, int mi_row, int mi_col,
-                       int mi_row_top, int mi_col_top, RUN_TYPE dry_run,
-                       uint8_t *dst_buf[3], int dst_stride[3]) {
+                       BLOCK_SIZE top_bsize, int mi_row_ori, int mi_col_ori,
+                       int mi_row, int mi_col, int mi_row_top, int mi_col_top,
+                       int plane, uint8_t *dst_buf, int dst_stride) {
   assert(block >= 0 && block < 4);
-  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, dry_run, dst_buf, dst_stride, 0);
-  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, dry_run, dst_buf, dst_stride, 1);
-  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, dry_run, dst_buf, dst_stride, 2);
-  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, dry_run, dst_buf, dst_stride, 3);
-  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, dry_run, dst_buf, dst_stride, 4);
-  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, dry_run, dst_buf, dst_stride, 5);
-  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, dry_run, dst_buf, dst_stride, 6);
-  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, dry_run, dst_buf, dst_stride, 7);
+  for (int i = 0; i < 8; ++i) {
+    extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row_ori, mi_col_ori,
+               mi_row, mi_col, mi_row_top, mi_col_top, plane, dst_buf,
+               dst_stride, i);
+  }
 }
 
 // This function generates prediction for multiple blocks, between which
@@ -6140,29 +6602,36 @@ static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td,
   switch (partition) {
     case PARTITION_NONE:
       assert(bsize < top_bsize);
-      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                       mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                       bsize, dry_run, 0, 0);
-      extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-                 mi_col_top, dry_run, dst_buf, dst_stride);
+      for (i = 0; i < MAX_MB_PLANE; ++i) {
+        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i],
+                         top_bsize, bsize, dry_run, 0);
+        extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col, mi_row,
+                   mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
+                   dst_stride[i]);
+      }
       break;
     case PARTITION_HORZ:
       if (bsize == BLOCK_8X8 && !unify_bsize) {
-        // Fisrt half
-        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                         mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                         BLOCK_8X8, dry_run, 1, 0);
-        if (bsize < top_bsize)
-          extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
-
-        // Second half
-        predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
-                         mi_row_top, mi_col_top, dst_buf1, dst_stride1,
-                         top_bsize, BLOCK_8X8, dry_run, 1, 1);
-        if (bsize < top_bsize)
-          extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+        for (i = 0; i < MAX_MB_PLANE; ++i) {
+          // First half
+          predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i],
+                           top_bsize, BLOCK_8X8, dry_run, 1);
+          if (bsize < top_bsize)
+            extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
+                       dst_stride[i]);
+
+          // Second half
+          predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, i, dst_buf1[i],
+                           dst_stride1[i], top_bsize, BLOCK_8X8, dry_run, 1);
+          if (bsize < top_bsize)
+            extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
+                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf1[i],
+                       dst_stride1[i]);
+        }
 
         // Smooth
         xd->plane[0].dst.buf = dst_buf[0];
@@ -6172,60 +6641,89 @@ static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td,
             mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
             0);
       } else {
-        // First half
-        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                         mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                         subsize, dry_run, 0, 0);
-        if (bsize < top_bsize)
-          extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
-        else
-          extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0);
-
-        if (mi_row + hbs < cm->mi_rows) {
-          // Second half
-          predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
-                           mi_col, mi_row_top, mi_col_top, dst_buf1,
-                           dst_stride1, top_bsize, subsize, dry_run, 0, 0);
-          if (bsize < top_bsize)
-            extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
-                       mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1,
-                       dst_stride1);
-          else
-            extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
-                       mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1,
-                       dst_stride1, 1);
-
-          // Smooth
-          for (i = 0; i < MAX_MB_PLANE; i++) {
+        for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_CB4X4
+          const struct macroblockd_plane *pd = &xd->plane[i];
+          int handle_chroma_sub8x8 = need_handle_chroma_sub8x8(
+              subsize, pd->subsampling_x, pd->subsampling_y);
+
+          if (handle_chroma_sub8x8) {
+            int mode_offset_row = CONFIG_CHROMA_SUB8X8 ? hbs : 0;
+
+            predict_b_extend(cpi, td, tile, 0, mi_row + mode_offset_row, mi_col,
+                             mi_row, mi_col, mi_row_top, mi_col_top, i,
+                             dst_buf[i], dst_stride[i], top_bsize, bsize,
+                             dry_run, 0);
+            if (bsize < top_bsize)
+              extend_all(cpi, td, tile, 0, bsize, top_bsize,
+                         mi_row + mode_offset_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i]);
+          } else {
+#endif
+            // First half
+            predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, i, dst_buf[i],
+                             dst_stride[i], top_bsize, subsize, dry_run, 0);
+            if (bsize < top_bsize)
+              extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
+                         dst_stride[i]);
+            else
+              extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
+                         dst_stride[i], 0);
             xd->plane[i].dst.buf = dst_buf[i];
             xd->plane[i].dst.stride = dst_stride[i];
-            av1_build_masked_inter_predictor_complex(
-                xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
-                mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-                PARTITION_HORZ, i);
+
+            if (mi_row + hbs < cm->mi_rows) {
+              // Second half
+              predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col,
+                               mi_row + hbs, mi_col, mi_row_top, mi_col_top, i,
+                               dst_buf1[i], dst_stride1[i], top_bsize, subsize,
+                               dry_run, 0);
+              if (bsize < top_bsize)
+                extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
+                           mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                           i, dst_buf1[i], dst_stride1[i]);
+              else
+                extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
+                           mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                           i, dst_buf1[i], dst_stride1[i], 1);
+              // Smooth
+              xd->plane[i].dst.buf = dst_buf[i];
+              xd->plane[i].dst.stride = dst_stride[i];
+              av1_build_masked_inter_predictor_complex(
+                  xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+                  mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+                  PARTITION_HORZ, i);
+            }
+#if CONFIG_CB4X4
           }
+#endif
         }
       }
       break;
     case PARTITION_VERT:
       if (bsize == BLOCK_8X8 && !unify_bsize) {
-        // First half
-        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                         mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                         BLOCK_8X8, dry_run, 1, 0);
-        if (bsize < top_bsize)
-          extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
-
-        // Second half
-        predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
-                         mi_row_top, mi_col_top, dst_buf1, dst_stride1,
-                         top_bsize, BLOCK_8X8, dry_run, 1, 1);
-        if (bsize < top_bsize)
-          extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+        for (i = 0; i < MAX_MB_PLANE; ++i) {
+          // First half
+          predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i],
+                           top_bsize, BLOCK_8X8, dry_run, 1);
+          if (bsize < top_bsize)
+            extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
+                       dst_stride[i]);
+
+          // Second half
+          predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, i, dst_buf1[i],
+                           dst_stride1[i], top_bsize, BLOCK_8X8, dry_run, 1);
+          if (bsize < top_bsize)
+            extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
+                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf1[i],
+                       dst_stride1[i]);
+        }
 
         // Smooth
         xd->plane[0].dst.buf = dst_buf[0];
@@ -6235,66 +6733,160 @@ static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td,
             mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
             0);
       } else {
-        // bsize: not important, not useful
-        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                         mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                         subsize, dry_run, 0, 0);
-        if (bsize < top_bsize)
-          extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
-        else
-          extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3);
-
-        if (mi_col + hbs < cm->mi_cols) {
-          predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
-                           mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
-                           dst_stride1, top_bsize, subsize, dry_run, 0, 0);
-          if (bsize < top_bsize)
-            extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row,
-                       mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1,
-                       dst_stride1);
-          else
-            extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row,
-                       mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1,
-                       dst_stride1, 2);
-
-          for (i = 0; i < MAX_MB_PLANE; i++) {
+        for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_CB4X4
+          const struct macroblockd_plane *pd = &xd->plane[i];
+          int handle_chroma_sub8x8 = need_handle_chroma_sub8x8(
+              subsize, pd->subsampling_x, pd->subsampling_y);
+
+          if (handle_chroma_sub8x8) {
+            int mode_offset_col = CONFIG_CHROMA_SUB8X8 ? hbs : 0;
+
+            predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + mode_offset_col,
+                             mi_row, mi_col, mi_row_top, mi_col_top, i,
+                             dst_buf[i], dst_stride[i], top_bsize, bsize,
+                             dry_run, 0);
+            if (bsize < top_bsize)
+              extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row,
+                         mi_col + mode_offset_col, mi_row, mi_col, mi_row_top,
+                         mi_col_top, i, dst_buf[i], dst_stride[i]);
+          } else {
+#endif
+            predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, i, dst_buf[i],
+                             dst_stride[i], top_bsize, subsize, dry_run, 0);
+            if (bsize < top_bsize)
+              extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
+                         dst_stride[i]);
+            else
+              extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
+                         dst_stride[i], 3);
             xd->plane[i].dst.buf = dst_buf[i];
             xd->plane[i].dst.stride = dst_stride[i];
-            av1_build_masked_inter_predictor_complex(
-                xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
-                mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
-                PARTITION_VERT, i);
+
+            if (mi_col + hbs < cm->mi_cols) {
+              predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+                               mi_col + hbs, mi_row_top, mi_col_top, i,
+                               dst_buf1[i], dst_stride1[i], top_bsize, subsize,
+                               dry_run, 0);
+              if (bsize < top_bsize)
+                extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row,
+                           mi_col + hbs, mi_row, mi_col + hbs, mi_row_top,
+                           mi_col_top, i, dst_buf1[i], dst_stride1[i]);
+              else
+                extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row,
+                           mi_col + hbs, mi_row, mi_col + hbs, mi_row_top,
+                           mi_col_top, i, dst_buf1[i], dst_stride1[i], 2);
+
+              // smooth
+              xd->plane[i].dst.buf = dst_buf[i];
+              xd->plane[i].dst.stride = dst_stride[i];
+              av1_build_masked_inter_predictor_complex(
+                  xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+                  mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+                  PARTITION_VERT, i);
+            }
+#if CONFIG_CB4X4
           }
+#endif
         }
       }
       break;
     case PARTITION_SPLIT:
       if (bsize == BLOCK_8X8 && !unify_bsize) {
-        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
-                         mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                         BLOCK_8X8, dry_run, 1, 0);
-        predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
-                         mi_row_top, mi_col_top, dst_buf1, dst_stride1,
-                         top_bsize, BLOCK_8X8, dry_run, 1, 1);
-        predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
-                         mi_row_top, mi_col_top, dst_buf2, dst_stride2,
-                         top_bsize, BLOCK_8X8, dry_run, 1, 1);
-        predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col,
-                         mi_row_top, mi_col_top, dst_buf3, dst_stride3,
-                         top_bsize, BLOCK_8X8, dry_run, 1, 1);
-
-        if (bsize < top_bsize) {
-          extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
-          extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
-          extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
-          extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, dry_run, dst_buf3, dst_stride3);
+        for (i = 0; i < MAX_MB_PLANE; i++) {
+          predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i],
+                           top_bsize, BLOCK_8X8, dry_run, 1);
+          predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, i, dst_buf1[i],
+                           dst_stride1[i], top_bsize, BLOCK_8X8, dry_run, 1);
+          predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, i, dst_buf2[i],
+                           dst_stride2[i], top_bsize, BLOCK_8X8, dry_run, 1);
+          predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, i, dst_buf3[i],
+                           dst_stride3[i], top_bsize, BLOCK_8X8, dry_run, 1);
+
+          if (bsize < top_bsize) {
+            extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
+                       dst_stride[i]);
+            extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
+                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf1[i],
+                       dst_stride1[i]);
+            extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
+                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf2[i],
+                       dst_stride2[i]);
+            extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col,
+                       mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf3[i],
+                       dst_stride3[i]);
+          }
+        }
+#if CONFIG_CB4X4
+      } else if (bsize == BLOCK_8X8) {
+        for (i = 0; i < MAX_MB_PLANE; i++) {
+          const struct macroblockd_plane *pd = &xd->plane[i];
+          int handle_chroma_sub8x8 = need_handle_chroma_sub8x8(
+              subsize, pd->subsampling_x, pd->subsampling_y);
+
+          if (handle_chroma_sub8x8) {
+            int mode_offset_row =
+                CONFIG_CHROMA_SUB8X8 && mi_row + hbs < cm->mi_rows ? hbs : 0;
+            int mode_offset_col =
+                CONFIG_CHROMA_SUB8X8 && mi_col + hbs < cm->mi_cols ? hbs : 0;
+
+            predict_b_extend(cpi, td, tile, 0, mi_row + mode_offset_row,
+                             mi_col + mode_offset_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, i, dst_buf[i],
+                             dst_stride[i], top_bsize, BLOCK_8X8, dry_run, 0);
+            if (bsize < top_bsize)
+              extend_all(cpi, td, tile, 0, BLOCK_8X8, top_bsize,
+                         mi_row + mode_offset_row, mi_col + mode_offset_col,
+                         mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
+                         dst_stride[i]);
+          } else {
+            predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, i, dst_buf[i],
+                             dst_stride[i], top_bsize, subsize, dry_run, 0);
+            if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+              predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+                               mi_col + hbs, mi_row_top, mi_col_top, i,
+                               dst_buf1[i], dst_stride1[i], top_bsize, subsize,
+                               dry_run, 0);
+            if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
+              predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col,
+                               mi_row + hbs, mi_col, mi_row_top, mi_col_top, i,
+                               dst_buf2[i], dst_stride2[i], top_bsize, subsize,
+                               dry_run, 0);
+            if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+              predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
+                               mi_row + hbs, mi_col + hbs, mi_row_top,
+                               mi_col_top, i, dst_buf3[i], dst_stride3[i],
+                               top_bsize, subsize, dry_run, 0);
+
+            if (bsize < top_bsize) {
+              extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i],
+                         dst_stride[i]);
+              if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+                extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row,
+                           mi_col + hbs, mi_row, mi_col + hbs, mi_row_top,
+                           mi_col_top, i, dst_buf1[i], dst_stride1[i]);
+              if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
+                extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
+                           mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                           i, dst_buf2[i], dst_stride2[i]);
+              if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+                extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
+                           mi_col + hbs, mi_row + hbs, mi_col + hbs, mi_row_top,
+                           mi_col_top, i, dst_buf3[i], dst_stride3[i]);
+            }
+          }
         }
+#endif
       } else {
         predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row_top,
                            mi_col_top, dry_run, subsize, top_bsize, dst_buf,
@@ -6314,10 +6906,16 @@ static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td,
                              pc_tree->split[3]);
       }
       for (i = 0; i < MAX_MB_PLANE; i++) {
-#if !CONFIG_CB4X4
+#if CONFIG_CB4X4
+        const struct macroblockd_plane *pd = &xd->plane[i];
+        int handle_chroma_sub8x8 = need_handle_chroma_sub8x8(
+            subsize, pd->subsampling_x, pd->subsampling_y);
+        if (handle_chroma_sub8x8) continue;  // Skip <4x4 chroma smoothing
+#else
         if (bsize == BLOCK_8X8 && i != 0)
           continue;  // Skip <4x4 chroma smoothing
 #endif
+
         if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
           av1_build_masked_inter_predictor_complex(
               xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
@@ -6334,9 +6932,6 @@ static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td,
                 PARTITION_HORZ, i);
           }
         } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
-          if (bsize == BLOCK_8X8 && i != 0)
-            continue;  // Skip <4x4 chroma smoothing
-
           av1_build_masked_inter_predictor_complex(
               xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i],
               mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
@@ -6660,8 +7255,7 @@ static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td,
       *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
       x->skip = 1;
     } else {
-      if (RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist) <
-          RDCOST(x->rdmult, x->rddiv, 0, sse)) {
+      if (RDCOST(x->rdmult, *tmp_rate, *tmp_dist) < RDCOST(x->rdmult, 0, sse)) {
         *tmp_rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
         x->skip = 0;
       } else {
@@ -6671,7 +7265,7 @@ static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td,
       }
     }
     *tmp_rate += base_rate;
-    rd_tx = RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist);
+    rd_tx = RDCOST(x->rdmult, *tmp_rate, *tmp_dist);
     if (rd_tx < bestrd_tx * 0.99 || tx_type == DCT_DCT) {
       *best_tx = tx_type;
       bestrd_tx = rd_tx;