Diffstat (limited to 'third_party/aom/av1/common/reconinter.c')
-rw-r--r--  third_party/aom/av1/common/reconinter.c  |  714
1 file changed, 453 insertions(+), 261 deletions(-)
diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c
index fa37a992f..a1b5c1f67 100644
--- a/third_party/aom/av1/common/reconinter.c
+++ b/third_party/aom/av1/common/reconinter.c
@@ -35,14 +35,14 @@ DECLARE_ALIGNED(16, static uint8_t,
[MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
DECLARE_ALIGNED(16, static uint8_t,
- wedge_signflip_lookup[BLOCK_SIZES][MAX_WEDGE_TYPES]);
+ wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]);
-// 3 * MAX_WEDGE_SQUARE is an easy to compute and fairly tight upper bound
+// 4 * MAX_WEDGE_SQUARE is an easy to compute and fairly tight upper bound
// on the sum of all mask sizes up to and including MAX_WEDGE_SQUARE.
DECLARE_ALIGNED(16, static uint8_t,
- wedge_mask_buf[2 * MAX_WEDGE_TYPES * 3 * MAX_WEDGE_SQUARE]);
+ wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]);
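A quick check of the new constant (a sketch, assuming MAX_WEDGE_SIZE == 32 so MAX_WEDGE_SQUARE == 1024, and that wedges remain enabled for the square and 2:1 sizes from 8x8 through 32x32 plus the four 1:4/4:1 sizes added further down):

    /* existing: 8x8 + 8x16 + 16x8 + 16x16 + 16x32 + 32x16 + 32x32
     *         = 64 + 128 + 128 + 256 + 512 + 512 + 1024 = 2624
     * added:   4x16 + 16x4 + 8x32 + 32x8 = 64 + 64 + 256 + 256 = 640
     * total:   3264, which exceeds 3 * 1024 but fits in 4 * 1024 */

so the old 3 * MAX_WEDGE_SQUARE bound no longer holds once the new sizes get wedge masks.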
-static wedge_masks_type wedge_masks[BLOCK_SIZES][2];
+static wedge_masks_type wedge_masks[BLOCK_SIZES_ALL][2];
// Some unused wedge codebooks left temporarily to facilitate experiments.
// To be removed when settled.
@@ -159,12 +159,12 @@ static const wedge_code_type wedge_codebook_16_heqw[16] = {
{ WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};
-const wedge_params_type wedge_params_lookup[BLOCK_SIZES] = {
-#if CONFIG_CB4X4
+const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL] = {
+#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
{ 0, NULL, NULL, 0, NULL },
{ 0, NULL, NULL, 0, NULL },
{ 0, NULL, NULL, 0, NULL },
-#endif // CONFIG_CB4X4
+#endif // CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
{ 0, NULL, NULL, 0, NULL },
{ 0, NULL, NULL, 0, NULL },
{ 0, NULL, NULL, 0, NULL },
@@ -216,6 +216,14 @@ const wedge_params_type wedge_params_lookup[BLOCK_SIZES] = {
{ 0, NULL, NULL, 0, NULL },
{ 0, NULL, NULL, 0, NULL },
#endif // CONFIG_EXT_PARTITION
+ { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_4X16], 0,
+ wedge_masks[BLOCK_4X16] },
+ { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X4], 0,
+ wedge_masks[BLOCK_16X4] },
+ { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], 0,
+ wedge_masks[BLOCK_8X32] },
+ { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], 0,
+ wedge_masks[BLOCK_32X8] },
};
static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg,
@@ -349,6 +357,47 @@ void build_compound_seg_mask_highbd(uint8_t *mask, SEG_MASK_TYPE mask_type,
#elif COMPOUND_SEGMENT_TYPE == 1
#define DIFF_FACTOR 16
+
+#if CONFIG_CONVOLVE_ROUND
+static void diffwtd_mask_d32(uint8_t *mask, int which_inverse, int mask_base,
+ const int32_t *src0, int src0_stride,
+ const int32_t *src1, int src1_stride,
+ BLOCK_SIZE sb_type, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ int round =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+ int i, j, m, diff;
+ int block_stride = block_size_wide[sb_type];
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ diff = abs(src0[i * src0_stride + j] - src1[i * src1_stride + j]);
+ diff = ROUND_POWER_OF_TWO(diff, round);
+ m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
+ mask[i * block_stride + j] =
+ which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
+ }
+ }
+}
+
+static void build_compound_seg_mask_d32(uint8_t *mask, SEG_MASK_TYPE mask_type,
+ const int32_t *src0, int src0_stride,
+ const int32_t *src1, int src1_stride,
+ BLOCK_SIZE sb_type, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ switch (mask_type) {
+ case DIFFWTD_38:
+ diffwtd_mask_d32(mask, 0, 38, src0, src0_stride, src1, src1_stride,
+ sb_type, h, w, conv_params, bd);
+ break;
+ case DIFFWTD_38_INV:
+ diffwtd_mask_d32(mask, 1, 38, src0, src0_stride, src1, src1_stride,
+ sb_type, h, w, conv_params, bd);
+ break;
+ default: assert(0);
+ }
+}
+#endif  // CONFIG_CONVOLVE_ROUND
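To make the d32 scaling concrete, a worked example (hypothetical values; assumes FILTER_BITS == 7, AOM_BLEND_A64_MAX_ALPHA == 64, and a round_0 == 5 / round_1 == 0 split):

    /* bd == 8: round = 2 * 7 - 5 - 0 + (8 - 8) = 9                    */
    /* a 32-bit difference of 30720 -> ROUND_POWER_OF_TWO(., 9) == 60  */
    /* DIFFWTD_38: m = clamp(38 + 60 / 16, 0, 64) == 41                */
    /* DIFFWTD_38_INV stores 64 - 41 == 23 instead                     */

i.e. the extra shift simply maps the unrounded accumulator difference back into the 8-bit pixel domain before the usual DIFF_FACTOR scaling.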
+
static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base,
const uint8_t *src0, int src0_stride,
const uint8_t *src1, int src1_stride,
@@ -531,7 +580,7 @@ static void init_wedge_master_masks() {
static void init_wedge_signs() {
BLOCK_SIZE sb_type;
memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup));
- for (sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES; ++sb_type) {
+ for (sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES_ALL; ++sb_type) {
const int bw = block_size_wide[sb_type];
const int bh = block_size_high[sb_type];
const wedge_params_type wedge_params = wedge_params_lookup[sb_type];
@@ -565,7 +614,7 @@ static void init_wedge_masks() {
uint8_t *dst = wedge_mask_buf;
BLOCK_SIZE bsize;
memset(wedge_masks, 0, sizeof(wedge_masks));
- for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES; ++bsize) {
+ for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
const uint8_t *mask;
const int bw = block_size_wide[bsize];
const int bh = block_size_high[bsize];
@@ -656,6 +705,21 @@ static void build_masked_compound_wedge_extend_highbd(
}
#endif // CONFIG_HIGHBITDEPTH
#else
+#if CONFIG_CONVOLVE_ROUND
+static void build_masked_compound_no_round(
+ CONV_BUF_TYPE *dst, int dst_stride, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
+ const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ aom_blend_a64_d32_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, block_size_wide[sb_type], h, w, subh, subw);
+}
+#endif // CONFIG_CONVOLVE_ROUND
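The subh/subw derivation compares the block's nominal (luma) dimensions against the h and w actually being blended; for example (assuming b_height_log2_lookup[] measures height in 4-pixel units, so BLOCK_8X8 maps to 1):

    /* BLOCK_8X8 luma:             h == 8, (2 << 1) == 4 != 8 -> subh == 0 */
    /* BLOCK_8X8 chroma, ssy == 1: h == 4, (2 << 1) == 4 == 4 -> subh == 1,
     * so the blend reads the luma-resolution mask subsampled by 2.        */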
static void build_masked_compound(
uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
const uint8_t *src1, int src1_stride,
@@ -694,7 +758,7 @@ void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
uint8_t *dst, int dst_stride,
const int subpel_x, const int subpel_y,
const struct scale_factors *sf, int w,
- int h,
+ int h, ConvolveParams *conv_params,
#if CONFIG_DUAL_FILTER
const InterpFilter *interp_filter,
#else
@@ -710,7 +774,8 @@ void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
int p_col, int p_row, int ref,
#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
MACROBLOCKD *xd) {
- MODE_INFO *mi = xd->mi[0];
+ const MODE_INFO *mi = xd->mi[0];
+
const INTERINTER_COMPOUND_DATA comp_data = {
#if CONFIG_WEDGE
mi->mbmi.wedge_index,
@@ -722,24 +787,27 @@ void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
#endif // CONFIG_COMPOUND_SEGMENT
mi->mbmi.interinter_compound_type
};
-// The prediction filter types used here should be those for
-// the second reference block.
-#if CONFIG_DUAL_FILTER
- InterpFilter tmp_ipf[4] = {
- interp_filter[2], interp_filter[3], interp_filter[2], interp_filter[3],
- };
-#else
- InterpFilter tmp_ipf = interp_filter;
-#endif // CONFIG_DUAL_FILTER
- ConvolveParams conv_params = get_conv_params(0, plane);
#if CONFIG_HIGHBITDEPTH
+#if CONFIG_CONVOLVE_ROUND
+ DECLARE_ALIGNED(16, CONV_BUF_TYPE, tmp_dst2[MAX_SB_SQUARE]);
+ int tmp_dst2_stride = MAX_SB_SIZE;
+ CONV_BUF_TYPE *org_dst = conv_params->dst;
+ int org_dst_stride = conv_params->dst_stride;
+ if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
+ memset(tmp_dst2, 0, sizeof(tmp_dst2));
+ conv_params->dst = tmp_dst2;
+ conv_params->dst_stride = tmp_dst2_stride;
+    // Masked compound does its own averaging, so disable it here.
+ conv_params->do_average = 0;
+ }
+#endif // CONFIG_CONVOLVE_ROUND
DECLARE_ALIGNED(16, uint8_t, tmp_dst_[2 * MAX_SB_SQUARE]);
uint8_t *tmp_dst = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
? CONVERT_TO_BYTEPTR(tmp_dst_)
: tmp_dst_;
av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_x,
- subpel_y, sf, w, h, &conv_params, tmp_ipf,
+ subpel_y, sf, w, h, conv_params, interp_filter,
#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
warp_types, p_col, p_row, plane, ref,
#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
@@ -749,14 +817,26 @@ void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
xs, ys, xd);
#if CONFIG_COMPOUND_SEGMENT
if (!plane && comp_data.interinter_compound_type == COMPOUND_SEG) {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- build_compound_seg_mask_highbd(comp_data.seg_mask, comp_data.mask_type,
- dst, dst_stride, tmp_dst, MAX_SB_SIZE,
- mi->mbmi.sb_type, h, w, xd->bd);
- else
- build_compound_seg_mask(comp_data.seg_mask, comp_data.mask_type, dst,
- dst_stride, tmp_dst, MAX_SB_SIZE,
- mi->mbmi.sb_type, h, w);
+#if CONFIG_CONVOLVE_ROUND
+ if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
+ build_compound_seg_mask_d32(comp_data.seg_mask, comp_data.mask_type,
+ org_dst, org_dst_stride, tmp_dst2,
+ tmp_dst2_stride, mi->mbmi.sb_type, h, w,
+ conv_params, xd->bd);
+ } else {
+#endif // CONFIG_CONVOLVE_ROUND
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ build_compound_seg_mask_highbd(comp_data.seg_mask, comp_data.mask_type,
+ dst, dst_stride, tmp_dst, MAX_SB_SIZE,
+ mi->mbmi.sb_type, h, w, xd->bd);
+ } else {
+ build_compound_seg_mask(comp_data.seg_mask, comp_data.mask_type, dst,
+ dst_stride, tmp_dst, MAX_SB_SIZE,
+ mi->mbmi.sb_type, h, w);
+ }
+#if CONFIG_CONVOLVE_ROUND
+ }
+#endif  // CONFIG_CONVOLVE_ROUND
}
#endif // CONFIG_COMPOUND_SEGMENT
@@ -770,19 +850,55 @@ void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE, &comp_data,
mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w);
#else
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- build_masked_compound_highbd(dst, dst_stride, dst, dst_stride, tmp_dst,
- MAX_SB_SIZE, &comp_data, mi->mbmi.sb_type, h,
- w, xd->bd);
- else
- build_masked_compound(dst, dst_stride, dst, dst_stride, tmp_dst,
- MAX_SB_SIZE, &comp_data, mi->mbmi.sb_type, h, w);
+#if CONFIG_CONVOLVE_ROUND
+ if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
+ build_masked_compound_no_round(org_dst, org_dst_stride, org_dst,
+ org_dst_stride, tmp_dst2, tmp_dst2_stride,
+ &comp_data, mi->mbmi.sb_type, h, w);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ av1_highbd_convolve_rounding(
+ org_dst, org_dst_stride, dst, dst_stride, w, h,
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1,
+ xd->bd);
+ } else {
+ av1_convolve_rounding(
+ org_dst, org_dst_stride, dst, dst_stride, w, h,
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1);
+ }
+ conv_params->do_post_rounding = 0;
+ } else {
+#endif // CONFIG_CONVOLVE_ROUND
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ build_masked_compound_highbd(dst, dst_stride, dst, dst_stride, tmp_dst,
+ MAX_SB_SIZE, &comp_data, mi->mbmi.sb_type, h,
+ w, xd->bd);
+ } else {
+ build_masked_compound(dst, dst_stride, dst, dst_stride, tmp_dst,
+ MAX_SB_SIZE, &comp_data, mi->mbmi.sb_type, h, w);
+ }
+#if CONFIG_CONVOLVE_ROUND
+ }
+#endif // CONFIG_CONVOLVE_ROUND
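Taken together, the CONVOLVE_OPT_NO_ROUND masked-compound path above stays in the 32-bit domain until the very end (a flow summary of the code above, not new behavior):

    /* 1. ref0 is already convolved into org_dst without rounding.      */
    /* 2. ref1 is convolved into tmp_dst2 with do_average == 0.         */
    /* 3. for COMPOUND_SEG, the seg mask is built from the d32 diffs.   */
    /* 4. org_dst and tmp_dst2 are blended via the A64 mask.            */
    /* 5. av1_convolve_rounding()/av1_highbd_convolve_rounding() shifts */
    /*    the result back to pixels, and do_post_rounding is cleared so */
    /*    the caller does not round again.                              */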
#endif // CONFIG_SUPERTX
#else // CONFIG_HIGHBITDEPTH
+
+#if CONFIG_CONVOLVE_ROUND
+ DECLARE_ALIGNED(16, CONV_BUF_TYPE, tmp_dst2[MAX_SB_SQUARE]);
+ int tmp_dst2_stride = MAX_SB_SIZE;
+ CONV_BUF_TYPE *org_dst = conv_params->dst;
+ int org_dst_stride = conv_params->dst_stride;
+ if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
+ memset(tmp_dst2, 0, sizeof(tmp_dst2));
+ conv_params->dst = tmp_dst2;
+ conv_params->dst_stride = tmp_dst2_stride;
+    // Masked compound does its own averaging, so disable it here.
+ conv_params->do_average = 0;
+ }
+#endif  // CONFIG_CONVOLVE_ROUND
DECLARE_ALIGNED(16, uint8_t, tmp_dst[MAX_SB_SQUARE]);
av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_x,
- subpel_y, sf, w, h, &conv_params, tmp_ipf,
+ subpel_y, sf, w, h, conv_params, interp_filter,
#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
warp_types, p_col, p_row, plane, ref,
#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
@@ -791,18 +907,43 @@ void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
#endif
xs, ys, xd);
#if CONFIG_COMPOUND_SEGMENT
- if (!plane && comp_data.interinter_compound_type == COMPOUND_SEG)
- build_compound_seg_mask(comp_data.seg_mask, comp_data.mask_type, dst,
- dst_stride, tmp_dst, MAX_SB_SIZE, mi->mbmi.sb_type,
- h, w);
+ if (!plane && comp_data.interinter_compound_type == COMPOUND_SEG) {
+#if CONFIG_CONVOLVE_ROUND
+ if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
+ build_compound_seg_mask_d32(
+ comp_data.seg_mask, comp_data.mask_type, org_dst, org_dst_stride,
+ tmp_dst2, tmp_dst2_stride, mi->mbmi.sb_type, h, w, conv_params, 8);
+ } else {
+#endif // CONFIG_CONVOLVE_ROUND
+ build_compound_seg_mask(comp_data.seg_mask, comp_data.mask_type, dst,
+ dst_stride, tmp_dst, MAX_SB_SIZE,
+ mi->mbmi.sb_type, h, w);
+#if CONFIG_CONVOLVE_ROUND
+ }
+#endif  // CONFIG_CONVOLVE_ROUND
+ }
#endif // CONFIG_COMPOUND_SEGMENT
#if CONFIG_SUPERTX
build_masked_compound_wedge_extend(dst, dst_stride, dst, dst_stride, tmp_dst,
MAX_SB_SIZE, &comp_data, mi->mbmi.sb_type,
wedge_offset_x, wedge_offset_y, h, w);
#else
- build_masked_compound(dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE,
- &comp_data, mi->mbmi.sb_type, h, w);
+#if CONFIG_CONVOLVE_ROUND
+ if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
+ build_masked_compound_no_round(org_dst, org_dst_stride, org_dst,
+ org_dst_stride, tmp_dst2, tmp_dst2_stride,
+ &comp_data, mi->mbmi.sb_type, h, w);
+ av1_convolve_rounding(
+ org_dst, org_dst_stride, dst, dst_stride, w, h,
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1);
+ conv_params->do_post_rounding = 0;
+ } else {
+#endif // CONFIG_CONVOLVE_ROUND
+ build_masked_compound(dst, dst_stride, dst, dst_stride, tmp_dst,
+ MAX_SB_SIZE, &comp_data, mi->mbmi.sb_type, h, w);
+#if CONFIG_CONVOLVE_ROUND
+ }
+#endif // CONFIG_CONVOLVE_ROUND
#endif // CONFIG_SUPERTX
#endif // CONFIG_HIGHBITDEPTH
#if CONFIG_COMPOUND_SEGMENT
@@ -832,11 +973,14 @@ void av1_highbd_build_inter_predictor(
const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
is_q4 ? src_mv->col : src_mv->col * 2 };
MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
- const int subpel_x = mv.col & SUBPEL_MASK;
- const int subpel_y = mv.row & SUBPEL_MASK;
- ConvolveParams conv_params = get_conv_params(ref, plane);
+ mv.col += SCALE_EXTRA_OFF;
+ mv.row += SCALE_EXTRA_OFF;
+ const int subpel_x = mv.col & SCALE_SUBPEL_MASK;
+ const int subpel_y = mv.row & SCALE_SUBPEL_MASK;
+ ConvolveParams conv_params = get_conv_params(ref, ref, plane);
- src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
+ src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
+ (mv.col >> SCALE_SUBPEL_BITS);
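The MV handling here moves from q4 (1/16-pel) to the finer scaled-subpel grid; a worked example, assuming the usual scale constants (SCALE_SUBPEL_BITS == 10, SCALE_EXTRA_OFF == 32, i.e. half a q4 step) and that av1_scale_mv() now returns q10 components:

    /* mv.col == -192 (was -3 in q4); -192 + 32 == -160     */
    /* integer offset: -160 >> 10 == -1 column              */
    /* subpel_x:       -160 & 1023 == 864 (of 1024 phases)  */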
av1_make_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
sf, w, h, &conv_params, interp_filter,
@@ -869,10 +1013,13 @@ void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
is_q4 ? src_mv->col : src_mv->col * 2 };
MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
- const int subpel_x = mv.col & SUBPEL_MASK;
- const int subpel_y = mv.row & SUBPEL_MASK;
+ mv.col += SCALE_EXTRA_OFF;
+ mv.row += SCALE_EXTRA_OFF;
+ const int subpel_x = mv.col & SCALE_SUBPEL_MASK;
+ const int subpel_y = mv.row & SCALE_SUBPEL_MASK;
- src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
+ src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
+ (mv.col >> SCALE_SUBPEL_BITS);
av1_make_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
sf, w, h, conv_params, interp_filter,
@@ -909,6 +1056,10 @@ void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane,
const MODE_INFO *mi = xd->mi[0];
#endif // CONFIG_MOTION_VAR
int is_compound = has_second_ref(&mi->mbmi);
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+ int is_comp_mode_pred =
+ is_compound || is_inter_singleref_comp_mode(mi->mbmi.mode);
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
int ref;
#if CONFIG_INTRABC
const int is_intrabc = is_intrabc_block(&mi->mbmi);
@@ -920,6 +1071,9 @@ void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane,
WarpedMotionParams *const wm = &xd->global_motion[mi->mbmi.ref_frame[ref]];
is_global[ref] = is_global_mv_block(mi, block, wm->wmtype);
}
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+ if (!is_compound && is_comp_mode_pred) is_global[1] = is_global[0];
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
#endif // CONFIG_GLOBAL_MOTION
#if CONFIG_CB4X4
@@ -974,7 +1128,8 @@ void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane,
for (idx = 0; idx < b8_w; idx += b4_w) {
MB_MODE_INFO *this_mbmi = &xd->mi[row * xd->mi_stride + col]->mbmi;
is_compound = has_second_ref(this_mbmi);
-
+        // TODO(zoeliu): Taking single-ref comp modes into account here caused
+        // a mismatch; this needs further investigation.
for (ref = 0; ref < 1 + is_compound; ++ref) {
struct buf_2d *const dst_buf = &pd->dst;
@@ -995,23 +1150,19 @@ void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane,
#if CONFIG_INTRABC
const struct scale_factors *const sf =
- is_intrabc ? &xd->sf_identity : &xd->block_refs[ref]->sf;
+ is_intrabc ? &xd->sf_identity : &ref_buf->sf;
struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
#else
- const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ const struct scale_factors *const sf = &ref_buf->sf;
struct buf_2d *const pre_buf = &pd->pre[ref];
#endif // CONFIG_INTRABC
uint8_t *dst = dst_buf->buf;
const MV mv = this_mbmi->mv[ref].as_mv;
- const MV mv_q4 = clamp_mv_to_umv_border_sb(
- xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
uint8_t *pre;
- MV32 scaled_mv;
int xs, ys, subpel_x, subpel_y;
const int is_scaled = av1_is_scaled(sf);
- ConvolveParams conv_params = get_conv_params(ref, plane);
#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
WarpTypesAllowed warp_types;
#if CONFIG_GLOBAL_MOTION
@@ -1029,28 +1180,55 @@ void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane,
dst += dst_buf->stride * y + x;
if (is_scaled) {
- pre =
- pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
- scaled_mv = av1_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+ int ssx = pd->subsampling_x;
+ int ssy = pd->subsampling_y;
+ int orig_pos_y = (mi_y << (SUBPEL_BITS - ssy)) + (y << SUBPEL_BITS);
+ orig_pos_y += mv.row * (1 << (1 - ssy));
+ int orig_pos_x = (mi_x << (SUBPEL_BITS - ssx)) + (x << SUBPEL_BITS);
+ orig_pos_x += mv.col * (1 << (1 - ssx));
+ int pos_y = sf->scale_value_y(orig_pos_y, sf);
+ int pos_x = sf->scale_value_x(orig_pos_x, sf);
+ pos_x += SCALE_EXTRA_OFF;
+ pos_y += SCALE_EXTRA_OFF;
+
+ const int top = -((AOM_INTERP_EXTEND + bh) << SCALE_SUBPEL_BITS);
+ const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ const int left = -((AOM_INTERP_EXTEND + bw) << SCALE_SUBPEL_BITS);
+ const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ pos_y = clamp(pos_y, top, bottom);
+ pos_x = clamp(pos_x, left, right);
+
+ pre = pre_buf->buf0 +
+ (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (pos_x >> SCALE_SUBPEL_BITS);
+ subpel_x = pos_x & SCALE_SUBPEL_MASK;
+ subpel_y = pos_y & SCALE_SUBPEL_MASK;
xs = sf->x_step_q4;
ys = sf->y_step_q4;
} else {
- pre = pre_buf->buf + y * pre_buf->stride + x;
- scaled_mv.row = mv_q4.row;
- scaled_mv.col = mv_q4.col;
- xs = ys = 16;
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ xs = ys = SCALE_SUBPEL_SHIFTS;
+ subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ pre = pre_buf->buf +
+ (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
+ (x + (mv_q4.col >> SUBPEL_BITS));
}
- subpel_x = scaled_mv.col & SUBPEL_MASK;
- subpel_y = scaled_mv.row & SUBPEL_MASK;
- pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride +
- (scaled_mv.col >> SUBPEL_BITS);
-
+ ConvolveParams conv_params = get_conv_params(ref, ref, plane);
#if CONFIG_EXT_INTER
+ if (is_masked_compound_type(mi->mbmi.interinter_compound_type)) {
+ // TODO(angiebird): use get_conv_params_no_round() here
+        // Masked compound does its own averaging, so disable it here.
+ conv_params = get_conv_params(ref, 0, plane);
+ }
if (ref && is_masked_compound_type(mi->mbmi.interinter_compound_type))
av1_make_masked_inter_predictor(
pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, subpel_y,
- sf, w, h, mi->mbmi.interp_filter, xs, ys,
+ sf, w, h, &conv_params, mi->mbmi.interp_filter, xs, ys,
#if CONFIG_SUPERTX
wedge_offset_x, wedge_offset_y,
#endif // CONFIG_SUPERTX
@@ -1071,7 +1249,7 @@ void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane,
#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
#if CONFIG_MOTION_VAR
mi_col_offset, mi_row_offset,
-#endif
+#endif // CONFIG_MOTION_VAR
xs, ys, xd);
}
++col;
@@ -1090,14 +1268,17 @@ void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane,
struct buf_2d *const dst_buf = &pd->dst;
uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
uint8_t *pre[2];
- MV32 scaled_mv[2];
SubpelParams subpel_params[2];
#if CONFIG_CONVOLVE_ROUND
DECLARE_ALIGNED(16, int32_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
av1_zero(tmp_dst);
#endif // CONFIG_CONVOLVE_ROUND
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+ for (ref = 0; ref < 1 + is_comp_mode_pred; ++ref) {
+#else
for (ref = 0; ref < 1 + is_compound; ++ref) {
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
#if CONFIG_INTRABC
const struct scale_factors *const sf =
is_intrabc ? &xd->sf_identity : &xd->block_refs[ref]->sf;
@@ -1120,42 +1301,70 @@ void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane,
: mi->mbmi.mv[ref].as_mv;
#endif
- // TODO(jkoleszar): This clamping is done in the incorrect place for the
- // scaling case. It needs to be done on the scaled MV, not the pre-scaling
- // MV. Note however that it performs the subsampling aware scaling so
- // that the result is always q4.
- // mv_precision precision is MV_PRECISION_Q4.
- const MV mv_q4 = clamp_mv_to_umv_border_sb(
- xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
-
const int is_scaled = av1_is_scaled(sf);
if (is_scaled) {
- pre[ref] =
- pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
- scaled_mv[ref] = av1_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+ // Note: The various inputs here have different units:
+ // * mi_x/mi_y are in units of luma pixels
+ // * mv is in units of 1/8 luma pixels
+ // * x/y are in units of pixels *in the current plane*
+ // Here we unify these into a q4-format position within the current
+ // plane, then project into the reference frame
+ int ssx = pd->subsampling_x;
+ int ssy = pd->subsampling_y;
+ int orig_pos_y = (mi_y << (SUBPEL_BITS - ssy)) + (y << SUBPEL_BITS);
+ orig_pos_y += mv.row * (1 << (1 - ssy));
+ int orig_pos_x = (mi_x << (SUBPEL_BITS - ssx)) + (x << SUBPEL_BITS);
+ orig_pos_x += mv.col * (1 << (1 - ssx));
+ int pos_y = sf->scale_value_y(orig_pos_y, sf);
+ int pos_x = sf->scale_value_x(orig_pos_x, sf);
+ pos_x += SCALE_EXTRA_OFF;
+ pos_y += SCALE_EXTRA_OFF;
+
+ // Clamp against the reference frame borders, with enough extension
+ // that we don't force the reference block to be partially onscreen.
+ const int top = -((AOM_INTERP_EXTEND + bh) << SCALE_SUBPEL_BITS);
+ const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ const int left = -((AOM_INTERP_EXTEND + bw) << SCALE_SUBPEL_BITS);
+ const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ pos_y = clamp(pos_y, top, bottom);
+ pos_x = clamp(pos_x, left, right);
+
+ pre[ref] = pre_buf->buf0 +
+ (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (pos_x >> SCALE_SUBPEL_BITS);
+ subpel_params[ref].subpel_x = pos_x & SCALE_SUBPEL_MASK;
+ subpel_params[ref].subpel_y = pos_y & SCALE_SUBPEL_MASK;
subpel_params[ref].xs = sf->x_step_q4;
subpel_params[ref].ys = sf->y_step_q4;
} else {
- pre[ref] = pre_buf->buf + (y * pre_buf->stride + x);
- scaled_mv[ref].row = mv_q4.row;
- scaled_mv[ref].col = mv_q4.col;
- subpel_params[ref].xs = 16;
- subpel_params[ref].ys = 16;
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ subpel_params[ref].subpel_x = (mv_q4.col & SUBPEL_MASK)
+ << SCALE_EXTRA_BITS;
+ subpel_params[ref].subpel_y = (mv_q4.row & SUBPEL_MASK)
+ << SCALE_EXTRA_BITS;
+ subpel_params[ref].xs = SCALE_SUBPEL_SHIFTS;
+ subpel_params[ref].ys = SCALE_SUBPEL_SHIFTS;
+ pre[ref] = pre_buf->buf +
+ (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
+ (x + (mv_q4.col >> SUBPEL_BITS));
}
-
- subpel_params[ref].subpel_x = scaled_mv[ref].col & SUBPEL_MASK;
- subpel_params[ref].subpel_y = scaled_mv[ref].row & SUBPEL_MASK;
- pre[ref] += (scaled_mv[ref].row >> SUBPEL_BITS) * pre_buf->stride +
- (scaled_mv[ref].col >> SUBPEL_BITS);
}
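A worked unit conversion for a chroma plane (ssx == ssy == 1) with mi_x == 16 luma pixels, x == 0, and mv.col == -6 (1/8 luma pel):

    /* orig_pos_x  = (16 << (4 - 1)) + (0 << 4) == 128  (q4, plane units) */
    /* orig_pos_x += -6 * (1 << (1 - 1)) -> 122, i.e. 7.625 chroma pixels */

so -0.75 luma pixels becomes -0.375 chroma pixels, as expected under 2x subsampling; sf->scale_value_x() then projects this q4 plane position into the reference frame (at the finer scaled-subpel precision, assuming a q10 output) before the SCALE_EXTRA_OFF offset and border clamps are applied.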
#if CONFIG_CONVOLVE_ROUND
ConvolveParams conv_params =
- get_conv_params_no_round(ref, plane, tmp_dst, MAX_SB_SIZE);
+ get_conv_params_no_round(ref, ref, plane, tmp_dst, MAX_SB_SIZE);
#else
- ConvolveParams conv_params = get_conv_params(ref, plane);
+ ConvolveParams conv_params = get_conv_params(ref, ref, plane);
#endif // CONFIG_CONVOLVE_ROUND
+
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+ for (ref = 0; ref < 1 + is_comp_mode_pred; ++ref) {
+#else
for (ref = 0; ref < 1 + is_compound; ++ref) {
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
#if CONFIG_INTRABC
const struct scale_factors *const sf =
is_intrabc ? &xd->sf_identity : &xd->block_refs[ref]->sf;
@@ -1174,12 +1383,23 @@ void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane,
#endif // CONFIG_WARPED_MOTION
#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
conv_params.ref = ref;
+ conv_params.do_average = ref;
#if CONFIG_EXT_INTER
+ if (is_masked_compound_type(mi->mbmi.interinter_compound_type)) {
+        // Masked compound does its own averaging, so disable it here.
+ conv_params.do_average = 0;
+#if CONFIG_CONVOLVE_ROUND && CONFIG_COMPOUND_SEGMENT && CONFIG_SUPERTX
+ // TODO(angiebird): convolve_round does not support compound_segment
+ // when supertx is on
+ conv_params = get_conv_params(ref, 0, plane);
+#endif
+ }
+
if (ref && is_masked_compound_type(mi->mbmi.interinter_compound_type))
av1_make_masked_inter_predictor(
pre[ref], pre_buf->stride, dst, dst_buf->stride,
subpel_params[ref].subpel_x, subpel_params[ref].subpel_y, sf, w, h,
- mi->mbmi.interp_filter, subpel_params[ref].xs,
+ &conv_params, mi->mbmi.interp_filter, subpel_params[ref].xs,
subpel_params[ref].ys,
#if CONFIG_SUPERTX
wedge_offset_x, wedge_offset_y,
@@ -1202,7 +1422,7 @@ void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane,
#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
#if CONFIG_MOTION_VAR
mi_col_offset, mi_row_offset,
-#endif
+#endif // CONFIG_MOTION_VAR
subpel_params[ref].xs, subpel_params[ref].ys, xd);
}
@@ -1218,76 +1438,20 @@ void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane,
xd->bd);
else
#endif // CONFIG_HIGHBITDEPTH
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
av1_convolve_rounding(tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h,
- FILTER_BITS * 2 + is_compound -
+ FILTER_BITS * 2 + is_comp_mode_pred -
conv_params.round_0 - conv_params.round_1);
+#else // !(CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF)
+ av1_convolve_rounding(tmp_dst, MAX_SB_SIZE, dst, dst_buf->stride, w, h,
+ FILTER_BITS * 2 + is_compound -
+ conv_params.round_0 - conv_params.round_1);
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
}
#endif // CONFIG_CONVOLVE_ROUND
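The '+ is_comp_mode_pred' (or '+ is_compound') term accounts for the compound accumulator holding the sum of two unrounded predictions, which needs one extra bit of rounding; e.g. with FILTER_BITS == 7 and a hypothetical round_0 == 5 / round_1 == 0 split, a single-ref block rounds by 14 - 5 == 9 bits while a compound block rounds by 15 - 5 == 10.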
}
}
-void av1_build_inter_predictor_sub8x8(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int plane, int i, int ir, int ic,
- int mi_row, int mi_col) {
- struct macroblockd_plane *const pd = &xd->plane[plane];
- MODE_INFO *const mi = xd->mi[0];
- const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
- const int width = block_size_wide[plane_bsize];
- const int height = block_size_high[plane_bsize];
- uint8_t *const dst = &pd->dst.buf[(ir * pd->dst.stride + ic) << 2];
- int ref;
- const int is_compound = has_second_ref(&mi->mbmi);
- (void)cm;
-
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
- WarpTypesAllowed warp_types;
- const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
- const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
-#if CONFIG_GLOBAL_MOTION
- int is_global[2];
- for (ref = 0; ref < 1 + is_compound; ++ref) {
- WarpedMotionParams *const wm = &xd->global_motion[mi->mbmi.ref_frame[ref]];
- is_global[ref] = is_global_mv_block(mi, i, wm->wmtype);
- }
-#endif // CONFIG_GLOBAL_MOTION
-#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
-
- for (ref = 0; ref < 1 + is_compound; ++ref) {
- ConvolveParams conv_params = get_conv_params(ref, plane);
- const uint8_t *pre =
- &pd->pre[ref].buf[(ir * pd->pre[ref].stride + ic) << 2];
-#if CONFIG_GLOBAL_MOTION
- warp_types.global_warp_allowed = is_global[ref];
-#endif // CONFIG_GLOBAL_MOTION
-#if CONFIG_WARPED_MOTION
- warp_types.local_warp_allowed = mi->mbmi.motion_mode == WARPED_CAUSAL;
-#endif // CONFIG_WARPED_MOTION
-
-#if CONFIG_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- av1_highbd_build_inter_predictor(
- pre, pd->pre[ref].stride, dst, pd->dst.stride,
- &mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height,
- ref, mi->mbmi.interp_filter,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
- &warp_types, p_col, p_row,
-#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
- plane, MV_PRECISION_Q3, mi_col * MI_SIZE + 4 * ic,
- mi_row * MI_SIZE + 4 * ir, xd);
- else
-#endif // CONFIG_HIGHBITDEPTH
- av1_build_inter_predictor(pre, pd->pre[ref].stride, dst, pd->dst.stride,
- &mi->bmi[i].as_mv[ref].as_mv,
- &xd->block_refs[ref]->sf, width, height,
- &conv_params, mi->mbmi.interp_filter,
-#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
- &warp_types, p_col, p_row, plane, ref,
-#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
- MV_PRECISION_Q3, mi_col * MI_SIZE + 4 * ic,
- mi_row * MI_SIZE + 4 * ir, xd);
- }
-}
-
static void build_inter_predictors_for_planes(const AV1_COMMON *cm,
MACROBLOCKD *xd, BLOCK_SIZE bsize,
int mi_row, int mi_col,
@@ -1386,28 +1550,11 @@ void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
#endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA
}
-// TODO(afergs): Check if ctx can be made constant
void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col, BUFFER_SET *ctx,
BLOCK_SIZE bsize) {
- build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0,
- MAX_MB_PLANE - 1);
-#if CONFIG_EXT_INTER && CONFIG_INTERINTRA
- if (is_interintra_pred(&xd->mi[0]->mbmi)) {
- BUFFER_SET default_ctx = {
- { xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
- { xd->plane[0].dst.stride, xd->plane[1].dst.stride,
- xd->plane[2].dst.stride }
- };
- if (!ctx) ctx = &default_ctx;
- av1_build_interintra_predictors(
- xd, xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
- xd->plane[0].dst.stride, xd->plane[1].dst.stride,
- xd->plane[2].dst.stride, ctx, bsize);
- }
-#else
- (void)ctx;
-#endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
+ av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
}
void av1_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
@@ -1585,19 +1732,19 @@ void av1_build_masked_inter_predictor_complex(
} while (--h_remain);
}
-void av1_build_inter_predictors_sb_sub8x8_extend(const AV1_COMMON *cm,
- MACROBLOCKD *xd,
+void av1_build_inter_predictor_sb_sub8x8_extend(const AV1_COMMON *cm,
+ MACROBLOCKD *xd,
#if CONFIG_EXT_INTER
- int mi_row_ori, int mi_col_ori,
+ int mi_row_ori, int mi_col_ori,
#endif // CONFIG_EXT_INTER
- int mi_row, int mi_col,
- BLOCK_SIZE bsize, int block) {
+ int mi_row, int mi_col,
+ int plane, BLOCK_SIZE bsize,
+ int block) {
// Prediction function used in supertx:
// Use the mv of the current block (which is smaller than 8x8) to get the
// prediction of a block located at (mi_row, mi_col) with size bsize, where
// bsize can be larger than 8x8.
// block (0-3): the sub8x8 location of the current block
- int plane;
const int mi_x = mi_col * MI_SIZE;
const int mi_y = mi_row * MI_SIZE;
#if CONFIG_EXT_INTER
@@ -1608,68 +1755,50 @@ void av1_build_inter_predictors_sb_sub8x8_extend(const AV1_COMMON *cm,
// For sub8x8 uv:
// Skip uv prediction in supertx except the first block (block = 0)
int max_plane = block ? 1 : MAX_MB_PLANE;
+ if (plane >= max_plane) return;
- for (plane = 0; plane < max_plane; plane++) {
- const BLOCK_SIZE plane_bsize =
- get_plane_block_size(bsize, &xd->plane[plane]);
- const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
- const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
- const int bw = 4 * num_4x4_w;
- const int bh = 4 * num_4x4_h;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
- build_inter_predictors(cm, xd, plane,
+ build_inter_predictors(cm, xd, plane,
#if CONFIG_MOTION_VAR
- 0, 0,
+ 0, 0,
#endif // CONFIG_MOTION_VAR
- block, bw, bh, 0, 0, bw, bh,
+ block, bw, bh, 0, 0, bw, bh,
#if CONFIG_EXT_INTER
- wedge_offset_x, wedge_offset_y,
-#endif // CONFIG_EXT_INTER
- mi_x, mi_y);
- }
-#if CONFIG_EXT_INTER
- if (is_interintra_pred(&xd->mi[0]->mbmi)) {
- BUFFER_SET ctx = { { xd->plane[0].dst.buf, xd->plane[1].dst.buf,
- xd->plane[2].dst.buf },
- { xd->plane[0].dst.stride, xd->plane[1].dst.stride,
- xd->plane[2].dst.stride } };
- av1_build_interintra_predictors(
- xd, xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
- xd->plane[0].dst.stride, xd->plane[1].dst.stride,
- xd->plane[2].dst.stride, &ctx, bsize);
- }
+ wedge_offset_x, wedge_offset_y,
#endif // CONFIG_EXT_INTER
+ mi_x, mi_y);
}
-void av1_build_inter_predictors_sb_extend(const AV1_COMMON *cm, MACROBLOCKD *xd,
+void av1_build_inter_predictor_sb_extend(const AV1_COMMON *cm, MACROBLOCKD *xd,
#if CONFIG_EXT_INTER
- int mi_row_ori, int mi_col_ori,
+ int mi_row_ori, int mi_col_ori,
#endif // CONFIG_EXT_INTER
- int mi_row, int mi_col,
- BLOCK_SIZE bsize) {
- int plane;
+ int mi_row, int mi_col, int plane,
+ BLOCK_SIZE bsize) {
const int mi_x = mi_col * MI_SIZE;
const int mi_y = mi_row * MI_SIZE;
#if CONFIG_EXT_INTER
const int wedge_offset_x = (mi_col_ori - mi_col) * MI_SIZE;
const int wedge_offset_y = (mi_row_ori - mi_row) * MI_SIZE;
#endif // CONFIG_EXT_INTER
- for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
- const BLOCK_SIZE plane_bsize =
- get_plane_block_size(bsize, &xd->plane[plane]);
- const int bw = block_size_wide[plane_bsize];
- const int bh = block_size_high[plane_bsize];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
- build_inter_predictors(cm, xd, plane,
+ build_inter_predictors(cm, xd, plane,
#if CONFIG_MOTION_VAR
- 0, 0,
+ 0, 0,
#endif // CONFIG_MOTION_VAR
- 0, bw, bh, 0, 0, bw, bh,
+ 0, bw, bh, 0, 0, bw, bh,
#if CONFIG_EXT_INTER
- wedge_offset_x, wedge_offset_y,
+ wedge_offset_x, wedge_offset_y,
#endif // CONFIG_EXT_INTER
- mi_x, mi_y);
- }
+ mi_x, mi_y);
}
#endif // CONFIG_SUPERTX
@@ -1828,7 +1957,7 @@ int skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, const struct macroblockd_plane *pd,
BLOCK_SIZE bsize_plane =
ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
-#if CONFIG_CB4X4
+#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
if (bsize_plane < BLOCK_4X4) return 1;
#endif
switch (bsize_plane) {
@@ -1863,7 +1992,8 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
// handle above row
if (xd->up_available) {
- const int overlap = num_4x4_blocks_high_lookup[bsize] * 2;
+ const int overlap =
+ AOMMIN(block_size_high[bsize] >> 1, block_size_high[BLOCK_64X64] >> 1);
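This caps the above-neighbor OBMC overlap at half of a 64x64 block; e.g. with CONFIG_EXT_PARTITION's BLOCK_128X128:

    /* overlap = AOMMIN(128 >> 1, 64 >> 1) == 32 rows, where the old */
    /* num_4x4_blocks_high_lookup[bsize] * 2 formula gave 64.        */

(The left-column handling below applies the same cap to block_size_wide.)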
const int miw = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
const int mi_row_offset = -1;
const int neighbor_limit = max_neighbor_obmc[b_width_log2_lookup[bsize]];
@@ -1885,7 +2015,9 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
#endif
const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type);
- const int mi_step = AOMMIN(xd->n8_w, mi_size_wide[a_bsize]);
+ const int above_step =
+ AOMMIN(mi_size_wide[a_bsize], mi_size_wide[BLOCK_64X64]);
+ const int mi_step = AOMMIN(xd->n8_w, above_step);
if (is_neighbor_overlappable(above_mbmi)) {
neighbor_count++;
@@ -1920,7 +2052,8 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
// handle left column
if (xd->left_available) {
- const int overlap = num_4x4_blocks_wide_lookup[bsize] * 2;
+ const int overlap =
+ AOMMIN(block_size_wide[bsize] >> 1, block_size_wide[BLOCK_64X64] >> 1);
const int mih = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
const int mi_col_offset = -1;
const int neighbor_limit = max_neighbor_obmc[b_height_log2_lookup[bsize]];
@@ -1942,7 +2075,9 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
#endif
const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type);
- const int mi_step = AOMMIN(xd->n8_h, mi_size_high[l_bsize]);
+ const int left_step =
+ AOMMIN(mi_size_high[l_bsize], mi_size_high[BLOCK_64X64]);
+ const int mi_step = AOMMIN(xd->n8_h, left_step);
if (is_neighbor_overlappable(left_mbmi)) {
neighbor_count++;
@@ -1985,6 +2120,14 @@ void modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
is_masked_compound_type(mbmi->interinter_compound_type)) {
mbmi->interinter_compound_type = COMPOUND_AVERAGE;
mbmi->ref_frame[1] = NONE_FRAME;
+#if CONFIG_COMPOUND_SINGLEREF
+ } else if (!has_second_ref(mbmi) &&
+ is_inter_singleref_comp_mode(mbmi->mode)) {
+ // mbmi->mode = compound_ref0_mode(mbmi->mode);
+ mbmi->mode = compound_ref1_mode(mbmi->mode);
+ assert(is_inter_singleref_mode(mbmi->mode));
+ mbmi->mv[0].as_int = mbmi->mv[1].as_int;
+#endif // CONFIG_COMPOUND_SINGLEREF
}
#endif // CONFIG_EXT_INTER
if (has_second_ref(mbmi)) mbmi->ref_frame[1] = NONE_FRAME;
@@ -2025,7 +2168,9 @@ void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type);
MB_MODE_INFO backup_mbmi;
- mi_step = AOMMIN(xd->n8_w, mi_size_wide[a_bsize]);
+ const int above_step =
+ AOMMIN(mi_size_wide[a_bsize], mi_size_wide[BLOCK_64X64]);
+ mi_step = AOMMIN(xd->n8_w, above_step);
if (!is_neighbor_overlappable(above_mbmi)) continue;
@@ -2041,8 +2186,16 @@ void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
tmp_height[j], tmp_stride[j], 0, i, NULL,
pd->subsampling_x, pd->subsampling_y);
}
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+ for (ref = 0; ref < 1 + (is_inter_anyref_comp_mode(above_mbmi->mode));
+ ++ref) {
+ const MV_REFERENCE_FRAME frame = has_second_ref(above_mbmi)
+ ? above_mbmi->ref_frame[ref]
+ : above_mbmi->ref_frame[0];
+#else // !(CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF)
for (ref = 0; ref < 1 + has_second_ref(above_mbmi); ++ref) {
const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
xd->block_refs[ref] = ref_buf;
@@ -2064,6 +2217,7 @@ void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
bh = AOMMAX((num_4x4_blocks_high_lookup[bsize] * 2) >> pd->subsampling_y,
4);
+ bh = AOMMIN(bh, block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
if (skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
build_inter_predictors(cm, xd, j, mi_col_offset, mi_row_offset, 0, bw, bh,
@@ -2113,8 +2267,9 @@ void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
const BLOCK_SIZE l_bsize = AOMMAX(left_mbmi->sb_type, BLOCK_8X8);
MB_MODE_INFO backup_mbmi;
-
- mi_step = AOMMIN(xd->n8_h, mi_size_high[l_bsize]);
+ const int left_step =
+ AOMMIN(mi_size_high[l_bsize], mi_size_high[BLOCK_64X64]);
+ mi_step = AOMMIN(xd->n8_h, left_step);
if (!is_neighbor_overlappable(left_mbmi)) continue;
@@ -2130,8 +2285,16 @@ void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
tmp_height[j], tmp_stride[j], i, 0, NULL,
pd->subsampling_x, pd->subsampling_y);
}
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+ for (ref = 0; ref < 1 + (is_inter_anyref_comp_mode(left_mbmi->mode));
+ ++ref) {
+ const MV_REFERENCE_FRAME frame = has_second_ref(left_mbmi)
+ ? left_mbmi->ref_frame[ref]
+ : left_mbmi->ref_frame[0];
+#else // !(CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF)
for (ref = 0; ref < 1 + has_second_ref(left_mbmi); ++ref) {
const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
xd->block_refs[ref] = ref_buf;
@@ -2152,6 +2315,7 @@ void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
const struct macroblockd_plane *pd = &xd->plane[j];
bw = AOMMAX((num_4x4_blocks_wide_lookup[bsize] * 2) >> pd->subsampling_x,
4);
+ bw = AOMMIN(bw, block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;
if (skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
@@ -2584,6 +2748,7 @@ void av1_build_ncobmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
#endif // CONFIG_HIGHBITDEPTH
const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  // TODO(zoeliu): COMPOUND_SINGLEREF does not work with NCOBMC yet.
av1_build_prediction_by_bottom_preds(cm, xd, mi_row, mi_col, dst_buf1,
dst_width1, dst_height1, dst_stride1);
av1_build_prediction_by_right_preds(cm, xd, mi_row, mi_col, dst_buf2,
@@ -2614,12 +2779,13 @@ static const int ii_weights1d[MAX_SB_SIZE] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
-static int ii_size_scales[BLOCK_SIZES] = {
-#if CONFIG_CB4X4
+static int ii_size_scales[BLOCK_SIZES_ALL] = {
+#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
32, 32, 32,
#endif
32, 16, 16, 16, 8, 8, 8, 4,
4, 4, 2, 2, 2, 1, 1, 1,
+ 16, 16, 8, 8,
};
#else
static const int ii_weights1d[MAX_SB_SIZE] = {
@@ -2628,12 +2794,13 @@ static const int ii_weights1d[MAX_SB_SIZE] = {
6, 6, 6, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 3, 2, 2,
2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
-static int ii_size_scales[BLOCK_SIZES] = {
-#if CONFIG_CB4X4
+static int ii_size_scales[BLOCK_SIZES_ALL] = {
+#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8
16, 16, 16,
#endif
16, 8, 8, 8, 4, 4, 4,
2, 2, 2, 1, 1, 1,
+ 8, 8, 4, 4,
};
/* clang-format on */
#endif // CONFIG_EXT_PARTITION
@@ -2918,23 +3085,20 @@ static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
? average_split_mvs(pd, mi, ref, block)
: mi->mbmi.mv[ref].as_mv;
- // TODO(jkoleszar): This clamping is done in the incorrect place for the
- // scaling case. It needs to be done on the scaled MV, not the pre-scaling
- // MV. Note however that it performs the subsampling aware scaling so
- // that the result is always q4.
- // mv_precision precision is MV_PRECISION_Q4.
- const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, pd->subsampling_x,
- pd->subsampling_y);
-
uint8_t *pre;
- MV32 scaled_mv;
int xs, ys, subpel_x, subpel_y;
const int is_scaled = av1_is_scaled(sf);
- ConvolveParams conv_params = get_conv_params(0, plane);
+ ConvolveParams conv_params = get_conv_params(ref, 0, plane);
#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
WarpTypesAllowed warp_types;
#if CONFIG_GLOBAL_MOTION
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+ WarpedMotionParams *const wm =
+ mi->mbmi.ref_frame[ref] > 0 ? &xd->global_motion[mi->mbmi.ref_frame[ref]]
+ : &xd->global_motion[mi->mbmi.ref_frame[0]];
+#else // !(CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF)
WarpedMotionParams *const wm = &xd->global_motion[mi->mbmi.ref_frame[ref]];
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
warp_types.global_warp_allowed = is_global_mv_block(mi, block, wm->wmtype);
#endif // CONFIG_GLOBAL_MOTION
#if CONFIG_WARPED_MOTION
@@ -2943,22 +3107,41 @@ static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
if (is_scaled) {
- pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
- scaled_mv = av1_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+ int ssx = pd->subsampling_x;
+ int ssy = pd->subsampling_y;
+ int orig_pos_y = (mi_y << (SUBPEL_BITS - ssy)) + (y << SUBPEL_BITS);
+ orig_pos_y += mv.row * (1 << (1 - ssy));
+ int orig_pos_x = (mi_x << (SUBPEL_BITS - ssx)) + (x << SUBPEL_BITS);
+ orig_pos_x += mv.col * (1 << (1 - ssx));
+ int pos_y = sf->scale_value_y(orig_pos_y, sf);
+ int pos_x = sf->scale_value_x(orig_pos_x, sf);
+ pos_x += SCALE_EXTRA_OFF;
+ pos_y += SCALE_EXTRA_OFF;
+
+ const int top = -((AOM_INTERP_EXTEND + bh) << SCALE_SUBPEL_BITS);
+ const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ const int left = -((AOM_INTERP_EXTEND + bw) << SCALE_SUBPEL_BITS);
+ const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+ pos_y = clamp(pos_y, top, bottom);
+ pos_x = clamp(pos_x, left, right);
+
+ pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (pos_x >> SCALE_SUBPEL_BITS);
+ subpel_x = pos_x & SCALE_SUBPEL_MASK;
+ subpel_y = pos_y & SCALE_SUBPEL_MASK;
xs = sf->x_step_q4;
ys = sf->y_step_q4;
} else {
- pre = pre_buf->buf + (y * pre_buf->stride + x);
- scaled_mv.row = mv_q4.row;
- scaled_mv.col = mv_q4.col;
- xs = ys = 16;
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ xs = ys = SCALE_SUBPEL_SHIFTS;
+ subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
+ (x + (mv_q4.col >> SUBPEL_BITS));
}
- subpel_x = scaled_mv.col & SUBPEL_MASK;
- subpel_y = scaled_mv.row & SUBPEL_MASK;
- pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride +
- (scaled_mv.col >> SUBPEL_BITS);
-
av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride, subpel_x,
subpel_y, sf, w, h, &conv_params,
mi->mbmi.interp_filter,
@@ -2989,6 +3172,10 @@ void av1_build_inter_predictors_for_planes_single_buf(
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
assert(bsize == BLOCK_8X8);
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
+ assert(has_second_ref(&xd->mi[0]->mbmi) ||
+ !is_inter_singleref_comp_mode(xd->mi[0]->mbmi.mode));
+#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
build_inter_predictors_single_buf(
@@ -3026,7 +3213,12 @@ static void build_wedge_inter_predictor_from_buf(
mbmi->interinter_compound_type
};
+#if CONFIG_COMPOUND_SINGLEREF
+ if ((is_compound || is_inter_singleref_comp_mode(mbmi->mode)) &&
+ is_masked_compound_type(mbmi->interinter_compound_type)) {
+#else // !CONFIG_COMPOUND_SINGLEREF
if (is_compound && is_masked_compound_type(mbmi->interinter_compound_type)) {
+#endif // CONFIG_COMPOUND_SINGLEREF
#if CONFIG_COMPOUND_SEGMENT
if (!plane && comp_data.interinter_compound_type == COMPOUND_SEG) {
#if CONFIG_HIGHBITDEPTH
@@ -3057,7 +3249,7 @@ static void build_wedge_inter_predictor_from_buf(
dst, dst_buf->stride, ext_dst0, ext_dst_stride0, ext_dst1,
ext_dst_stride1, &comp_data, mbmi->sb_type, wedge_offset_x,
wedge_offset_y, h, w);
-#else
+#else // !CONFIG_SUPERTX
#if CONFIG_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
build_masked_compound_highbd(