Diffstat (limited to 'third_party/aom/av1/encoder/rdopt.c')
-rw-r--r-- | third_party/aom/av1/encoder/rdopt.c | 3417
1 file changed, 2505 insertions, 912 deletions
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c index 2a537a06a..43b00b83b 100644 --- a/third_party/aom/av1/encoder/rdopt.c +++ b/third_party/aom/av1/encoder/rdopt.c @@ -63,7 +63,7 @@ #endif // CONFIG_PVQ #if CONFIG_PVQ || CONFIG_DAALA_DIST #include "av1/common/pvq.h" -#endif // CONFIG_PVQ || CONFIG_DAALA_DIST +#endif // CONFIG_PVQ || CONFIG_DIST_8X8 #if CONFIG_DUAL_FILTER #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) #if USE_EXTRA_FILTER @@ -113,8 +113,14 @@ static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = { #endif // CONFIG_EXT_REFS #if CONFIG_EXT_REFS +#if CONFIG_EXT_COMP_REFS +#define SECOND_REF_FRAME_MASK \ + ((1 << ALTREF_FRAME) | (1 << BWDREF_FRAME) | (1 << GOLDEN_FRAME) | \ + (1 << LAST2_FRAME) | 0x01) // NOLINT +#else // !CONFIG_EXT_COMP_REFS #define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | (1 << BWDREF_FRAME) | 0x01) -#else +#endif // CONFIG_EXT_COMP_REFS +#else // !CONFIG_EXT_REFS #define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01) #endif // CONFIG_EXT_REFS @@ -126,6 +132,11 @@ static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = { #define FILTER_FAST_SEARCH 1 #endif // CONFIG_EXT_INTRA +// Setting this to 1 will disable trellis optimization within the +// transform search. Trellis optimization will still be applied +// in the final encode. +#define DISABLE_TRELLISQ_SEARCH 0 + const double ADST_FLIP_SVM[8] = { -6.6623, -2.8062, -3.2531, 3.1671, // vert -7.7051, -3.2234, -3.6193, 3.4533 }; // horz @@ -191,6 +202,56 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { // TODO(zoeliu): May need to reconsider the order on the modes to check #if CONFIG_EXT_INTER + +#if CONFIG_COMPOUND_SINGLEREF + // Single ref comp mode + { SR_NEAREST_NEARMV, { LAST_FRAME, NONE_FRAME } }, +#if CONFIG_EXT_REFS + { SR_NEAREST_NEARMV, { LAST2_FRAME, NONE_FRAME } }, + { SR_NEAREST_NEARMV, { LAST3_FRAME, NONE_FRAME } }, + { SR_NEAREST_NEARMV, { BWDREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_EXT_REFS + { SR_NEAREST_NEARMV, { GOLDEN_FRAME, NONE_FRAME } }, + { SR_NEAREST_NEARMV, { ALTREF_FRAME, NONE_FRAME } }, + + /* + { SR_NEAREST_NEWMV, { LAST_FRAME, NONE_FRAME } }, +#if CONFIG_EXT_REFS + { SR_NEAREST_NEWMV, { LAST2_FRAME, NONE_FRAME } }, + { SR_NEAREST_NEWMV, { LAST3_FRAME, NONE_FRAME } }, + { SR_NEAREST_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_EXT_REFS + { SR_NEAREST_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, + { SR_NEAREST_NEWMV, { ALTREF_FRAME, NONE_FRAME } },*/ + + { SR_NEAR_NEWMV, { LAST_FRAME, NONE_FRAME } }, +#if CONFIG_EXT_REFS + { SR_NEAR_NEWMV, { LAST2_FRAME, NONE_FRAME } }, + { SR_NEAR_NEWMV, { LAST3_FRAME, NONE_FRAME } }, + { SR_NEAR_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_EXT_REFS + { SR_NEAR_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, + { SR_NEAR_NEWMV, { ALTREF_FRAME, NONE_FRAME } }, + + { SR_ZERO_NEWMV, { LAST_FRAME, NONE_FRAME } }, +#if CONFIG_EXT_REFS + { SR_ZERO_NEWMV, { LAST2_FRAME, NONE_FRAME } }, + { SR_ZERO_NEWMV, { LAST3_FRAME, NONE_FRAME } }, + { SR_ZERO_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_EXT_REFS + { SR_ZERO_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, + { SR_ZERO_NEWMV, { ALTREF_FRAME, NONE_FRAME } }, + + { SR_NEW_NEWMV, { LAST_FRAME, NONE_FRAME } }, +#if CONFIG_EXT_REFS + { SR_NEW_NEWMV, { LAST2_FRAME, NONE_FRAME } }, + { SR_NEW_NEWMV, { LAST3_FRAME, NONE_FRAME } }, + { SR_NEW_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_EXT_REFS + { SR_NEW_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, + { SR_NEW_NEWMV, { ALTREF_FRAME, NONE_FRAME } }, 
+#endif // CONFIG_COMPOUND_SINGLEREF + { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, #if CONFIG_EXT_REFS { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, @@ -202,6 +263,13 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + +#if CONFIG_EXT_COMP_REFS + { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, + { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, + { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS #else // CONFIG_EXT_INTER @@ -217,6 +285,13 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + +#if CONFIG_EXT_COMP_REFS + { NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, + { NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, + { NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS #endif // CONFIG_EXT_INTER @@ -297,9 +372,43 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { ZERO_ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + +#if CONFIG_EXT_COMP_REFS + { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { ZERO_ZEROMV, { LAST_FRAME, LAST2_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { ZERO_ZEROMV, { LAST_FRAME, LAST3_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { ZERO_ZEROMV, { LAST_FRAME, GOLDEN_FRAME } }, + + { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { ZERO_ZEROMV, { BWDREF_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS -#else // CONFIG_EXT_INTER +#else // !CONFIG_EXT_INTER { NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEWMV, { LAST_FRAME, ALTREF_FRAME } }, @@ -321,6 +430,17 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + +#if CONFIG_EXT_COMP_REFS + { NEARMV, { LAST_FRAME, LAST2_FRAME } }, + { NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { NEARMV, { LAST_FRAME, LAST3_FRAME } }, + { NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEWMV, { 
LAST_FRAME, GOLDEN_FRAME } }, + { NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS { ZEROMV, { LAST_FRAME, ALTREF_FRAME } }, @@ -335,6 +455,13 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { ZEROMV, { LAST2_FRAME, BWDREF_FRAME } }, { ZEROMV, { LAST3_FRAME, BWDREF_FRAME } }, { ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + +#if CONFIG_EXT_COMP_REFS + { ZEROMV, { LAST_FRAME, LAST2_FRAME } }, + { ZEROMV, { LAST_FRAME, LAST3_FRAME } }, + { ZEROMV, { LAST_FRAME, GOLDEN_FRAME } }, + { ZEROMV, { BWDREF_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS #endif // CONFIG_EXT_INTER @@ -385,6 +512,35 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { #endif // CONFIG_EXT_INTER }; +static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = { + DC_PRED, H_PRED, V_PRED, +#if CONFIG_ALT_INTRA + SMOOTH_PRED, +#endif // CONFIG_ALT_INTRA + TM_PRED, +#if CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV + SMOOTH_V_PRED, SMOOTH_H_PRED, +#endif // CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV + D135_PRED, D207_PRED, D153_PRED, D63_PRED, D117_PRED, D45_PRED, +}; + +#if CONFIG_CFL +static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = { + UV_DC_PRED, UV_H_PRED, UV_V_PRED, +#if CONFIG_ALT_INTRA + UV_SMOOTH_PRED, +#endif // CONFIG_ALT_INTRA + UV_TM_PRED, +#if CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV + UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED, +#endif // CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV + UV_D135_PRED, UV_D207_PRED, UV_D153_PRED, + UV_D63_PRED, UV_D117_PRED, UV_D45_PRED, +}; +#else +#define uv_rd_search_mode_order intra_rd_search_mode_order +#endif // CONFIG_CFL + #if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE static INLINE int write_uniform_cost(int n, int v) { const int l = get_unsigned_bits(n); @@ -404,7 +560,7 @@ static INLINE int write_uniform_cost(int n, int v) { #define FAST_EXT_TX_EDST_MARGIN 0.3 #if CONFIG_DAALA_DIST -static int od_compute_var_4x4(od_coeff *x, int stride) { +static int od_compute_var_4x4(uint16_t *x, int stride) { int sum; int s2; int i; @@ -420,7 +576,7 @@ static int od_compute_var_4x4(od_coeff *x, int stride) { s2 += t * t; } } - // TODO(yushin) : Check wheter any changes are required for high bit depth. 
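For reference, the od_compute_var_4x4() helper that this hunk retypes from od_coeff to uint16_t reduces to a plain sum / sum-of-squares pass over the 16 pixels (its return expression appears at the start of the next hunk). A minimal standalone sketch, with only the names shown in the hunk taken from the patch:

#include <stdint.h>

/* Sketch of the 4x4 variance computed by od_compute_var_4x4(): accumulate the
 * sum and the sum of squares, then return the variance with the same
 * fixed-point shifts used in the patch ((s2 - sum*sum/16) / 16). */
static int var_4x4_sketch(const uint16_t *x, int stride) {
  int sum = 0, s2 = 0;
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      const int t = x[i * stride + j];
      sum += t;
      s2 += t * t;
    }
  }
  return (s2 - (sum * sum >> 4)) >> 4;
}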
+ return (s2 - (sum * sum >> 4)) >> 4; } @@ -431,8 +587,8 @@ static int od_compute_var_4x4(od_coeff *x, int stride) { #define OD_DIST_LP_MID (5) #define OD_DIST_LP_NORM (OD_DIST_LP_MID + 2) -static double od_compute_dist_8x8(int qm, int use_activity_masking, od_coeff *x, - od_coeff *y, od_coeff *e_lp, int stride) { +static double od_compute_dist_8x8(int use_activity_masking, uint16_t *x, + uint16_t *y, od_coeff *e_lp, int stride) { double sum; int min_var; double mean_var; @@ -444,8 +600,7 @@ static double od_compute_dist_8x8(int qm, int use_activity_masking, od_coeff *x, double vardist; vardist = 0; - OD_ASSERT(qm != OD_FLAT_QM); - (void)qm; + #if 1 min_var = INT_MAX; mean_var = 0; @@ -490,22 +645,61 @@ static double od_compute_dist_8x8(int qm, int use_activity_masking, od_coeff *x, } // Note : Inputs x and y are in a pixel domain -static double od_compute_dist(int qm, int activity_masking, od_coeff *x, - od_coeff *y, int bsize_w, int bsize_h, - int qindex) { +static double od_compute_dist_common(int activity_masking, uint16_t *x, + uint16_t *y, int bsize_w, int bsize_h, + int qindex, od_coeff *tmp, + od_coeff *e_lp) { + int i, j; + double sum = 0; + const int mid = OD_DIST_LP_MID; + + for (j = 0; j < bsize_w; j++) { + e_lp[j] = mid * tmp[j] + 2 * tmp[bsize_w + j]; + e_lp[(bsize_h - 1) * bsize_w + j] = mid * tmp[(bsize_h - 1) * bsize_w + j] + + 2 * tmp[(bsize_h - 2) * bsize_w + j]; + } + for (i = 1; i < bsize_h - 1; i++) { + for (j = 0; j < bsize_w; j++) { + e_lp[i * bsize_w + j] = mid * tmp[i * bsize_w + j] + + tmp[(i - 1) * bsize_w + j] + + tmp[(i + 1) * bsize_w + j]; + } + } + for (i = 0; i < bsize_h; i += 8) { + for (j = 0; j < bsize_w; j += 8) { + sum += od_compute_dist_8x8(activity_masking, &x[i * bsize_w + j], + &y[i * bsize_w + j], &e_lp[i * bsize_w + j], + bsize_w); + } + } + /* Scale according to linear regression against SSE, for 8x8 blocks. */ + if (activity_masking) { + sum *= 2.2 + (1.7 - 2.2) * (qindex - 99) / (210 - 99) + + (qindex < 99 ? 2.5 * (qindex - 99) / 99 * (qindex - 99) / 99 : 0); + } else { + sum *= qindex >= 128 + ? 1.4 + (0.9 - 1.4) * (qindex - 128) / (209 - 128) + : qindex <= 43 ? 
1.5 + (2.0 - 1.5) * (qindex - 43) / (16 - 43) + : 1.5 + (1.4 - 1.5) * (qindex - 43) / (128 - 43); + } + + return sum; +} + +static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w, + int bsize_h, int qindex) { int i; double sum; sum = 0; assert(bsize_w >= 8 && bsize_h >= 8); - if (qm == OD_FLAT_QM) { - for (i = 0; i < bsize_w * bsize_h; i++) { - double tmp; - tmp = x[i] - y[i]; - sum += tmp * tmp; - } - } else { +#if CONFIG_PVQ + int activity_masking = 1; +#else + int activity_masking = 0; +#endif + { int j; DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]); DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]); @@ -525,63 +719,242 @@ static double od_compute_dist(int qm, int activity_masking, od_coeff *x, e[i * bsize_w + j - 1] + e[i * bsize_w + j + 1]; } } - for (j = 0; j < bsize_w; j++) { - e_lp[j] = mid * tmp[j] + 2 * tmp[bsize_w + j]; - e_lp[(bsize_h - 1) * bsize_w + j] = - mid * tmp[(bsize_h - 1) * bsize_w + j] + - 2 * tmp[(bsize_h - 2) * bsize_w + j]; - } - for (i = 1; i < bsize_h - 1; i++) { + sum = od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h, + qindex, tmp, e_lp); + } + return sum; +} + +static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w, + int bsize_h, int qindex) { + int i; + double sum; + sum = 0; + + assert(bsize_w >= 8 && bsize_h >= 8); + +#if CONFIG_PVQ + int activity_masking = 1; +#else + int activity_masking = 0; +#endif + { + int j; + DECLARE_ALIGNED(16, uint16_t, y[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]); + int mid = OD_DIST_LP_MID; + for (i = 0; i < bsize_h; i++) { for (j = 0; j < bsize_w; j++) { - e_lp[i * bsize_w + j] = mid * tmp[i * bsize_w + j] + - tmp[(i - 1) * bsize_w + j] + - tmp[(i + 1) * bsize_w + j]; + y[i * bsize_w + j] = x[i * bsize_w + j] - e[i * bsize_w + j]; } } - for (i = 0; i < bsize_h; i += 8) { - for (j = 0; j < bsize_w; j += 8) { - sum += od_compute_dist_8x8(qm, activity_masking, &x[i * bsize_w + j], - &y[i * bsize_w + j], &e_lp[i * bsize_w + j], - bsize_w); + for (i = 0; i < bsize_h; i++) { + tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1]; + tmp[i * bsize_w + bsize_w - 1] = + mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2]; + for (j = 1; j < bsize_w - 1; j++) { + tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + + e[i * bsize_w + j - 1] + e[i * bsize_w + j + 1]; } } - /* Scale according to linear regression against SSE, for 8x8 blocks. */ - if (activity_masking) { - sum *= 2.2 + (1.7 - 2.2) * (qindex - 99) / (210 - 99) + - (qindex < 99 ? 2.5 * (qindex - 99) / 99 * (qindex - 99) / 99 : 0); - } else { - sum *= qindex >= 128 - ? 1.4 + (0.9 - 1.4) * (qindex - 128) / (209 - 128) - : qindex <= 43 - ? 
1.5 + (2.0 - 1.5) * (qindex - 43) / (16 - 43) - : 1.5 + (1.4 - 1.5) * (qindex - 43) / (128 - 43); - } + sum = od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h, + qindex, tmp, e_lp); } return sum; } +#endif // CONFIG_DAALA_DIST + +#if CONFIG_DIST_8X8 +#define NEW_FUTURE_DIST 0 +int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCKD *xd, + const uint8_t *src, int src_stride, const uint8_t *dst, + int dst_stride, const BLOCK_SIZE tx_bsize, int bsw, + int bsh, int visible_w, int visible_h, int qindex) { + int64_t d = 0; -int64_t av1_daala_dist(const uint8_t *src, int src_stride, const uint8_t *dst, - int dst_stride, int bsw, int bsh, int qm, - int use_activity_masking, int qindex) { +#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST int i, j; - int64_t d; - DECLARE_ALIGNED(16, od_coeff, orig[MAX_TX_SQUARE]); - DECLARE_ALIGNED(16, od_coeff, rec[MAX_TX_SQUARE]); - assert(qm == OD_HVS_QM); + DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, uint16_t, rec[MAX_TX_SQUARE]); + (void)cpi; + (void)tx_bsize; +#endif // CONFIG_DAALA_DIST || NEW_FUTURE_DIST + +#if !CONFIG_HIGHBITDEPTH + (void)xd; +#endif + +#if !CONFIG_DAALA_DIST + (void)qindex; +#endif + +#if !CONFIG_DAALA_DIST || !NEW_FUTURE_DIST + (void)xd; + (void)bsw, (void)bsh; + (void)visible_w, (void)visible_h; +#endif + +#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) + orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; + + if ((bsw == visible_w) && (bsh == visible_h)) { + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) + rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i]; + } else { + for (j = 0; j < visible_h; j++) + for (i = 0; i < visible_w; i++) + rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i]; + + if (visible_w < bsw) { + for (j = 0; j < bsh; j++) + for (i = visible_w; i < bsw; i++) + rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; + } + + if (visible_h < bsh) { + for (j = visible_h; j < bsh; j++) + for (i = 0; i < bsw; i++) + rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; + } + } + } else { +#endif + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i]; + + if ((bsw == visible_w) && (bsh == visible_h)) { + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i]; + } else { + for (j = 0; j < visible_h; j++) + for (i = 0; i < visible_w; i++) + rec[j * bsw + i] = dst[j * dst_stride + i]; - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i]; + if (visible_w < bsw) { + for (j = 0; j < bsh; j++) + for (i = visible_w; i < bsw; i++) + rec[j * bsw + i] = src[j * src_stride + i]; + } - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i]; + if (visible_h < bsh) { + for (j = visible_h; j < bsh; j++) + for (i = 0; i < bsw; i++) rec[j * bsw + i] = src[j * src_stride + i]; + } + } +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_DAALA_DIST || NEW_FUTURE_DIST + +#if CONFIG_DAALA_DIST + d = (int64_t)od_compute_dist(orig, rec, bsw, bsh, qindex); +#elif NEW_FUTURE_DIST + // Call new 8x8-wise distortion function here, for example + for (i = 0; i < bsh; i += 8) { + for (j = 0; j < bsw; j += 8) { + d += + av1_compute_dist_8x8(&orig[i * bsw + j], &rec[i * bsw + j], bsw, bsh); + } + } +#else + // Otherwise, MSE 
by default + unsigned sse; + // TODO(Any): Use even faster function which does not calculate variance + cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + d = sse; +#endif // CONFIG_DAALA_DIST - d = (int64_t)od_compute_dist(qm, use_activity_masking, orig, rec, bsw, bsh, - qindex); return d; } + +static int64_t av1_dist_8x8_diff(const MACROBLOCKD *xd, const uint8_t *src, + int src_stride, const int16_t *diff, + int diff_stride, int bsw, int bsh, + int visible_w, int visible_h, int qindex) { + int64_t d = 0; + +#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST + int i, j; + + DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, int16_t, diff16[MAX_TX_SQUARE]); +#endif // CONFIG_DAALA_DIST || NEW_FUTURE_DIST + +#if !CONFIG_HIGHBITDEPTH + (void)xd; +#endif + +#if !CONFIG_DAALA_DIST + (void)qindex; +#endif + +#if !CONFIG_DAALA_DIST || !NEW_FUTURE_DIST + (void)xd; + (void)src, (void)src_stride; + (void)bsw, (void)bsh; + (void)visible_w, (void)visible_h; +#endif + +#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) + orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; + } else { +#endif + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i]; +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + + if ((bsw == visible_w) && (bsh == visible_h)) { + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) diff16[j * bsw + i] = diff[j * diff_stride + i]; + } else { + for (j = 0; j < visible_h; j++) + for (i = 0; i < visible_w; i++) + diff16[j * bsw + i] = diff[j * diff_stride + i]; + + if (visible_w < bsw) { + for (j = 0; j < bsh; j++) + for (i = visible_w; i < bsw; i++) diff16[j * bsw + i] = 0; + } + + if (visible_h < bsh) { + for (j = visible_h; j < bsh; j++) + for (i = 0; i < bsw; i++) diff16[j * bsw + i] = 0; + } + } +#endif // CONFIG_DAALA_DIST || NEW_FUTURE_DIST + +#if CONFIG_DAALA_DIST + d = (int64_t)od_compute_dist_diff(orig, diff16, bsw, bsh, qindex); +#elif NEW_FUTURE_DIST + // Call new 8x8-wise distortion function (with diff inpu) here, for example + for (i = 0; i < bsh; i += 8) { + for (j = 0; j < bsw; j += 8) { + d += av1_compute_dist_8x8_diff(&orig[i * bsw + j], &diff16[i * bsw + j], + bsw, bsh); + } + } +#else + // Otherwise, MSE by default + d = aom_sum_squares_2d_i16(diff, diff_stride, bsw, bsh); #endif // CONFIG_DAALA_DIST + return d; +} +#endif // CONFIG_DIST_8X8 + static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize, const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, @@ -892,11 +1265,11 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, for (plane = plane_from; plane <= plane_to; ++plane) { struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; -#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 +#if CONFIG_CHROMA_SUB8X8 const BLOCK_SIZE bs = AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); #else const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); -#endif // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 +#endif // CONFIG_CHROMA_SUB8X8 unsigned int sse; int rate; @@ -1068,7 +1441,7 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, #if !CONFIG_VAR_TX && !CONFIG_SUPERTX // Check for consistency of tx_size with mode info - assert(tx_size == get_tx_size(plane, xd)); + assert(tx_size == av1_get_tx_size(plane, xd)); #endif // 
!CONFIG_VAR_TX && !CONFIG_SUPERTX (void)cm; @@ -1144,10 +1517,12 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, #endif // !CONFIG_LV_MAP int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, - int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order, - const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l, - int use_fast_coef_costing) { + int blk_row, int blk_col, int block, TX_SIZE tx_size, + const SCAN_ORDER *scan_order, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l, int use_fast_coef_costing) { #if !CONFIG_LV_MAP + (void)blk_row; + (void)blk_col; const AV1_COMMON *const cm = &cpi->common; return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l, use_fast_coef_costing); @@ -1158,13 +1533,11 @@ int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; const struct macroblockd_plane *pd = &xd->plane[plane]; const BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_CB4X4 -#if CONFIG_CHROMA_2X2 - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#else +#if CONFIG_CHROMA_SUB8X8 const BLOCK_SIZE plane_bsize = AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#endif // CONFIG_CHROMA_2X2 +#elif CONFIG_CB4X4 + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); #else // CONFIG_CB4X4 const BLOCK_SIZE plane_bsize = get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd); @@ -1172,7 +1545,8 @@ int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); - return av1_cost_coeffs_txb(cpi, x, plane, block, &txb_ctx); + return av1_cost_coeffs_txb(cpi, x, plane, blk_row, blk_col, block, tx_size, + &txb_ctx); #endif // !CONFIG_LV_MAP } #endif // !CONFIG_PVQ || CONFIG_VAR_TX @@ -1182,9 +1556,9 @@ static void get_txb_dimensions(const MACROBLOCKD *xd, int plane, BLOCK_SIZE plane_bsize, int blk_row, int blk_col, BLOCK_SIZE tx_bsize, int *width, int *height, int *visible_width, int *visible_height) { -#if !(CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT) +#if !(CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)) assert(tx_bsize <= plane_bsize); -#endif // !(CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT) +#endif int txb_height = block_size_high[tx_bsize]; int txb_width = block_size_wide[tx_bsize]; const int block_height = block_size_high[plane_bsize]; @@ -1208,19 +1582,31 @@ static void get_txb_dimensions(const MACROBLOCKD *xd, int plane, clamp(block_rows - (blk_row << tx_unit_size), 0, txb_height); } -// Compute the pixel domain sum square error on all visible 4x4s in the +// Compute the pixel domain distortion from src and dst on all visible 4x4s in +// the // transform block. 
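A small worked example of the visible-size clamp in get_txb_dimensions() above, assuming blk_row is counted in 4x4 units so tx_unit_size is 2 (the numbers are illustrative only):

/* block_rows = 16, txb_height = 16, blk_row = 2 (i.e. 8 pixels down):
 *   visible_rows = clamp(16 - (2 << 2), 0, 16) = 8
 * so the "visible 4x4s" distortion helpers that follow only traverse the 8
 * rows of the transform block that actually lie inside the plane block. */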
-static unsigned pixel_sse(const AV1_COMP *const cpi, const MACROBLOCKD *xd, - int plane, const uint8_t *src, const int src_stride, - const uint8_t *dst, const int dst_stride, int blk_row, - int blk_col, const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize) { +static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, + int plane, const uint8_t *src, const int src_stride, + const uint8_t *dst, const int dst_stride, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { int txb_rows, txb_cols, visible_rows, visible_cols; + const MACROBLOCKD *xd = &x->e_mbd; + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, &txb_cols, &txb_rows, &visible_cols, &visible_rows); assert(visible_rows > 0); assert(visible_cols > 0); -#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT + +#if CONFIG_DIST_8X8 + if (plane == 0 && txb_cols >= 8 && txb_rows >= 8) + return av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride, tx_bsize, + txb_cols, txb_rows, visible_cols, visible_rows, + x->qindex); +#endif // CONFIG_DIST_8X8 + +#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) if ((txb_rows == visible_rows && txb_cols == visible_cols) && tx_bsize < BLOCK_SIZES) { #else @@ -1242,36 +1628,86 @@ static unsigned pixel_sse(const AV1_COMP *const cpi, const MACROBLOCKD *xd, return sse; } -// Compute the squares sum squares on all visible 4x4s in the transform block. -static int64_t sum_squares_visible(const MACROBLOCKD *xd, int plane, - const int16_t *diff, const int diff_stride, - int blk_row, int blk_col, - const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize) { +// Compute the pixel domain distortion from diff on all visible 4x4s in the +// transform block. +static int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, + const int16_t *diff, const int diff_stride, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { int visible_rows, visible_cols; + const MACROBLOCKD *xd = &x->e_mbd; +#if CONFIG_DIST_8X8 + int txb_height = block_size_high[tx_bsize]; + int txb_width = block_size_wide[tx_bsize]; + const int src_stride = x->plane[plane].src.stride; + const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0]; + const uint8_t *src = &x->plane[plane].src.buf[src_idx]; +#endif + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, NULL, &visible_cols, &visible_rows); - return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows); + +#if CONFIG_DIST_8X8 + if (plane == 0 && txb_width >= 8 && txb_height >= 8) + return av1_dist_8x8_diff(xd, src, src_stride, diff, diff_stride, txb_width, + txb_height, visible_cols, visible_rows, x->qindex); + else +#endif + return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, + visible_rows); +} + +#if CONFIG_PALETTE || CONFIG_INTRABC +int av1_count_colors(const uint8_t *src, int stride, int rows, int cols) { + int val_count[256]; + memset(val_count, 0, sizeof(val_count)); + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + ++val_count[src[r * stride + c]]; + } + } + int n = 0; + for (int i = 0; i < 256; ++i) { + if (val_count[i]) ++n; + } + return n; } +#if CONFIG_HIGHBITDEPTH +int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, + int bit_depth) { + assert(bit_depth <= 12); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + int val_count[1 << 12]; + memset(val_count, 0, (1 << 12) * sizeof(val_count[0])); + for (int r = 0; r < rows; ++r) { + for 
(int c = 0; c < cols; ++c) { + ++val_count[src[r * stride + c]]; + } + } + int n = 0; + for (int i = 0; i < (1 << bit_depth); ++i) { + if (val_count[i]) ++n; + } + return n; +} +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_PALETTE || CONFIG_INTRABC + void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse, OUTPUT_STATUS output_status) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; -#if CONFIG_DAALA_DIST - int qm = OD_HVS_QM; - int use_activity_masking = 0; -#if CONFIG_PVQ - use_activity_masking = x->daala_enc.use_activity_masking; -#endif // CONFIG_PVQ +#if CONFIG_DIST_8X8 struct macroblockd_plane *const pd = &xd->plane[plane]; -#else // CONFIG_DAALA_DIST +#else // CONFIG_DIST_8X8 const struct macroblockd_plane *const pd = &xd->plane[plane]; -#endif // CONFIG_DAALA_DIST +#endif // CONFIG_DIST_8X8 - if (cpi->sf.use_transform_domain_distortion && !CONFIG_DAALA_DIST) { + if (cpi->sf.use_transform_domain_distortion && !CONFIG_DIST_8X8) { // Transform domain distortion computation is more efficient as it does // not involve an inverse transform, but it is less accurate. const int buffer_length = tx_size_2d[tx_size]; @@ -1292,19 +1728,21 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, &this_sse) >> shift; #endif // CONFIG_HIGHBITDEPTH -#elif CONFIG_HIGHBITDEPTH - const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8; - *out_dist = - av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, bd) >> - shift; -#else - *out_dist = - av1_block_error(coeff, dqcoeff, buffer_length, &this_sse) >> shift; +#else // !CONFIG_PVQ +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, + &this_sse, xd->bd) >> + shift; + else +#endif + *out_dist = + av1_block_error(coeff, dqcoeff, buffer_length, &this_sse) >> shift; #endif // CONFIG_PVQ *out_sse = this_sse >> shift; } else { const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; -#if !CONFIG_PVQ || CONFIG_DAALA_DIST +#if !CONFIG_PVQ || CONFIG_DIST_8X8 const int bsw = block_size_wide[tx_bsize]; const int bsh = block_size_high[tx_bsize]; #endif @@ -1323,34 +1761,13 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, assert(cpi != NULL); assert(tx_size_wide_log2[0] == tx_size_high_log2[0]); -#if CONFIG_DAALA_DIST - if (plane == 0 && bsw >= 8 && bsh >= 8) { - if (output_status == OUTPUT_HAS_DECODED_PIXELS) { - const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - const int16_t *pred = &pd->pred[pred_idx]; - int i, j; - DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]); - - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) - pred8[j * bsw + i] = pred[j * pred_stride + i]; - *out_sse = av1_daala_dist(src, src_stride, pred8, bsw, bsw, bsh, qm, - use_activity_masking, x->qindex); - } else { - *out_sse = av1_daala_dist(src, src_stride, dst, dst_stride, bsw, bsh, - qm, use_activity_masking, x->qindex); - } - } else -#endif // CONFIG_DAALA_DIST { const int diff_stride = block_size_wide[plane_bsize]; const int diff_idx = (blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]; const int16_t *diff = &p->src_diff[diff_idx]; - *out_sse = sum_squares_visible(xd, plane, diff, diff_stride, blk_row, - blk_col, plane_bsize, tx_bsize); + *out_sse = 
pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col, + plane_bsize, tx_bsize); #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) *out_sse = ROUND_POWER_OF_TWO(*out_sse, (xd->bd - 8) * 2); @@ -1360,15 +1777,8 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, if (eob) { if (output_status == OUTPUT_HAS_DECODED_PIXELS) { -#if CONFIG_DAALA_DIST - if (plane == 0 && bsw >= 8 && bsh >= 8) - *out_dist = av1_daala_dist(src, src_stride, dst, dst_stride, bsw, bsh, - qm, use_activity_masking, x->qindex); - else -#endif // CONFIG_DAALA_DIST - *out_dist = - pixel_sse(cpi, xd, plane, src, src_stride, dst, dst_stride, - blk_row, blk_col, plane_bsize, tx_bsize); + *out_dist = pixel_dist(cpi, x, plane, src, src_stride, dst, dst_stride, + blk_row, blk_col, plane_bsize, tx_bsize); } else { #if CONFIG_HIGHBITDEPTH uint8_t *recon; @@ -1399,37 +1809,44 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, #endif // !CONFIG_PVQ const PLANE_TYPE plane_type = get_plane_type(plane); - TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); - - av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, recon, - MAX_TX_SIZE, eob); - -#if CONFIG_DAALA_DIST - if (plane == 0 && bsw >= 8 && bsh >= 8) { - *out_dist = av1_daala_dist(src, src_stride, recon, MAX_TX_SIZE, bsw, - bsh, qm, use_activity_masking, x->qindex); - } else { - if (plane == 0) { - // Save decoded pixels for inter block in pd->pred to avoid - // block_8x8_rd_txfm_daala_dist() need to produce them - // by calling av1_inverse_transform_block() again. - const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - int16_t *pred = &pd->pred[pred_idx]; - int i, j; + TX_TYPE tx_type = + av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); + av1_inverse_transform_block(xd, dqcoeff, +#if CONFIG_LGT + xd->mi[0]->mbmi.mode, +#endif + tx_type, tx_size, recon, MAX_TX_SIZE, eob); + +#if CONFIG_DIST_8X8 + if (plane == 0 && (bsw < 8 || bsh < 8)) { + // Save decoded pixels for inter block in pd->pred to avoid + // block_8x8_rd_txfm_daala_dist() need to produce them + // by calling av1_inverse_transform_block() again. 
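The ROUND_POWER_OF_TWO() step above rescales the high-bit-depth SSE back to the 8-bit domain. A worked example for 10-bit input, following directly from the shift shown in the hunk:

/* For bd = 10, pixel differences can be 2^(10-8) = 4x larger than at 8 bits,
 * so squared errors grow by up to 16x.  The patch compensates with a rounding
 * shift of (bd - 8) * 2 = 4 bits:
 *   sse_8bit_scale = ROUND_POWER_OF_TWO(sse_10bit, 4);  // roughly sse / 16
 * keeping *out_sse comparable with the 8-bit rate-distortion constants. */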
+ const int pred_stride = block_size_wide[plane_bsize]; + const int pred_idx = (blk_row * pred_stride + blk_col) + << tx_size_wide_log2[0]; + int16_t *pred = &pd->pred[pred_idx]; + int i, j; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) + pred[j * pred_stride + i] = + CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i]; + } else { +#endif for (j = 0; j < bsh; j++) for (i = 0; i < bsw; i++) pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i]; +#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_DAALA_DIST - *out_dist = - pixel_sse(cpi, xd, plane, src, src_stride, recon, MAX_TX_SIZE, - blk_row, blk_col, plane_bsize, tx_bsize); -#if CONFIG_DAALA_DIST +#endif // CONFIG_HIGHBITDEPTH } -#endif // CONFIG_DAALA_DIST +#endif // CONFIG_DIST_8X8 + *out_dist = + pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, + blk_row, blk_col, plane_bsize, tx_bsize); } *out_dist *= 16; } else { @@ -1453,33 +1870,25 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, int64_t rd1, rd2, rd; RD_STATS this_rd_stats; - assert(tx_size == get_tx_size(plane, xd)); +#if !CONFIG_SUPERTX && !CONFIG_VAR_TX + assert(tx_size == av1_get_tx_size(plane, xd)); +#endif // !CONFIG_SUPERTX av1_init_rd_stats(&this_rd_stats); if (args->exit_early) return; if (!is_inter_block(mbmi)) { -#if CONFIG_CFL - -#if CONFIG_EC_ADAPT - FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; -#else - FRAME_CONTEXT *const ec_ctx = cm->fc; -#endif // CONFIG_EC_ADAPT - - av1_predict_intra_block_encoder_facade(x, ec_ctx, plane, block, blk_col, - blk_row, tx_size, plane_bsize); -#else av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size); -#endif #if CONFIG_DPCM_INTRA const int block_raster_idx = av1_block_index_to_raster_order(tx_size, block); - const PREDICTION_MODE mode = - (plane == 0) ? get_y_mode(xd->mi[0], block_raster_idx) : mbmi->uv_mode; - TX_TYPE tx_type = get_tx_type((plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV, - xd, block, tx_size); + const PREDICTION_MODE mode = (plane == AOM_PLANE_Y) + ? get_y_mode(xd->mi[0], block_raster_idx) + : get_uv_mode(mbmi->uv_mode); + TX_TYPE tx_type = + av1_get_tx_type((plane == AOM_PLANE_Y) ? 
PLANE_TYPE_Y : PLANE_TYPE_UV, + xd, blk_row, blk_col, block, tx_size); if (av1_use_dpcm_intra(plane, mode, tx_type, mbmi)) { int8_t skip; av1_encode_block_intra_dpcm(cm, x, mode, plane, block, blk_row, blk_col, @@ -1496,9 +1905,36 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, #if !CONFIG_TXK_SEL // full forward transform and quantization const int coeff_ctx = combine_entropy_contexts(*a, *l); +#if DISABLE_TRELLISQ_SEARCH + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + coeff_ctx, AV1_XFORM_QUANT_B); +#else av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - av1_optimize_b(cm, x, plane, block, plane_bsize, tx_size, a, l); + + const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; + tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); + const int buffer_length = tx_size_2d[tx_size]; + int64_t tmp_dist; + int64_t tmp; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + tmp_dist = + av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd) >> + shift; + else +#endif + tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp) >> shift; + + if (RDCOST(x->rdmult, 0, tmp_dist) + args->this_rd < args->best_rd) { + av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, + a, l); + } else { + args->exit_early = 1; + return; + } +#endif // DISABLE_TRELLISQ_SEARCH if (!is_inter_block(mbmi)) { struct macroblock_plane *const p = &x->plane[plane]; @@ -1518,25 +1954,27 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, const int dst_stride = pd->dst.stride; uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; - cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size); + // TODO (ltrudeau) Store sub-8x8 inter blocks when bottom right block is + // intra predicted. 
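The DISABLE_TRELLISQ_SEARCH branch above gates the expensive trellis pass behind a cheap transform-domain bound: a distortion-only cost is computed from av1_block_error() on the quantized coefficients, and av1_optimize_b() only runs if that bound can still beat the best candidate so far. A minimal sketch of the gate, with the helper name and parameters being illustrative rather than part of the patch:

#include <stdint.h>

/* dist_only_cost corresponds to RDCOST(x->rdmult, 0, tmp_dist) in the hunk;
 * this_rd and best_rd to the running and best costs in rdcost_block_args.
 * Returns nonzero when spending time on trellis optimization is worthwhile;
 * otherwise the caller sets exit_early, exactly as block_rd_txfm() does. */
static int worth_running_trellis(int64_t dist_only_cost, int64_t this_rd,
                                 int64_t best_rd) {
  return dist_only_cost + this_rd < best_rd;
}

Setting DISABLE_TRELLISQ_SEARCH to 1 (see the #define added earlier in this patch) compiles the whole gate out and quantizes with AV1_XFORM_QUANT_B directly.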
+ cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size, plane_bsize); } #endif #if CONFIG_DPCM_INTRA CALCULATE_RD : {} #endif // CONFIG_DPCM_INTRA - rd = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.dist); + rd = RDCOST(x->rdmult, 0, this_rd_stats.dist); if (args->this_rd + rd > args->best_rd) { args->exit_early = 1; return; } #if !CONFIG_PVQ const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); - const SCAN_ORDER *scan_order = - get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + const TX_TYPE tx_type = + av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); + const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, mbmi); this_rd_stats.rate = - av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order, a, l, - args->use_fast_coef_costing); + av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, tx_size, + scan_order, a, l, args->use_fast_coef_costing); #else // !CONFIG_PVQ this_rd_stats.rate = x->rate; #endif // !CONFIG_PVQ @@ -1554,22 +1992,12 @@ CALCULATE_RD : {} av1_set_txb_context(x, plane, block, tx_size, a, l); #endif // !CONFIG_PVQ - rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist); - rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse); + rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse); // TODO(jingning): temporarily enabled only for luma component rd = AOMMIN(rd1, rd2); -#if CONFIG_DAALA_DIST - if (plane == 0 && plane_bsize >= BLOCK_8X8 && - (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) { - this_rd_stats.dist = 0; - this_rd_stats.sse = 0; - rd = 0; - x->rate_4x4[block] = this_rd_stats.rate; - } -#endif // CONFIG_DAALA_DIST - #if !CONFIG_PVQ this_rd_stats.skip &= !x->plane[plane].eobs[block]; #else @@ -1579,113 +2007,93 @@ CALCULATE_RD : {} args->this_rd += rd; - if (args->this_rd > args->best_rd) { - args->exit_early = 1; - return; +#if CONFIG_DIST_8X8 + if (!(plane == 0 && plane_bsize >= BLOCK_8X8 && + (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))) { +#endif + if (args->this_rd > args->best_rd) { + args->exit_early = 1; + return; + } +#if CONFIG_DIST_8X8 } +#endif } -#if CONFIG_DAALA_DIST -static void block_8x8_rd_txfm_daala_dist(int plane, int block, int blk_row, - int blk_col, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - struct rdcost_block_args *args = arg; - MACROBLOCK *const x = args->x; +#if CONFIG_DIST_8X8 +static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, + struct rdcost_block_args *args) { MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[0]; + const struct macroblock_plane *const p = &x->plane[0]; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - int64_t rd, rd1, rd2; - RD_STATS this_rd_stats; - int qm = OD_HVS_QM; - int use_activity_masking = 0; - - (void)tx_size; - - assert(plane == 0); - assert(plane_bsize >= BLOCK_8X8); -#if CONFIG_PVQ - use_activity_masking = x->daala_enc.use_activity_masking; -#endif // CONFIG_PVQ - av1_init_rd_stats(&this_rd_stats); + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + const uint8_t *src = &p->src.buf[0]; + const uint8_t *dst = &pd->dst.buf[0]; + const int16_t *pred = &pd->pred[0]; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; - if (args->exit_early) return; + int i, j; + int64_t rd, rd1, rd2; + unsigned int tmp1, tmp2; + int qindex = 
x->qindex; - { - const struct macroblock_plane *const p = &x->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; + assert((bw & 0x07) == 0); + assert((bh & 0x07) == 0); - const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; - const int diff_stride = block_size_wide[plane_bsize]; +#if CONFIG_HIGHBITDEPTH + uint8_t *pred8; + DECLARE_ALIGNED(16, uint16_t, pred16[MAX_TX_SQUARE]); - const uint8_t *src = - &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; - const uint8_t *dst = - &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + pred8 = CONVERT_TO_BYTEPTR(pred16); + else + pred8 = (uint8_t *)pred16; +#else + DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]); +#endif // CONFIG_HIGHBITDEPTH - unsigned int tmp1, tmp2; - int qindex = x->qindex; - const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - int16_t *pred = &pd->pred[pred_idx]; - int i, j; - const int tx_blk_size = 8; - - DECLARE_ALIGNED(16, uint8_t, pred8[8 * 8]); - - for (j = 0; j < tx_blk_size; j++) - for (i = 0; i < tx_blk_size; i++) - pred8[j * tx_blk_size + i] = pred[j * diff_stride + i]; - - tmp1 = av1_daala_dist(src, src_stride, pred8, tx_blk_size, 8, 8, qm, - use_activity_masking, qindex); - tmp2 = av1_daala_dist(src, src_stride, dst, dst_stride, 8, 8, qm, - use_activity_masking, qindex); - - if (!is_inter_block(mbmi)) { - this_rd_stats.sse = (int64_t)tmp1 * 16; - this_rd_stats.dist = (int64_t)tmp2 * 16; - } else { - // For inter mode, the decoded pixels are provided in pd->pred, - // while the predicted pixels are in dst. - this_rd_stats.sse = (int64_t)tmp2 * 16; - this_rd_stats.dist = (int64_t)tmp1 * 16; - } +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < bh; j++) + for (i = 0; i < bw; i++) + CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i]; + } else { +#endif + for (j = 0; j < bh; j++) + for (i = 0; i < bw; i++) pred8[j * bw + i] = pred[j * bw + i]; +#if CONFIG_HIGHBITDEPTH } +#endif // CONFIG_HIGHBITDEPTH - rd = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.dist); - if (args->this_rd + rd > args->best_rd) { - args->exit_early = 1; - return; + tmp1 = av1_dist_8x8(cpi, xd, src, src_stride, pred8, bw, bsize, bw, bh, bw, + bh, qindex); + tmp2 = av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride, bsize, bw, bh, + bw, bh, qindex); + + if (!is_inter_block(mbmi)) { + args->rd_stats.sse = (int64_t)tmp1 * 16; + args->rd_stats.dist = (int64_t)tmp2 * 16; + } else { + // For inter mode, the decoded pixels are provided in pd->pred, + // while the predicted pixels are in dst. + args->rd_stats.sse = (int64_t)tmp2 * 16; + args->rd_stats.dist = (int64_t)tmp1 * 16; } - { - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - const uint8_t txw_unit = tx_size_wide_unit[tx_size]; - const uint8_t txh_unit = tx_size_high_unit[tx_size]; - const int step = txw_unit * txh_unit; - int offset_h = tx_size_high_unit[TX_4X4]; - // The rate of the current 8x8 block is the sum of four 4x4 blocks in it. 
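The tmp1/tmp2 assignment above is easy to misread: tmp1 is the distortion of src against the pd->pred copy and tmp2 against dst, and which one counts as sse versus dist flips between intra and inter blocks (for inter blocks the decoded pixels were stashed in pd->pred earlier in the patch, so dst holds the prediction). A compact restatement, wrapped in an illustrative helper:

#include <stdint.h>

/* dist_vs_pred = av1_dist_8x8(src, pd->pred copy, ...)  -> tmp1 in the hunk
 * dist_vs_dst  = av1_dist_8x8(src, dst, ...)            -> tmp2 in the hunk */
static void assign_sse_dist(int is_inter, int64_t dist_vs_pred,
                            int64_t dist_vs_dst, int64_t *sse, int64_t *dist) {
  if (!is_inter) {
    /* Intra: dst is the reconstruction, pd->pred the prediction. */
    *sse = dist_vs_pred * 16;
    *dist = dist_vs_dst * 16;
  } else {
    /* Inter: pd->pred holds the decoded pixels, dst the prediction. */
    *sse = dist_vs_dst * 16;
    *dist = dist_vs_pred * 16;
  }
}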
- this_rd_stats.rate = - x->rate_4x4[block - max_blocks_wide * offset_h - step] + - x->rate_4x4[block - max_blocks_wide * offset_h] + - x->rate_4x4[block - step] + x->rate_4x4[block]; - } - rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist); - rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse); + rd1 = RDCOST(x->rdmult, args->rd_stats.rate, args->rd_stats.dist); + rd2 = RDCOST(x->rdmult, 0, args->rd_stats.sse); rd = AOMMIN(rd1, rd2); - args->rd_stats.dist += this_rd_stats.dist; - args->rd_stats.sse += this_rd_stats.sse; - - args->this_rd += rd; + args->rd_stats.rdcost = rd; + args->this_rd = rd; - if (args->this_rd > args->best_rd) { - args->exit_early = 1; - return; - } + if (args->this_rd > args->best_rd) args->exit_early = 1; } -#endif // CONFIG_DAALA_DIST +#endif // CONFIG_DIST_8X8 static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, RD_STATS *rd_stats, int64_t ref_best_rd, int plane, @@ -1705,15 +2113,13 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left); -#if CONFIG_DAALA_DIST - if (plane == 0 && bsize >= BLOCK_8X8 && + av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm, + &args); +#if CONFIG_DIST_8X8 + if (!args.exit_early && plane == 0 && bsize >= BLOCK_8X8 && (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) - av1_foreach_8x8_transformed_block_in_yplane( - xd, bsize, block_rd_txfm, block_8x8_rd_txfm_daala_dist, &args); - else -#endif // CONFIG_DAALA_DIST - av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm, - &args); + dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args); +#endif if (args.exit_early) { av1_invalid_rd_stats(rd_stats); @@ -1768,8 +2174,14 @@ static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x, const MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int tx_select = - cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8; + const int tx_select = cm->tx_mode == TX_MODE_SELECT && +#if CONFIG_EXT_PARTITION_TYPES + // Currently these block shapes can only use 4x4 + // transforms + mbmi->sb_type != BLOCK_4X16 && + mbmi->sb_type != BLOCK_16X4 && +#endif + mbmi->sb_type >= BLOCK_8X8; if (tx_select) { const int is_inter = is_inter_block(mbmi); @@ -1779,11 +2191,11 @@ static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x, const int depth = tx_size_to_depth(coded_tx_size); const int tx_size_ctx = get_tx_size_context(xd); int r_tx_size = cpi->tx_size_cost[tx_size_cat][tx_size_ctx][depth]; -#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) if (is_quarter_tx_allowed(xd, mbmi, is_inter) && tx_size != coded_tx_size) r_tx_size += av1_cost_bit(cm->fc->quarter_tx_size_prob, tx_size == quarter_txsize_lookup[bsize]); -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#endif return r_tx_size; } else { return 0; @@ -1796,6 +2208,10 @@ int av1_tx_type_cost(const AV1_COMP *cpi, const MACROBLOCKD *xd, TX_TYPE tx_type) { if (plane > 0) return 0; +#if CONFIG_VAR_TX + tx_size = get_min_tx_size(tx_size); +#endif + const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; const int is_inter = is_inter_block(mbmi); #if CONFIG_EXT_TX @@ -1844,6 +2260,9 @@ static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, const int r_tx_size = tx_size_cost(cpi, x, bs, tx_size); +#if CONFIG_PVQ + assert(tx_size >= TX_4X4); +#endif // CONFIG_PVQ assert(skip_prob > 
0); #if CONFIG_EXT_TX && CONFIG_RECT_TX assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs))); @@ -1864,21 +2283,20 @@ static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, if (rd_stats->skip) { if (is_inter) { - rd = RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse); + rd = RDCOST(x->rdmult, s1, rd_stats->sse); } else { - rd = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size * tx_select, - rd_stats->sse); + rd = RDCOST(x->rdmult, s1 + r_tx_size * tx_select, rd_stats->sse); } } else { - rd = RDCOST(x->rdmult, x->rddiv, - rd_stats->rate + s0 + r_tx_size * tx_select, rd_stats->dist); + rd = RDCOST(x->rdmult, rd_stats->rate + s0 + r_tx_size * tx_select, + rd_stats->dist); } if (tx_select) rd_stats->rate += r_tx_size; if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !(rd_stats->skip)) - rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse)); + rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse)); return rd; } @@ -1895,6 +2313,11 @@ static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs, // transforms should be considered for pruning prune = prune_tx_types(cpi, bs, x, xd, -1); +#if CONFIG_MRC_TX + // MRC_DCT only implemented for TX_32X32 so only include this tx in + // the search for TX_32X32 + if (tx_type == MRC_DCT && tx_size != TX_32X32) return 1; +#endif // CONFIG_MRC_TX if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) return 1; if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, tx_size)) return 1; @@ -1929,7 +2352,8 @@ static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs, return 0; } -#if CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT) +#if CONFIG_EXT_INTER && \ + (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA) static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, MACROBLOCK *x, int *r, int64_t *d, int *s, int64_t *sse, int64_t ref_best_rd) { @@ -2020,14 +2444,13 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, av1_tx_type_cost(cpi, xd, bs, plane, mbmi->tx_size, tx_type); if (this_rd_stats.skip) - this_rd = RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse); + this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse); else - this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + s0, - this_rd_stats.dist); + this_rd = + RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist); if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] && !this_rd_stats.skip) - this_rd = - AOMMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse)); + this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse)); if (this_rd < best_rd) { best_rd = this_rd; @@ -2068,13 +2491,12 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, continue; } if (this_rd_stats.skip) - this_rd = RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse); + this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse); else - this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + s0, - this_rd_stats.dist); - if (is_inter && !xd->lossless[mbmi->segment_id] && !this_rd_stats.skip) this_rd = - AOMMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse)); + RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist); + if (is_inter && !xd->lossless[mbmi->segment_id] && !this_rd_stats.skip) + this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse)); if (this_rd < best_rd) { best_rd = this_rd; @@ -2129,7 +2551,6 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, TX_TYPE best_tx_type = 
DCT_DCT; #if CONFIG_TXK_SEL TX_TYPE best_txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; - const int num_blk = bsize_to_num_blk(bs); #endif // CONFIG_TXK_SEL const int tx_select = cm->tx_mode == TX_MODE_SELECT; const int is_inter = is_inter_block(mbmi); @@ -2171,8 +2592,7 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, rect_tx_size); if (rd < best_rd) { #if CONFIG_TXK_SEL - memcpy(best_txk_type, mbmi->txk_type, - sizeof(best_txk_type[0]) * num_blk); + memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type[0]) * 256); #endif best_tx_type = tx_type; best_tx_size = rect_tx_size; @@ -2278,8 +2698,7 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, last_rd = rd; if (rd < best_rd) { #if CONFIG_TXK_SEL - memcpy(best_txk_type, mbmi->txk_type, - sizeof(best_txk_type[0]) * num_blk); + memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type[0]) * 256); #endif best_tx_type = tx_type; best_tx_size = n; @@ -2295,7 +2714,7 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, mbmi->tx_size = best_tx_size; mbmi->tx_type = best_tx_type; #if CONFIG_TXK_SEL - memcpy(mbmi->txk_type, best_txk_type, sizeof(best_txk_type[0]) * num_blk); + memcpy(mbmi->txk_type, best_txk_type, sizeof(best_txk_type[0]) * 256); #endif #if CONFIG_VAR_TX @@ -2366,21 +2785,7 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, int block = 0; for (row = 0; row < max_blocks_high; row += stepr) { for (col = 0; col < max_blocks_wide; col += stepc) { -#if CONFIG_CFL - const struct macroblockd_plane *const pd = &xd->plane[0]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); - -#if CONFIG_EC_ADAPT - FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; -#else - FRAME_CONTEXT *const ec_ctx = cpi->common.fc; -#endif // CONFIG_EC_ADAPT - - av1_predict_intra_block_encoder_facade(x, ec_ctx, 0, block, col, row, - tx_size, plane_bsize); -#else av1_predict_intra_block_facade(xd, 0, block, col, row, tx_size); -#endif block += step; } } @@ -2388,7 +2793,8 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse); #if CONFIG_EXT_INTRA - if (av1_is_directional_mode(mbmi->mode, bsize)) { + if (av1_is_directional_mode(mbmi->mode, bsize) && + av1_use_angle_delta(bsize)) { mode_cost += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, MAX_ANGLE_DELTA + mbmi->angle_delta[0]); } @@ -2405,8 +2811,8 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, } } #endif // CONFIG_FILTER_INTRA - this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + mode_cost, - this_rd_stats.dist); + this_rd = + RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist); return this_rd; } @@ -2620,7 +3026,7 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); if (tokenonly_rd_stats.rate == INT_MAX) continue; this_rate = tokenonly_rd_stats.rate + palette_mode_cost; - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (!xd->lossless[mbmi->segment_id] && mbmi->sb_type >= BLOCK_8X8) { tokenonly_rd_stats.rate -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); } @@ -2773,15 +3179,17 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( src_stride, dst, dst_stride, xd->bd); #endif if (is_lossless) { - TX_TYPE tx_type = 
get_tx_type(PLANE_TYPE_Y, xd, block, tx_size); - const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0); + TX_TYPE tx_type = + av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size); + const SCAN_ORDER *scan_order = + get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); #if !CONFIG_PVQ av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - ratey += av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, - tempa + idx, templ + idy, + ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, + scan_order, tempa + idx, templ + idy, cpi->sf.use_fast_coef_costing); skip = (p->eobs[block] == 0); can_skip &= skip; @@ -2806,28 +3214,38 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( templ[idy] = !skip; can_skip &= skip; #endif - if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) + if (RDCOST(x->rdmult, ratey, distortion) >= best_rd) goto next_highbd; #if CONFIG_PVQ if (!skip) #endif av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), +#if CONFIG_LGT + mode, +#endif DCT_DCT, tx_size, dst, dst_stride, p->eobs[block]); } else { int64_t dist; unsigned int tmp; - TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size); - const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0); + TX_TYPE tx_type = + av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size); + const SCAN_ORDER *scan_order = + get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); #if !CONFIG_PVQ +#if DISABLE_TRELLISQ_SEARCH + av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, + tx_size, coeff_ctx, AV1_XFORM_QUANT_B); +#else av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - av1_optimize_b(cm, x, 0, block, BLOCK_8X8, tx_size, tempa + idx, - templ + idy); - ratey += av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, - tempa + idx, templ + idy, + av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size, + tempa + idx, templ + idy); +#endif // DISABLE_TRELLISQ_SEARCH + ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, + scan_order, tempa + idx, templ + idy, cpi->sf.use_fast_coef_costing); skip = (p->eobs[block] == 0); can_skip &= skip; @@ -2855,19 +3273,22 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( if (!skip) #endif av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), +#if CONFIG_LGT + mode, +#endif tx_type, tx_size, dst, dst_stride, p->eobs[block]); cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp); dist = (int64_t)tmp << 4; distortion += dist; - if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) + if (RDCOST(x->rdmult, ratey, distortion) >= best_rd) goto next_highbd; } } } rate += ratey; - this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + this_rd = RDCOST(x->rdmult, rate, distortion); if (this_rd < best_rd) { *bestrate = rate; @@ -2966,14 +3387,24 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( aom_subtract_block(tx_height, tx_width, src_diff, 8, src, src_stride, dst, dst_stride); #endif // !CONFIG_PVQ - - TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size); - const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0); + TX_TYPE tx_type = + av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size); + const SCAN_ORDER *scan_order = + get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); const int coeff_ctx = 
combine_entropy_contexts(tempa[idx], templ[idy]); #if CONFIG_CB4X4 block = 4 * block; #endif // CONFIG_CB4X4 #if !CONFIG_PVQ +#if DISABLE_TRELLISQ_SEARCH + av1_xform_quant(cm, x, 0, block, +#if CONFIG_CB4X4 + 2 * (row + idy), 2 * (col + idx), +#else + row + idy, col + idx, +#endif // CONFIG_CB4X4 + BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_B); +#else const AV1_XFORM_QUANT xform_quant = is_lossless ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP; av1_xform_quant(cm, x, 0, block, @@ -2984,12 +3415,12 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( #endif // CONFIG_CB4X4 BLOCK_8X8, tx_size, coeff_ctx, xform_quant); - av1_optimize_b(cm, x, 0, block, BLOCK_8X8, tx_size, tempa + idx, + av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size, tempa + idx, templ + idy); - - ratey += - av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, tempa + idx, - templ + idy, cpi->sf.use_fast_coef_costing); +#endif // DISABLE_TRELLISQ_SEARCH + ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, scan_order, + tempa + idx, templ + idy, + cpi->sf.use_fast_coef_costing); skip = (p->eobs[block] == 0); can_skip &= skip; tempa[idx] = !skip; @@ -3028,6 +3459,9 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( if (!skip) #endif // CONFIG_PVQ av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), +#if CONFIG_LGT + mode, +#endif tx_type, tx_size, dst, dst_stride, p->eobs[block]); unsigned int tmp; @@ -3036,14 +3470,16 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( distortion += dist; } - if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) - goto next; + if (RDCOST(x->rdmult, ratey, distortion) >= best_rd) goto next; if (is_lossless) { // Calculate inverse txfm *after* RD cost. #if CONFIG_PVQ if (!skip) #endif // CONFIG_PVQ av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), +#if CONFIG_LGT + mode, +#endif DCT_DCT, tx_size, dst, dst_stride, p->eobs[block]); } @@ -3051,7 +3487,7 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( } rate += ratey; - this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + this_rd = RDCOST(x->rdmult, rate, distortion); if (this_rd < best_rd) { *bestrate = rate; @@ -3153,9 +3589,9 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, cpi, mb, idy, idx, &best_mode, bmode_costs, xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r, &ry, &d, bsize, tx_size, y_skip, best_rd - total_rd); -#if !CONFIG_DAALA_DIST +#if !CONFIG_DIST_8X8 if (this_rd >= best_rd - total_rd) return INT64_MAX; -#endif // !CONFIG_DAALA_DIST +#endif // !CONFIG_DIST_8X8 total_rd += this_rd; cost += r; total_distortion += d; @@ -3172,7 +3608,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, } mbmi->mode = mic->bmi[3].as_mode; -#if CONFIG_DAALA_DIST +#if CONFIG_DIST_8X8 { const struct macroblock_plane *p = &mb->plane[0]; const struct macroblockd_plane *pd = &xd->plane[0]; @@ -3180,18 +3616,16 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, const int dst_stride = pd->dst.stride; uint8_t *src = p->src.buf; uint8_t *dst = pd->dst.buf; - int use_activity_masking = 0; - int qm = OD_HVS_QM; #if CONFIG_PVQ use_activity_masking = mb->daala_enc.use_activity_masking; #endif // CONFIG_PVQ // Daala-defined distortion computed for the block of 8x8 pixels - total_distortion = av1_daala_dist(src, src_stride, dst, dst_stride, 8, 8, - qm, use_activity_masking, mb->qindex) + total_distortion = av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride, + BLOCK_8X8, 8, 8, 8, 8, 
mb->qindex) << 4; } -#endif // CONFIG_DAALA_DIST +#endif // CONFIG_DIST_8X8 // Add in the cost of the transform type if (!is_lossless) { int rate_tx_type = 0; @@ -3218,7 +3652,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, *rate_y = tot_rate_y; *distortion = total_distortion; - return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion); + return RDCOST(mb->rdmult, cost, total_distortion); } #if CONFIG_FILTER_INTRA @@ -3261,7 +3695,7 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, this_rate = tokenonly_rd_stats.rate + av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 1) + write_uniform_cost(FILTER_INTRA_MODES, mode) + mode_cost; - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; @@ -3321,7 +3755,7 @@ static int64_t calc_rd_given_intra_angle( this_rate = tokenonly_rd_stats.rate + mode_cost + write_uniform_cost(2 * max_angle_delta + 1, mbmi->angle_delta[0] + max_angle_delta); - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; @@ -3496,8 +3930,8 @@ static void angle_estimation(const uint8_t *src, int src_stride, int rows, uint8_t *directional_mode_skip_mask) { memset(directional_mode_skip_mask, 0, INTRA_MODES * sizeof(*directional_mode_skip_mask)); - // Sub-8x8 blocks do not use extra directions. - if (bsize < BLOCK_8X8) return; + // Check if angle_delta is used + if (!av1_use_angle_delta(bsize)) return; uint64_t hist[DIRECTIONAL_MODES]; memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0])); src += src_stride; @@ -3551,8 +3985,8 @@ static void highbd_angle_estimation(const uint8_t *src8, int src_stride, uint8_t *directional_mode_skip_mask) { memset(directional_mode_skip_mask, 0, INTRA_MODES * sizeof(*directional_mode_skip_mask)); - // Sub-8x8 blocks do not use extra directions. 
- if (bsize < BLOCK_8X8) return; + // Check if angle_delta is used + if (!av1_use_angle_delta(bsize)) return; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint64_t hist[DIRECTIONAL_MODES]; memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0])); @@ -3608,7 +4042,6 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize, int64_t best_rd) { - uint8_t mode_idx; MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mic = xd->mi[0]; MB_MODE_INFO *const mbmi = &mic->mbmi; @@ -3683,7 +4116,7 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, x->use_default_intra_tx_type = 0; /* Y Search for intra prediction mode */ - for (mode_idx = DC_PRED; mode_idx <= FINAL_MODE_SEARCH; ++mode_idx) { + for (int mode_idx = DC_PRED; mode_idx <= FINAL_MODE_SEARCH; ++mode_idx) { RD_STATS this_rd_stats; int this_rate, this_rate_tokenonly, s; int64_t this_distortion, this_rd, this_model_rd; @@ -3692,7 +4125,8 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, mbmi->mode = best_mbmi.mode; x->use_default_intra_tx_type = 0; } else { - mbmi->mode = mode_idx; + assert(mode_idx < INTRA_MODES); + mbmi->mode = intra_rd_search_mode_order[mode_idx]; } #if CONFIG_PVQ od_encode_rollback(&x->daala_enc, &pre_buf); @@ -3708,7 +4142,7 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, #if CONFIG_EXT_INTRA is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize); if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; - if (is_directional_mode) { + if (is_directional_mode && av1_use_angle_delta(bsize)) { this_rd_stats.rate = INT_MAX; rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize, bmode_costs[mbmi->mode], best_rd, &best_model_rd); @@ -3754,11 +4188,13 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, this_rate += cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter]; #endif // CONFIG_INTRA_INTERP - this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[0]); + if (av1_use_angle_delta(bsize)) { + this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, + MAX_ANGLE_DELTA + mbmi->angle_delta[0]); + } } #endif // CONFIG_EXT_INTRA - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); + this_rd = RDCOST(x->rdmult, this_rate, this_distortion); #if CONFIG_FILTER_INTRA if (best_rd == INT64_MAX || this_rd - best_rd < (best_rd >> 4)) { filter_intra_mode_skip_mask ^= (1 << mbmi->mode); @@ -3785,16 +4221,6 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, od_encode_rollback(&x->daala_enc, &post_buf); #endif // CONFIG_PVQ -#if CONFIG_CFL - // Perform one extra txfm_rd_in_plane() call, this time with the best value so - // we can store reconstructed luma values - RD_STATS this_rd_stats; - x->cfl_store_y = 1; - txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, 0, bsize, - mic->mbmi.tx_size, cpi->sf.use_fast_coef_costing); - x->cfl_store_y = 0; -#endif - #if CONFIG_PALETTE if (try_palette) { rd_pick_palette_intra_sby(cpi, x, bsize, palette_y_mode_ctx, @@ -3826,7 +4252,7 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t ref_best_rd) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]); + const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]); 
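// ---------------------------------------------------------------------------
// Editorial note (not part of the patch): the hunks above and below change
// every rate-distortion cost call from the old four-argument form
// RDCOST(x->rdmult, x->rddiv, rate, dist) to the three-argument form
// RDCOST(x->rdmult, rate, dist); the per-call distortion shift (rddiv) is
// dropped and folded into the macro itself.  The actual macro definition
// lives in av1/encoder/rd.h and is not part of this diff; the sketch below
// is only an illustration of the general shape such a cost function takes.
// The shift constants and the helper name are assumptions, not values taken
// from the patch.
#include <stdint.h>

static inline int64_t rdcost_sketch(int rdmult, int rate, int64_t dist) {
  // rate is an entropy-coder cost scaled by rdmult (lambda); dist is a sum
  // of squared errors.  The old form passed the distortion shift explicitly
  // as x->rddiv; here it is a fixed constant baked into the cost function.
  const int rd_rate_shift = 9; // assumed rate scaling shift
  const int rd_dist_shift = 3; // assumed fixed distortion shift replacing rddiv
  return (((int64_t)rate * rdmult + (1 << (rd_rate_shift - 1))) >>
          rd_rate_shift) +
         ((int64_t)dist << rd_dist_shift);
}
// ---------------------------------------------------------------------------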
int plane; int is_cost_valid = 1; av1_init_rd_stats(rd_stats); @@ -3857,9 +4283,8 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, break; } av1_merge_rd_stats(rd_stats, &pn_rd_stats); - if (RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist) > - ref_best_rd && - RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse) > ref_best_rd) { + if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) > ref_best_rd && + RDCOST(x->rdmult, 0, rd_stats->sse) > ref_best_rd) { is_cost_valid = 0; break; } @@ -3875,13 +4300,6 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, } #if CONFIG_VAR_TX -// FIXME crop these calls -static uint64_t sum_squares_2d(const int16_t *diff, int diff_stride, - TX_SIZE tx_size) { - return aom_sum_squares_2d_i16(diff, diff_stride, tx_size_wide[tx_size], - tx_size_high[tx_size]); -} - void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, int blk_row, int blk_col, int plane, int block, int plane_bsize, const ENTROPY_CONTEXT *a, @@ -3890,18 +4308,23 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, MACROBLOCKD *xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; + +#if CONFIG_TXK_SEL + av1_search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, + tx_size, a, l, 0, rd_stats); + return; +#endif + int64_t tmp; tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); PLANE_TYPE plane_type = get_plane_type(plane); - TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + TX_TYPE tx_type = + av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); const SCAN_ORDER *const scan_order = - get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi)); + get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size]; int bh = block_size_high[txm_bsize]; int bw = block_size_wide[txm_bsize]; - int txb_h = tx_size_high_unit[tx_size]; - int txb_w = tx_size_wide_unit[tx_size]; - int src_stride = p->src.stride; uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; @@ -3914,30 +4337,15 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, #else DECLARE_ALIGNED(16, uint8_t, rec_buffer[MAX_TX_SQUARE]); #endif // CONFIG_HIGHBITDEPTH - int max_blocks_high = block_size_high[plane_bsize]; - int max_blocks_wide = block_size_wide[plane_bsize]; - const int diff_stride = max_blocks_wide; + const int diff_stride = block_size_wide[plane_bsize]; const int16_t *diff = &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; int txb_coeff_cost; assert(tx_size < TX_SIZES_ALL); - if (xd->mb_to_bottom_edge < 0) - max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y); - if (xd->mb_to_right_edge < 0) - max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x); - - max_blocks_high >>= tx_size_wide_log2[0]; - max_blocks_wide >>= tx_size_wide_log2[0]; - int coeff_ctx = get_entropy_context(tx_size, a, l); - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - coeff_ctx, AV1_XFORM_QUANT_FP); - - av1_optimize_b(cm, x, plane, block, plane_bsize, tx_size, a, l); - // TODO(any): Use av1_dist_block to compute distortion #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -3954,21 +4362,35 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, 0, bw, bh); #endif // CONFIG_HIGHBITDEPTH - if (blk_row + txb_h > 
max_blocks_high || blk_col + txb_w > max_blocks_wide) { - int idx, idy; - int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row); - int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col); - tmp = 0; - for (idy = 0; idy < blocks_height; ++idy) { - for (idx = 0; idx < blocks_width; ++idx) { - const int16_t *d = - diff + ((idy * diff_stride + idx) << tx_size_wide_log2[0]); - tmp += sum_squares_2d(d, diff_stride, 0); - } - } - } else { - tmp = sum_squares_2d(diff, diff_stride, tx_size); +#if DISABLE_TRELLISQ_SEARCH + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + coeff_ctx, AV1_XFORM_QUANT_B); + +#else + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + coeff_ctx, AV1_XFORM_QUANT_FP); + + const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + const int buffer_length = tx_size_2d[tx_size]; + int64_t tmp_dist; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + tmp_dist = + av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd) >> + shift; + else +#endif + tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp) >> shift; + + if (RDCOST(x->rdmult, 0, tmp_dist) < rd_stats->ref_rdcost) { + av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, + a, l); } +#endif // DISABLE_TRELLISQ_SEARCH + + tmp = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col, + plane_bsize, txm_bsize); #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) @@ -3977,36 +4399,48 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, rd_stats->sse += tmp * 16; const int eob = p->eobs[block]; +#if CONFIG_LGT + PREDICTION_MODE mode = get_prediction_mode(xd->mi[0], plane, tx_size, block); + av1_inverse_transform_block(xd, dqcoeff, mode, tx_type, tx_size, rec_buffer, + MAX_TX_SIZE, eob); +#else av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, rec_buffer, MAX_TX_SIZE, eob); +#endif if (eob > 0) { - if (txb_w + blk_col > max_blocks_wide || - txb_h + blk_row > max_blocks_high) { - int idx, idy; - unsigned int this_dist; - int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row); - int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col); - tmp = 0; - for (idy = 0; idy < blocks_height; ++idy) { - for (idx = 0; idx < blocks_width; ++idx) { - uint8_t *const s = - src + ((idy * src_stride + idx) << tx_size_wide_log2[0]); - uint8_t *const r = - rec_buffer + ((idy * MAX_TX_SIZE + idx) << tx_size_wide_log2[0]); - cpi->fn_ptr[0].vf(s, src_stride, r, MAX_TX_SIZE, &this_dist); - tmp += this_dist; - } +#if CONFIG_DIST_8X8 + if (plane == 0 && (bw < 8 && bh < 8)) { + // Save sub8x8 luma decoded pixels + // since 8x8 luma decoded pixels are not available for daala-dist + // after recursive split of BLOCK_8x8 is done. 
+ const int pred_stride = block_size_wide[plane_bsize]; + const int pred_idx = (blk_row * pred_stride + blk_col) + << tx_size_wide_log2[0]; + int16_t *decoded = &pd->pred[pred_idx]; + int i, j; + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < bh; j++) + for (i = 0; i < bw; i++) + decoded[j * pred_stride + i] = + CONVERT_TO_SHORTPTR(rec_buffer)[j * MAX_TX_SIZE + i]; + } else { +#endif + for (j = 0; j < bh; j++) + for (i = 0; i < bw; i++) + decoded[j * pred_stride + i] = rec_buffer[j * MAX_TX_SIZE + i]; +#if CONFIG_HIGHBITDEPTH } - } else { - uint32_t this_dist; - cpi->fn_ptr[txm_bsize].vf(src, src_stride, rec_buffer, MAX_TX_SIZE, - &this_dist); - tmp = this_dist; +#endif // CONFIG_HIGHBITDEPTH } +#endif // CONFIG_DIST_8X8 + tmp = pixel_dist(cpi, x, plane, src, src_stride, rec_buffer, MAX_TX_SIZE, + blk_row, blk_col, plane_bsize, txm_bsize); } rd_stats->dist += tmp * 16; - txb_coeff_cost = - av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order, a, l, 0); + txb_coeff_cost = av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, + tx_size, scan_order, a, l, 0); rd_stats->rate += txb_coeff_cost; rd_stats->skip &= (eob == 0); @@ -4038,14 +4472,35 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int64_t this_rd = INT64_MAX; ENTROPY_CONTEXT *pta = ta + blk_col; ENTROPY_CONTEXT *ptl = tl + blk_row; - int coeff_ctx, i; + int i; int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, mbmi->sb_type, tx_size); int64_t sum_rd = INT64_MAX; int tmp_eob = 0; int zero_blk_rate; RD_STATS sum_rd_stats; - const int tx_size_ctx = txsize_sqr_map[tx_size]; +#if CONFIG_TXK_SEL + TX_TYPE best_tx_type = TX_TYPES; + int txk_idx = (blk_row << 4) + blk_col; +#endif +#if CONFIG_RECT_TX_EXT + TX_SIZE quarter_txsize = quarter_txsize_lookup[mbmi->sb_type]; + int check_qttx = is_quarter_tx_allowed(xd, mbmi, is_inter_block(mbmi)) && + tx_size == max_txsize_rect_lookup[mbmi->sb_type] && + quarter_txsize != tx_size; + int is_qttx_picked = 0; + int eobs_qttx[2] = { 0, 0 }; + int skip_qttx[2] = { 0, 0 }; + int block_offset_qttx = check_qttx + ? tx_size_wide_unit[quarter_txsize] * + tx_size_high_unit[quarter_txsize] + : 0; + int blk_row_offset, blk_col_offset; + int is_wide_qttx = + tx_size_wide_unit[quarter_txsize] > tx_size_high_unit[quarter_txsize]; + blk_row_offset = is_wide_qttx ? tx_size_high_unit[quarter_txsize] : 0; + blk_col_offset = is_wide_qttx ? 
0 : tx_size_wide_unit[quarter_txsize]; +#endif av1_init_rd_stats(&sum_rd_stats); @@ -4056,15 +4511,25 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, return; } - coeff_ctx = get_entropy_context(tx_size, pta, ptl); - av1_init_rd_stats(rd_stats); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; +#if CONFIG_LV_MAP + TX_SIZE txs_ctx = get_txsize_context(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, pta, ptl, &txb_ctx); + zero_blk_rate = + av1_cost_bit(xd->fc->txb_skip[txs_ctx][txb_ctx.txb_skip_ctx], 1); +#else + int tx_size_ctx = txsize_sqr_map[tx_size]; + int coeff_ctx = get_entropy_context(tx_size, pta, ptl); zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0] [coeff_ctx][EOB_TOKEN]; +#endif + rd_stats->ref_rdcost = ref_best_rd; + rd_stats->zero_rate = zero_blk_rate; if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) { inter_tx_size[0][0] = tx_size; @@ -4081,8 +4546,8 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, } } - if ((RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist) >= - RDCOST(x->rdmult, x->rddiv, zero_blk_rate, rd_stats->sse) || + if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || rd_stats->skip == 1) && !xd->lossless[mbmi->segment_id]) { #if CONFIG_RD_DEBUG @@ -4094,6 +4559,9 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, rd_stats->skip = 1; x->blk_skip[plane][blk_row * bw + blk_col] = 1; p->eobs[block] = 0; +#if CONFIG_TXK_SEL + mbmi->txk_type[txk_idx] = DCT_DCT; +#endif } else { x->blk_skip[plane][blk_row * bw + blk_col] = 0; rd_stats->skip = 0; @@ -4102,23 +4570,143 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) rd_stats->rate += av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0); - this_rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist); +#if CONFIG_RECT_TX_EXT + if (check_qttx) { + assert(blk_row == 0 && blk_col == 0); + rd_stats->rate += av1_cost_bit(cpi->common.fc->quarter_tx_size_prob, 0); + } +#endif + this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); +#if CONFIG_LV_MAP + tmp_eob = p->txb_entropy_ctx[block]; +#else tmp_eob = p->eobs[block]; +#endif + +#if CONFIG_TXK_SEL + best_tx_type = mbmi->txk_type[txk_idx]; +#endif + +#if CONFIG_RECT_TX_EXT + if (check_qttx) { + assert(blk_row == 0 && blk_col == 0 && block == 0 && plane == 0); + + RD_STATS rd_stats_tmp, rd_stats_qttx; + int64_t rd_qttx; + + av1_init_rd_stats(&rd_stats_qttx); + av1_init_rd_stats(&rd_stats_tmp); + + av1_tx_block_rd_b(cpi, x, quarter_txsize, 0, 0, plane, 0, plane_bsize, + pta, ptl, &rd_stats_qttx); + + tx_size_ctx = txsize_sqr_map[quarter_txsize]; + coeff_ctx = get_entropy_context(quarter_txsize, pta, ptl); + zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0] + [coeff_ctx][EOB_TOKEN]; + if ((RDCOST(x->rdmult, rd_stats_qttx.rate, rd_stats_qttx.dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats_qttx.sse) || + rd_stats_qttx.skip == 1) && + !xd->lossless[mbmi->segment_id]) { +#if CONFIG_RD_DEBUG + av1_update_txb_coeff_cost(&rd_stats_qttx, plane, quarter_txsize, 0, 0, + zero_blk_rate - rd_stats_qttx.rate); +#endif // CONFIG_RD_DEBUG + rd_stats_qttx.rate = zero_blk_rate; + rd_stats_qttx.dist = rd_stats_qttx.sse; + rd_stats_qttx.skip = 1; + x->blk_skip[plane][blk_row * bw + blk_col] = 1; + skip_qttx[0] = 1; + p->eobs[block] = 0; + } else { 
+ x->blk_skip[plane][blk_row * bw + blk_col] = 0; + skip_qttx[0] = 0; + rd_stats->skip = 0; + } + + // Second tx block + av1_tx_block_rd_b(cpi, x, quarter_txsize, blk_row_offset, blk_col_offset, + plane, block_offset_qttx, plane_bsize, pta, ptl, + &rd_stats_tmp); + + av1_set_txb_context(x, plane, 0, quarter_txsize, pta, ptl); + coeff_ctx = get_entropy_context(quarter_txsize, pta + blk_col_offset, + ptl + blk_row_offset); + zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0] + [coeff_ctx][EOB_TOKEN]; + if ((RDCOST(x->rdmult, rd_stats_tmp.rate, rd_stats_tmp.dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats_tmp.sse) || + rd_stats_tmp.skip == 1) && + !xd->lossless[mbmi->segment_id]) { +#if CONFIG_RD_DEBUG + av1_update_txb_coeff_cost(&rd_stats_tmp, plane, quarter_txsize, 0, 0, + zero_blk_rate - rd_stats_tmp.rate); +#endif // CONFIG_RD_DEBUG + rd_stats_tmp.rate = zero_blk_rate; + rd_stats_tmp.dist = rd_stats_tmp.sse; + rd_stats_tmp.skip = 1; + x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = 1; + skip_qttx[1] = 1; + p->eobs[block_offset_qttx] = 0; + } else { + x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = 0; + skip_qttx[1] = 0; + rd_stats_tmp.skip = 0; + } + + av1_merge_rd_stats(&rd_stats_qttx, &rd_stats_tmp); + + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) { + rd_stats_qttx.rate += + av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0); + } + rd_stats_qttx.rate += + av1_cost_bit(cpi->common.fc->quarter_tx_size_prob, 1); + rd_qttx = RDCOST(x->rdmult, rd_stats_qttx.rate, rd_stats_qttx.dist); +#if CONFIG_LV_MAP + eobs_qttx[0] = p->txb_entropy_ctx[0]; + eobs_qttx[1] = p->txb_entropy_ctx[block_offset_qttx]; +#else + eobs_qttx[0] = p->eobs[0]; + eobs_qttx[1] = p->eobs[block_offset_qttx]; +#endif + if (rd_qttx < this_rd) { + is_qttx_picked = 1; + this_rd = rd_qttx; + rd_stats->rate = rd_stats_qttx.rate; + rd_stats->dist = rd_stats_qttx.dist; + rd_stats->sse = rd_stats_qttx.sse; + rd_stats->skip = rd_stats_qttx.skip; + rd_stats->rdcost = rd_stats_qttx.rdcost; + } + av1_get_entropy_contexts(plane_bsize, 0, pd, ta, tl); + } +#endif } +#if CONFIG_MRC_TX + // If the tx type we are trying is MRC_DCT, we cannot partition the transform + // into anything smaller than TX_32X32 + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH && mbmi->tx_type != MRC_DCT) { +#else if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) { +#endif const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsl = tx_size_wide_unit[sub_txs]; int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; RD_STATS this_rd_stats; int this_cost_valid = 1; int64_t tmp_rd = 0; - +#if CONFIG_DIST_8X8 + int sub8x8_eob[4]; +#endif sum_rd_stats.rate = av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1); assert(tx_size < TX_SIZES_ALL); + ref_best_rd = AOMMIN(this_rd, ref_best_rd); + for (i = 0; i < 4 && this_cost_valid; ++i) { int offsetr = blk_row + (i >> 1) * bsl; int offsetc = blk_col + (i & 0x01) * bsl; @@ -4129,30 +4717,170 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, depth + 1, plane_bsize, ta, tl, tx_above, tx_left, &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid, rd_stats_stack); - +#if CONFIG_DIST_8X8 + if (plane == 0 && tx_size == TX_8X8) { + sub8x8_eob[i] = p->eobs[block]; + } +#endif // CONFIG_DIST_8X8 av1_merge_rd_stats(&sum_rd_stats, &this_rd_stats); - tmp_rd = - RDCOST(x->rdmult, x->rddiv, sum_rd_stats.rate, sum_rd_stats.dist); + tmp_rd = RDCOST(x->rdmult, sum_rd_stats.rate, sum_rd_stats.dist); +#if !CONFIG_DIST_8X8 if 
(this_rd < tmp_rd) break; +#endif block += sub_step; } +#if CONFIG_DIST_8X8 + if (this_cost_valid && plane == 0 && tx_size == TX_8X8) { + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + + const uint8_t *src = + &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; + const uint8_t *dst = + &pd->dst + .buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + + int64_t dist_8x8; + int qindex = x->qindex; + const int pred_stride = block_size_wide[plane_bsize]; + const int pred_idx = (blk_row * pred_stride + blk_col) + << tx_size_wide_log2[0]; + int16_t *pred = &pd->pred[pred_idx]; + int j; + int row, col; + +#if CONFIG_HIGHBITDEPTH + uint8_t *pred8; + DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]); +#else + DECLARE_ALIGNED(16, uint8_t, pred8[8 * 8]); +#endif // CONFIG_HIGHBITDEPTH + + dist_8x8 = av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride, + BLOCK_8X8, 8, 8, 8, 8, qindex) * + 16; + sum_rd_stats.sse = dist_8x8; + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + pred8 = CONVERT_TO_BYTEPTR(pred8_16); + else + pred8 = (uint8_t *)pred8_16; +#endif + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (row = 0; row < 2; ++row) { + for (col = 0; col < 2; ++col) { + int idx = row * 2 + col; + int eob = sub8x8_eob[idx]; + + if (eob > 0) { + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + CONVERT_TO_SHORTPTR(pred8) + [(row * 4 + j) * 8 + 4 * col + i] = + pred[(row * 4 + j) * pred_stride + 4 * col + i]; + } else { + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + CONVERT_TO_SHORTPTR(pred8) + [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR( + dst)[(row * 4 + j) * dst_stride + 4 * col + i]; + } + } + } + } else { +#endif + for (row = 0; row < 2; ++row) { + for (col = 0; col < 2; ++col) { + int idx = row * 2 + col; + int eob = sub8x8_eob[idx]; + + if (eob > 0) { + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + pred8[(row * 4 + j) * 8 + 4 * col + i] = + pred[(row * 4 + j) * pred_stride + 4 * col + i]; + } else { + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + pred8[(row * 4 + j) * 8 + 4 * col + i] = + dst[(row * 4 + j) * dst_stride + 4 * col + i]; + } + } + } +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + dist_8x8 = av1_dist_8x8(cpi, xd, src, src_stride, pred8, 8, BLOCK_8X8, 8, + 8, 8, 8, qindex) * + 16; + sum_rd_stats.dist = dist_8x8; + tmp_rd = RDCOST(x->rdmult, sum_rd_stats.rate, sum_rd_stats.dist); + } +#endif // CONFIG_DIST_8X8 if (this_cost_valid) sum_rd = tmp_rd; } if (this_rd < sum_rd) { int idx, idy; - for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) pta[i] = !(tmp_eob == 0); - for (i = 0; i < tx_size_high_unit[tx_size]; ++i) ptl[i] = !(tmp_eob == 0); +#if CONFIG_RECT_TX_EXT + TX_SIZE tx_size_selected = is_qttx_picked ? 
quarter_txsize : tx_size; +#else + TX_SIZE tx_size_selected = tx_size; +#endif + +#if CONFIG_RECT_TX_EXT + if (is_qttx_picked) { + assert(blk_row == 0 && blk_col == 0 && plane == 0); +#if CONFIG_LV_MAP + p->txb_entropy_ctx[0] = eobs_qttx[0]; + p->txb_entropy_ctx[block_offset_qttx] = eobs_qttx[1]; +#else + p->eobs[0] = eobs_qttx[0]; + p->eobs[block_offset_qttx] = eobs_qttx[1]; +#endif + } else { +#endif +#if CONFIG_LV_MAP + p->txb_entropy_ctx[block] = tmp_eob; +#else + p->eobs[block] = tmp_eob; +#endif +#if CONFIG_RECT_TX_EXT + } +#endif + + av1_set_txb_context(x, plane, block, tx_size_selected, pta, ptl); +#if CONFIG_RECT_TX_EXT + if (is_qttx_picked) + av1_set_txb_context(x, plane, block_offset_qttx, tx_size_selected, + pta + blk_col_offset, ptl + blk_row_offset); +#endif + txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, tx_size); - inter_tx_size[0][0] = tx_size; + inter_tx_size[0][0] = tx_size_selected; for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy) for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx) - inter_tx_size[idy][idx] = tx_size; - mbmi->tx_size = tx_size; + inter_tx_size[idy][idx] = tx_size_selected; + mbmi->tx_size = tx_size_selected; +#if CONFIG_TXK_SEL + mbmi->txk_type[txk_idx] = best_tx_type; +#endif if (this_rd == INT64_MAX) *is_cost_valid = 0; - x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip; +#if CONFIG_RECT_TX_EXT + if (is_qttx_picked) { + x->blk_skip[plane][0] = skip_qttx[0]; + x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = skip_qttx[1]; + } else { +#endif + x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip; +#if CONFIG_RECT_TX_EXT + } +#endif } else { *rd_stats = sum_rd_stats; if (sum_rd == INT64_MAX) *is_cost_valid = 0; @@ -4201,17 +4929,16 @@ static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, tx_above, tx_left, &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid, rd_stats_stack); av1_merge_rd_stats(rd_stats, &pn_rd_stats); - this_rd += AOMMIN( - RDCOST(x->rdmult, x->rddiv, pn_rd_stats.rate, pn_rd_stats.dist), - RDCOST(x->rdmult, x->rddiv, 0, pn_rd_stats.sse)); + this_rd += AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist), + RDCOST(x->rdmult, 0, pn_rd_stats.sse)); block += step; ++block32; } } } - this_rd = AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist), - RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse)); + this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist), + RDCOST(x->rdmult, 0, rd_stats->sse)); if (this_rd > ref_best_rd) is_cost_valid = 0; if (!is_cost_valid) { @@ -4247,6 +4974,7 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x, mbmi->min_tx_size = AOMMIN( mbmi->min_tx_size, get_min_tx_size(mbmi->inter_tx_size[row][col])); +#if !CONFIG_TXK_SEL #if CONFIG_EXT_TX if (get_ext_tx_types(mbmi->min_tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 && @@ -4266,20 +4994,21 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x, [mbmi->tx_type]; } } -#else // CONFIG_EXT_TX +#else if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id]) rd_stats->rate += cpi->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type]; #endif // CONFIG_EXT_TX +#endif // CONFIG_TXK_SEL if (rd_stats->skip) - rd = RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse); + rd = RDCOST(x->rdmult, s1, rd_stats->sse); else - rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate + s0, rd_stats->dist); + rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist); if (is_inter && 
!xd->lossless[xd->mi[0]->mbmi.segment_id] && !(rd_stats->skip)) - rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse)); + rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse)); return rd; } @@ -4299,6 +5028,12 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE best_tx = max_txsize_lookup[bsize]; TX_SIZE best_min_tx_size = TX_SIZES_ALL; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; + TX_TYPE txk_start = DCT_DCT; +#if CONFIG_TXK_SEL + TX_TYPE txk_end = DCT_DCT + 1; +#else + TX_TYPE txk_end = TX_TYPES; +#endif const int n4 = bsize_to_num_blk(bsize); int idx, idy; int prune = 0; @@ -4326,9 +5061,14 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, for (idx = 0; idx < count32; ++idx) av1_invalid_rd_stats(&rd_stats_stack[idx]); - for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) { + for (tx_type = txk_start; tx_type < txk_end; ++tx_type) { RD_STATS this_rd_stats; av1_init_rd_stats(&this_rd_stats); +#if CONFIG_MRC_TX + // MRC_DCT only implemented for TX_32X32 so only include this tx in + // the search for TX_32X32 + if (tx_type == MRC_DCT && max_tx_size != TX_32X32) continue; +#endif // CONFIG_MRC_TX #if CONFIG_EXT_TX if (is_inter) { if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue; @@ -4384,7 +5124,6 @@ static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; const int tx_row = blk_row >> (1 - pd->subsampling_y); @@ -4402,16 +5141,11 @@ static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, : mbmi->inter_tx_size[tx_row][tx_col]; if (tx_size == plane_tx_size) { - int i; ENTROPY_CONTEXT *ta = above_ctx + blk_col; ENTROPY_CONTEXT *tl = left_ctx + blk_row; av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize, ta, tl, rd_stats); - - for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) - ta[i] = !(p->eobs[block] == 0); - for (i = 0; i < tx_size_high_unit[tx_size]; ++i) - tl[i] = !(p->eobs[block] == 0); + av1_set_txb_context(x, plane, block, tx_size, ta, tl); } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsl = tx_size_wide_unit[sub_txs]; @@ -4498,9 +5232,8 @@ static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, av1_merge_rd_stats(rd_stats, &pn_rd_stats); - this_rd = - AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist), - RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse)); + this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist), + RDCOST(x->rdmult, 0, rd_stats->sse)); if (this_rd > ref_best_rd) { is_cost_valid = 0; @@ -4543,7 +5276,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, &plane_block_height, &rows, &cols); if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return; - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; #if CONFIG_FILTER_INTRA mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; #endif // CONFIG_FILTER_INTRA @@ -4689,7 +5422,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, } } - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; *best_mbmi = *mbmi; @@ -4727,7 +5460,7 @@ static int 
rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, av1_zero(filter_intra_mode_info); mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 1; - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; #if CONFIG_PALETTE mbmi->palette_mode_info.palette_size[1] = 0; #endif // CONFIG_PALETTE @@ -4741,7 +5474,7 @@ static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 1) + cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] + write_uniform_cost(FILTER_INTRA_MODES, mode); - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; *rate = this_rate; @@ -4754,7 +5487,7 @@ static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, } if (filter_intra_selected_flag) { - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = filter_intra_mode_info.use_filter_intra_mode[1]; mbmi->filter_intra_mode_info.filter_intra_mode[1] = @@ -4782,7 +5515,7 @@ static int64_t pick_intra_angle_routine_sbuv( if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in)) return INT64_MAX; this_rate = tokenonly_rd_stats.rate + rate_overhead; - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; *best_angle_delta = mbmi->angle_delta[1]; @@ -4852,8 +5585,172 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, } #endif // CONFIG_EXT_INTRA +#if CONFIG_CFL +static int64_t cfl_alpha_dist(const uint8_t *y_pix, int y_stride, + const int y_averages_q3[MAX_NUM_TXB], + const uint8_t *src, int src_stride, int width, + int height, TX_SIZE tx_size, int dc_pred, + int alpha_q3, int64_t *dist_neg_out) { + int64_t dist = 0; + int diff; + + if (alpha_q3 == 0) { + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + diff = src[i] - dc_pred; + dist += diff * diff; + } + src += src_stride; + } + + if (dist_neg_out) *dist_neg_out = dist; + + return dist; + } + + int64_t dist_neg = 0; + const int tx_height = tx_size_high[tx_size]; + const int tx_width = tx_size_wide[tx_size]; + const int y_block_row_off = y_stride * tx_height; + const int src_block_row_off = src_stride * tx_height; + const uint8_t *t_y_pix; + const uint8_t *t_src; + int a = 0; + for (int b_j = 0; b_j < height; b_j += tx_height) { + const int h = b_j + tx_height; + for (int b_i = 0; b_i < width; b_i += tx_width) { + const int w = b_i + tx_width; + const int tx_avg_q3 = y_averages_q3[a++]; + t_y_pix = y_pix; + t_src = src; + for (int t_j = b_j; t_j < h; t_j++) { + for (int t_i = b_i; t_i < w; t_i++) { + const int uv = t_src[t_i]; + + const int scaled_luma = + get_scaled_luma_q0(alpha_q3, t_y_pix[t_i], tx_avg_q3); + + // TODO(ltrudeau) add support for HBD. + diff = uv - clamp(scaled_luma + dc_pred, 0, 255); + dist += diff * diff; + + // TODO(ltrudeau) add support for HBD. 
+ diff = uv - clamp(-scaled_luma + dc_pred, 0, 255); + dist_neg += diff * diff; + } + t_y_pix += y_stride; + t_src += src_stride; + } + } + y_pix += y_block_row_off; + src += src_block_row_off; + } + + if (dist_neg_out) *dist_neg_out = dist_neg; + + return dist; +} + +static inline void cfl_update_costs(CFL_CTX *cfl, FRAME_CONTEXT *ec_ctx) { + assert(ec_ctx->cfl_alpha_cdf[CFL_ALPHABET_SIZE - 1] == + AOM_ICDF(CDF_PROB_TOP)); + + aom_cdf_prob prev_cdf = 0; + + for (int c = 0; c < CFL_ALPHABET_SIZE; c++) { + const int sign_bit_cost = (cfl_alpha_codes[c][CFL_PRED_U] != 0) + + (cfl_alpha_codes[c][CFL_PRED_V] != 0); + + aom_cdf_prob prob = AOM_ICDF(ec_ctx->cfl_alpha_cdf[c]) - prev_cdf; + prev_cdf = AOM_ICDF(ec_ctx->cfl_alpha_cdf[c]); + + cfl->costs[c] = av1_cost_symbol(prob) + av1_cost_literal(sign_bit_cost); + } +} + +static int cfl_rd_pick_alpha(MACROBLOCK *const x, TX_SIZE tx_size) { + const struct macroblock_plane *const p_u = &x->plane[AOM_PLANE_U]; + const struct macroblock_plane *const p_v = &x->plane[AOM_PLANE_V]; + const uint8_t *const src_u = p_u->src.buf; + const uint8_t *const src_v = p_v->src.buf; + const int src_stride_u = p_u->src.stride; + const int src_stride_v = p_v->src.stride; + + MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + + CFL_CTX *const cfl = xd->cfl; + cfl_compute_parameters(xd, tx_size); + const int width = cfl->uv_width; + const int height = cfl->uv_height; + const int dc_pred_u = cfl->dc_pred[CFL_PRED_U]; + const int dc_pred_v = cfl->dc_pred[CFL_PRED_V]; + const int *y_averages_q3 = cfl->y_averages_q3; + const uint8_t *y_pix = cfl->y_down_pix; + + CFL_SIGN_TYPE *signs = mbmi->cfl_alpha_signs; + + cfl_update_costs(cfl, ec_ctx); + + int64_t sse[CFL_PRED_PLANES][CFL_MAGS_SIZE]; + sse[CFL_PRED_U][0] = + cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, + width, height, tx_size, dc_pred_u, 0, NULL); + sse[CFL_PRED_V][0] = + cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, + width, height, tx_size, dc_pred_v, 0, NULL); + + for (int m = 1; m < CFL_MAGS_SIZE; m += 2) { + assert(cfl_alpha_mags_q3[m + 1] == -cfl_alpha_mags_q3[m]); + sse[CFL_PRED_U][m] = cfl_alpha_dist( + y_pix, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, width, height, + tx_size, dc_pred_u, cfl_alpha_mags_q3[m], &sse[CFL_PRED_U][m + 1]); + sse[CFL_PRED_V][m] = cfl_alpha_dist( + y_pix, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, width, height, + tx_size, dc_pred_v, cfl_alpha_mags_q3[m], &sse[CFL_PRED_V][m + 1]); + } + + int64_t dist; + int64_t cost; + int64_t best_cost; + + // Compute least squares parameter of the entire block + // IMPORTANT: We assume that the first code is 0,0 + int ind = 0; + signs[CFL_PRED_U] = CFL_SIGN_POS; + signs[CFL_PRED_V] = CFL_SIGN_POS; + + dist = sse[CFL_PRED_U][0] + sse[CFL_PRED_V][0]; + dist *= 16; + best_cost = RDCOST(x->rdmult, cfl->costs[0], dist); + + for (int c = 1; c < CFL_ALPHABET_SIZE; c++) { + const int idx_u = cfl_alpha_codes[c][CFL_PRED_U]; + const int idx_v = cfl_alpha_codes[c][CFL_PRED_V]; + for (CFL_SIGN_TYPE sign_u = idx_u == 0; sign_u < CFL_SIGNS; sign_u++) { + for (CFL_SIGN_TYPE sign_v = idx_v == 0; sign_v < CFL_SIGNS; sign_v++) { + dist = sse[CFL_PRED_U][idx_u + (sign_u == CFL_SIGN_NEG)] + + sse[CFL_PRED_V][idx_v + (sign_v == CFL_SIGN_NEG)]; + dist *= 16; + cost = RDCOST(x->rdmult, cfl->costs[c], dist); + if (cost < best_cost) { + best_cost = cost; + ind = c; + signs[CFL_PRED_U] = sign_u; + signs[CFL_PRED_V] = sign_v; + } + } + } + 
} + + mbmi->cfl_alpha_idx = ind; + return cfl->costs[ind]; +} +#endif // CONFIG_CFL + static void init_sbuv_mode(MB_MODE_INFO *const mbmi) { - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; #if CONFIG_PALETTE mbmi->palette_mode_info.palette_size[1] = 0; #endif // CONFIG_PALETTE @@ -4870,20 +5767,19 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; assert(!is_inter_block(mbmi)); MB_MODE_INFO best_mbmi = *mbmi; - PREDICTION_MODE mode; int64_t best_rd = INT64_MAX, this_rd; - int this_rate; - RD_STATS tokenonly_rd_stats; #if CONFIG_PVQ od_rollback_buffer buf; od_encode_checkpoint(&x->daala_enc, &buf); #endif // CONFIG_PVQ #if CONFIG_PALETTE PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - uint8_t *best_palette_color_map = NULL; #endif // CONFIG_PALETTE - for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) { + int this_rate; + RD_STATS tokenonly_rd_stats; + UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx]; #if CONFIG_EXT_INTRA const int is_directional_mode = av1_is_directional_mode(mode, mbmi->sb_type); @@ -4893,9 +5789,16 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, continue; mbmi->uv_mode = mode; +#if CONFIG_CFL + int cfl_alpha_rate = 0; + if (mode == UV_DC_PRED) { + const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]); + cfl_alpha_rate = cfl_rd_pick_alpha(x, uv_tx_size); + } +#endif #if CONFIG_EXT_INTRA mbmi->angle_delta[1] = 0; - if (is_directional_mode) { + if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) { const int rate_overhead = cpi->intra_uv_mode_cost[mbmi->mode][mode] + write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, 0); if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd, @@ -4915,8 +5818,13 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, this_rate = tokenonly_rd_stats.rate + cpi->intra_uv_mode_cost[mbmi->mode][mode]; +#if CONFIG_CFL + if (mode == UV_DC_PRED) { + this_rate += cfl_alpha_rate; + } +#endif #if CONFIG_EXT_INTRA - if (is_directional_mode) { + if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) { this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, MAX_ANGLE_DELTA + mbmi->angle_delta[1]); } @@ -4927,7 +5835,7 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, #endif // CONFIG_FILTER_INTRA #if CONFIG_PALETTE if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8 && - mode == DC_PRED) + mode == UV_DC_PRED) this_rate += av1_cost_bit( av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 0); #endif // CONFIG_PALETTE @@ -4935,7 +5843,7 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, #if CONFIG_PVQ od_encode_rollback(&x->daala_enc, &buf); #endif // CONFIG_PVQ - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < best_rd) { best_mbmi = *mbmi; @@ -4949,9 +5857,9 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, #if CONFIG_PALETTE if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8) { - best_palette_color_map = x->palette_buffer->best_palette_color_map; + uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map; rd_pick_palette_intra_sbuv(cpi, x, - cpi->intra_uv_mode_cost[mbmi->mode][DC_PRED], + 
cpi->intra_uv_mode_cost[mbmi->mode][UV_DC_PRED], best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly, distortion, skippable); } @@ -4975,7 +5883,7 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize, TX_SIZE max_tx_size, int *rate_uv, int *rate_uv_tokenonly, int64_t *dist_uv, - int *skip_uv, PREDICTION_MODE *mode_uv) { + int *skip_uv, UV_PREDICTION_MODE *mode_uv) { // Use an estimated rd for uv_intra based on DC_PRED if the // appropriate speed flag is set. (void)ctx; @@ -4990,7 +5898,7 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, *rate_uv_tokenonly = 0; *dist_uv = 0; *skip_uv = 1; - *mode_uv = DC_PRED; + *mode_uv = UV_DC_PRED; return; } BLOCK_SIZE bs = scale_chroma_bsize(bsize, x->e_mbd.plane[1].subsampling_x, @@ -5011,6 +5919,12 @@ static int cost_mv_ref(const AV1_COMP *const cpi, PREDICTION_MODE mode, if (is_inter_compound_mode(mode)) { return cpi ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)]; +#if CONFIG_COMPOUND_SINGLEREF + } else if (is_inter_singleref_comp_mode(mode)) { + return cpi + ->inter_singleref_comp_mode_cost[mode_context] + [INTER_SINGLEREF_COMP_OFFSET(mode)]; +#endif // CONFIG_COMPOUND_SINGLEREF } #endif @@ -5096,8 +6010,13 @@ typedef struct { int segment_yrate; PREDICTION_MODE modes[4]; #if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SINGLEREF + SEG_RDSTAT rdstat[4][INTER_MODES + INTER_SINGLEREF_COMP_MODES + + INTER_COMPOUND_MODES]; +#else // !CONFIG_COMPOUND_SINGLEREF SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES]; -#else +#endif // CONFIG_COMPOUND_SINGLEREF +#else // !CONFIG_EXT_INTER SEG_RDSTAT rdstat[4][INTER_MODES]; #endif // CONFIG_EXT_INTER int mvthresh; @@ -5120,27 +6039,28 @@ static int check_best_zero_mv( int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], int this_mode, const MV_REFERENCE_FRAME ref_frames[2], const BLOCK_SIZE bsize, int block, int mi_row, int mi_col) { - int_mv zeromv[2]; + int_mv zeromv[2] = { {.as_int = 0 } }; +#if CONFIG_GLOBAL_MOTION int comp_pred_mode = ref_frames[1] > INTRA_FRAME; - int cur_frm; +#endif (void)mi_row; (void)mi_col; - for (cur_frm = 0; cur_frm < 1 + comp_pred_mode; cur_frm++) { #if CONFIG_GLOBAL_MOTION - if (this_mode == ZEROMV + if (this_mode == ZEROMV #if CONFIG_EXT_INTER - || this_mode == ZERO_ZEROMV + || this_mode == ZERO_ZEROMV #endif // CONFIG_EXT_INTER - ) + ) { + for (int cur_frm = 0; cur_frm < 1 + comp_pred_mode; cur_frm++) { zeromv[cur_frm].as_int = gm_get_motion_vector(&cpi->common.global_motion[ref_frames[cur_frm]], cpi->common.allow_high_precision_mv, bsize, mi_col, mi_row, block) .as_int; - else -#endif // CONFIG_GLOBAL_MOTION - zeromv[cur_frm].as_int = 0; + } } +#endif // CONFIG_GLOBAL_MOTION + #if !CONFIG_EXT_INTER assert(ref_frames[1] != INTRA_FRAME); // Just sanity check #endif // !CONFIG_EXT_INTER @@ -5201,8 +6121,11 @@ static int check_best_zero_mv( } static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, - int mi_col, + BLOCK_SIZE bsize, int_mv *frame_mv, +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + int_mv *frame_comp_mv, +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + int mi_row, int mi_col, #if CONFIG_EXT_INTER int_mv *ref_mv_sub8x8[2], const uint8_t *mask, int mask_stride, @@ -5213,35 +6136,47 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, const int ph = block_size_high[bsize]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = 
&xd->mi[0]->mbmi; - // This function should only ever be called for compound modes +// This function should only ever be called for compound modes +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) { + assert(is_inter_singleref_comp_mode(mbmi->mode)); + assert(frame_comp_mv); + } + assert(has_second_ref(mbmi) || is_inter_singleref_comp_mode(mbmi->mode)); + const int refs[2] = { mbmi->ref_frame[0], has_second_ref(mbmi) + ? mbmi->ref_frame[1] + : mbmi->ref_frame[0] }; +#else assert(has_second_ref(mbmi)); const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] }; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF int_mv ref_mv[2]; int ite, ref; -#if CONFIG_DUAL_FILTER - InterpFilter interp_filter[4] = { - mbmi->interp_filter[0], mbmi->interp_filter[1], mbmi->interp_filter[2], - mbmi->interp_filter[3], - }; -#else - const InterpFilter interp_filter = mbmi->interp_filter; -#endif // CONFIG_DUAL_FILTER struct scale_factors sf; - struct macroblockd_plane *const pd = &xd->plane[0]; #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION // ic and ir are the 4x4 coordiantes of the sub8x8 at index "block" const int ic = block & 1; const int ir = (block - ic) >> 1; + struct macroblockd_plane *const pd = &xd->plane[0]; const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic; const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir; #if CONFIG_GLOBAL_MOTION int is_global[2]; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { +#else for (ref = 0; ref < 2; ++ref) { +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF WarpedMotionParams *const wm = &xd->global_motion[xd->mi[0]->mbmi.ref_frame[ref]]; is_global[ref] = is_global_mv_block(xd->mi[0], block, wm->wmtype); } +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) is_global[1] = is_global[0]; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF #endif // CONFIG_GLOBAL_MOTION +#else // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + (void)block; #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION // Do joint motion search in compound mode to get more accurate mv. @@ -5264,7 +6199,11 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, (void)ref_mv_sub8x8; #endif // CONFIG_EXT_INTER && CONFIG_CB4X4 +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { +#else for (ref = 0; ref < 2; ++ref) { +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF #if CONFIG_EXT_INTER && !CONFIG_CB4X4 if (bsize < BLOCK_8X8 && ref_mv_sub8x8 != NULL) ref_mv[ref].as_int = ref_mv_sub8x8[ref]->as_int; @@ -5284,6 +6223,24 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, } } +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) { + assert(is_inter_singleref_comp_mode(mbmi->mode)); + // NOTE: For single ref comp mode, set up the 2nd set of ref_mv/pre_planes + // all from the 1st reference frame, i.e. refs[0]. + ref_mv[1] = x->mbmi_ext->ref_mvs[refs[0]][0]; + if (scaled_ref_frame[0]) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
+ for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[1][i] = xd->plane[i].pre[1]; + av1_setup_pre_planes(xd, 1, scaled_ref_frame[0], mi_row, mi_col, NULL); + } + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // Since we have scaled the reference frames to match the size of the current // frame we must use a unit scaling factor during mode selection. #if CONFIG_HIGHBITDEPTH @@ -5294,9 +6251,16 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, cm->height); #endif // CONFIG_HIGHBITDEPTH - // Allow joint search multiple times iteratively for each reference frame - // and break out of the search loop if it couldn't find a better mv. +// Allow joint search multiple times iteratively for each reference frame +// and break out of the search loop if it couldn't find a better mv. +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + const int num_ites = + (has_second_ref(mbmi) || mbmi->mode == SR_NEW_NEWMV) ? 4 : 1; + const int start_ite = has_second_ref(mbmi) ? 0 : 1; + for (ite = start_ite; ite < (start_ite + num_ites); ite++) { +#else for (ite = 0; ite < 4; ite++) { +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF struct buf_2d ref_yv12[2]; int bestsme = INT_MAX; int sadpb = x->sadperbit16; @@ -5308,7 +6272,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, // odd iterations search in the second. The predictor // found for the 'other' reference frame is factored in. const int plane = 0; - ConvolveParams conv_params = get_conv_params(0, plane); + ConvolveParams conv_params = get_conv_params(!id, 0, plane); #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION WarpTypesAllowed warp_types; #if CONFIG_GLOBAL_MOTION @@ -5323,21 +6287,24 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, ref_yv12[0] = xd->plane[plane].pre[0]; ref_yv12[1] = xd->plane[plane].pre[1]; -#if CONFIG_DUAL_FILTER - // reload the filter types - interp_filter[0] = - (id == 0) ? mbmi->interp_filter[2] : mbmi->interp_filter[0]; - interp_filter[1] = - (id == 0) ? mbmi->interp_filter[3] : mbmi->interp_filter[1]; -#endif // CONFIG_DUAL_FILTER - // Get the prediction block from the 'other' reference frame. +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + MV *const the_other_mv = (has_second_ref(mbmi) || id) + ? 
&frame_mv[refs[!id]].as_mv + : &frame_comp_mv[refs[0]].as_mv; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); av1_highbd_build_inter_predictor( ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, - &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, interp_filter, +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + the_other_mv, +#else // !(CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF) + &frame_mv[refs[!id]].as_mv, +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + &sf, pw, ph, 0, mbmi->interp_filter, #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION &warp_types, p_col, p_row, #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION @@ -5347,7 +6314,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, #endif // CONFIG_HIGHBITDEPTH av1_build_inter_predictor( ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, - &frame_mv[refs[!id]].as_mv, &sf, pw, ph, &conv_params, interp_filter, +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + the_other_mv, +#else // !(CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF) + &frame_mv[refs[!id]].as_mv, +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + &sf, pw, ph, &conv_params, mbmi->interp_filter, #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION &warp_types, p_col, p_row, plane, !id, #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION @@ -5360,13 +6332,24 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, if (id) xd->plane[plane].pre[0] = ref_yv12[id]; av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv); - // Use the mv result from the single mode as mv predictor. - *best_mv = frame_mv[refs[id]].as_mv; +// Use the mv result from the single mode as mv predictor. +// Use the mv result from the single mode as mv predictor. +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi) && id) + *best_mv = frame_comp_mv[refs[0]].as_mv; + else +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + *best_mv = frame_mv[refs[id]].as_mv; best_mv->col >>= 3; best_mv->row >>= 3; - av1_set_mvcost(x, refs[id], id, mbmi->ref_mv_idx); +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) + av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); + else +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + av1_set_mvcost(x, refs[id], id, mbmi->ref_mv_idx); // Small-range full-pixel motion search. bestsme = @@ -5392,60 +6375,33 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; - if (cpi->sf.use_upsampled_references) { - // Use up-sampled reference frames. 
- struct buf_2d backup_pred = pd->pre[0]; - const YV12_BUFFER_CONFIG *upsampled_ref = - get_upsampled_ref(cpi, refs[id]); - - // Set pred for Y plane - setup_pred_plane(&pd->pre[0], bsize, upsampled_ref->y_buffer, - upsampled_ref->y_crop_width, - upsampled_ref->y_crop_height, upsampled_ref->y_stride, - (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x, - pd->subsampling_y); - -// If bsize < BLOCK_8X8, adjust pred pointer for this block -#if !CONFIG_CB4X4 - if (bsize < BLOCK_8X8) - pd->pre[0].buf = - &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, block, - pd->pre[0].stride)) - << 3]; -#endif // !CONFIG_CB4X4 - - bestsme = cpi->find_fractional_mv_step( - x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[bsize], 0, - cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, -#if CONFIG_EXT_INTER - mask, mask_stride, id, -#endif - pw, ph, 1); - - // Restore the reference frames. - pd->pre[0] = backup_pred; - } else { - (void)block; - bestsme = cpi->find_fractional_mv_step( - x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[bsize], 0, - cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, + bestsme = cpi->find_fractional_mv_step( + x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], 0, + cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, + &dis, &sse, second_pred, #if CONFIG_EXT_INTER - mask, mask_stride, id, + mask, mask_stride, id, #endif - pw, ph, 0); - } + pw, ph, cpi->sf.use_upsampled_references); } // Restore the pointer to the first (possibly scaled) prediction buffer. if (id) xd->plane[plane].pre[0] = ref_yv12[0]; if (bestsme < last_besterr[id]) { - frame_mv[refs[id]].as_mv = *best_mv; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // NOTE: For single ref comp mode, frame_mv stores the first mv and + // frame_comp_mv stores the second mv. + if (!has_second_ref(mbmi) && id) + frame_comp_mv[refs[0]].as_mv = *best_mv; + else +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + frame_mv[refs[id]].as_mv = *best_mv; last_besterr[id] = bestsme; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) last_besterr[!id] = last_besterr[id]; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF } else { break; } @@ -5453,40 +6409,92 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, *rate_mv = 0; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { +#else for (ref = 0; ref < 2; ++ref) { +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF if (scaled_ref_frame[ref]) { // Restore the prediction frame pointers to their unscaled versions. int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[ref] = backup_yv12[ref][i]; } - av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx); + +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) + av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); + else +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx); + +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) { + // NOTE: For single ref comp mode, i.e. !has_second_ref(mbmi) is true, the + // first mv is stored in frame_mv[] and the second mv is stored in + // frame_comp_mv[]. 
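The alternating search above refines one side's MV per pass against the inter predictor built from the other side, and stops as soon as a pass fails to improve the error. With CONFIG_COMPOUND_SINGLEREF both MVs can belong to the same reference frame, and the NOTE above fixes the convention: the first MV lives in frame_mv[] and the second in frame_comp_mv[]. A minimal sketch of that addressing, using a hypothetical helper that is not part of the patch (int_mv and MV are the usual aom types):

/* Hypothetical helper, for illustration only: which MV slot is refined on
 * pass `id` of the alternating search, and which slot supplies the fixed
 * "other" prediction. Mirrors the the_other_mv / best_mv selection above. */
static void select_joint_mvs(int has_second_ref, int id,
                             MV_REFERENCE_FRAME ref0, MV_REFERENCE_FRAME ref1,
                             int_mv *frame_mv, int_mv *frame_comp_mv,
                             MV **cur_mv, MV **other_mv) {
  if (has_second_ref) {
    /* Ordinary compound: refine refs[id] against the predictor from refs[!id]. */
    *cur_mv = &frame_mv[id ? ref1 : ref0].as_mv;
    *other_mv = &frame_mv[id ? ref0 : ref1].as_mv;
  } else {
    /* Single-ref compound: both MVs point at ref0; the first is kept in
     * frame_mv[] and the second in frame_comp_mv[]. */
    *cur_mv = id ? &frame_comp_mv[ref0].as_mv : &frame_mv[ref0].as_mv;
    *other_mv = id ? &frame_mv[ref0].as_mv : &frame_comp_mv[ref0].as_mv;
  }
}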
+ if (compound_ref0_mode(mbmi->mode) == NEWMV) // SR_NEW_NEWMV + *rate_mv += av1_mv_bit_cost(&frame_mv[refs[0]].as_mv, + &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + assert(compound_ref1_mode(mbmi->mode) == NEWMV); + *rate_mv += av1_mv_bit_cost(&frame_comp_mv[refs[0]].as_mv, + &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + } else { +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF #if CONFIG_EXT_INTER && !CONFIG_CB4X4 - if (bsize >= BLOCK_8X8) + if (bsize >= BLOCK_8X8) #endif // CONFIG_EXT_INTER && !CONFIG_CB4X4 - *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv, - &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv, + &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); #if CONFIG_EXT_INTER && !CONFIG_CB4X4 - else - *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv, - &ref_mv_sub8x8[ref]->as_mv, x->nmvjointcost, - x->mvcost, MV_COST_WEIGHT); + else + *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv, + &ref_mv_sub8x8[ref]->as_mv, x->nmvjointcost, + x->mvcost, MV_COST_WEIGHT); #endif // CONFIG_EXT_INTER && !CONFIG_CB4X4 +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF } + +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) { + if (scaled_ref_frame[0]) { + // Restore the prediction frame pointers to their unscaled versions. + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[1] = backup_yv12[1][i]; + } + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF } -static void estimate_ref_frame_costs(const AV1_COMMON *cm, - const MACROBLOCKD *xd, int segment_id, - unsigned int *ref_costs_single, - unsigned int *ref_costs_comp, - aom_prob *comp_mode_p) { +static void estimate_ref_frame_costs( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, + unsigned int *ref_costs_single, +#if CONFIG_EXT_COMP_REFS + unsigned int (*ref_costs_comp)[TOTAL_REFS_PER_FRAME], +#else + unsigned int *ref_costs_comp, +#endif // CONFIG_EXT_COMP_REFS + aom_prob *comp_mode_p) { int seg_ref_active = segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); if (seg_ref_active) { memset(ref_costs_single, 0, TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_single)); +#if CONFIG_EXT_COMP_REFS + int ref_frame; + for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame) + memset(ref_costs_comp[ref_frame], 0, + TOTAL_REFS_PER_FRAME * sizeof((*ref_costs_comp)[0])); +#else memset(ref_costs_comp, 0, TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_comp)); +#endif // CONFIG_EXT_COMP_REFS + *comp_mode_p = 128; } else { aom_prob intra_inter_p = av1_get_intra_inter_prob(cm, xd); @@ -5541,7 +6549,7 @@ static void estimate_ref_frame_costs(const AV1_COMMON *cm, ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p5, 0); ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p5, 1); -#else +#else // !CONFIG_EXT_REFS ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0); ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 1); ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1); @@ -5570,6 +6578,63 @@ static void estimate_ref_frame_costs(const AV1_COMMON *cm, unsigned int base_cost = av1_cost_bit(intra_inter_p, 1); +#if CONFIG_EXT_COMP_REFS + aom_prob comp_ref_type_p = av1_get_comp_reference_type_prob(cm, xd); + unsigned int 
ref_bicomp_costs[TOTAL_REFS_PER_FRAME] = { 0 }; + + ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] = + ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] = +#if USE_UNI_COMP_REFS + base_cost + av1_cost_bit(comp_ref_type_p, 1); +#else + base_cost; +#endif // USE_UNI_COMP_REFS + ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF_FRAME] = 0; + + ref_bicomp_costs[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0); + ref_bicomp_costs[LAST2_FRAME] += av1_cost_bit(ref_comp_p, 0); + ref_bicomp_costs[LAST3_FRAME] += av1_cost_bit(ref_comp_p, 1); + ref_bicomp_costs[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1); + + ref_bicomp_costs[LAST_FRAME] += av1_cost_bit(ref_comp_p1, 1); + ref_bicomp_costs[LAST2_FRAME] += av1_cost_bit(ref_comp_p1, 0); + + ref_bicomp_costs[LAST3_FRAME] += av1_cost_bit(ref_comp_p2, 0); + ref_bicomp_costs[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1); + + ref_bicomp_costs[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0); + ref_bicomp_costs[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1); + + int ref0; + for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + ref_costs_comp[ref0][BWDREF_FRAME] = + ref_bicomp_costs[ref0] + ref_bicomp_costs[BWDREF_FRAME]; + ref_costs_comp[ref0][ALTREF_FRAME] = + ref_bicomp_costs[ref0] + ref_bicomp_costs[ALTREF_FRAME]; + } + + aom_prob uni_comp_ref_p = av1_get_pred_prob_uni_comp_ref_p(cm, xd); + aom_prob uni_comp_ref_p1 = av1_get_pred_prob_uni_comp_ref_p1(cm, xd); + aom_prob uni_comp_ref_p2 = av1_get_pred_prob_uni_comp_ref_p2(cm, xd); + + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = + base_cost + av1_cost_bit(comp_ref_type_p, 0) + + av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 0); + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = + base_cost + av1_cost_bit(comp_ref_type_p, 0) + + av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 1) + + av1_cost_bit(uni_comp_ref_p2, 0); + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = + base_cost + av1_cost_bit(comp_ref_type_p, 0) + + av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 1) + + av1_cost_bit(uni_comp_ref_p2, 1); + + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = + base_cost + av1_cost_bit(comp_ref_type_p, 0) + + av1_cost_bit(uni_comp_ref_p, 1); + +#else // !CONFIG_EXT_COMP_REFS + ref_costs_comp[LAST_FRAME] = #if CONFIG_EXT_REFS ref_costs_comp[LAST2_FRAME] = ref_costs_comp[LAST3_FRAME] = @@ -5596,11 +6661,23 @@ static void estimate_ref_frame_costs(const AV1_COMMON *cm, // more bit. 
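Under CONFIG_EXT_COMP_REFS the compound-reference cost becomes a two-dimensional table indexed by both references, and each entry is a sum of av1_cost_bit() terms along the signalling tree built above (the comp_ref_type bit when USE_UNI_COMP_REFS is on, then the forward-ref and backward-ref bits, or the uni-directional bits). As a worked example, here is a sketch of how the (LAST_FRAME, ALTREF_FRAME) entry is assembled; the helper is illustrative only and takes the context probabilities as parameters:

/* Illustrative only: ref_costs_comp[LAST_FRAME][ALTREF_FRAME] as built above
 * is base_cost plus these bit costs. av1_cost_bit(p, b) is the fixed-point
 * coding cost of bit b under probability p. */
static unsigned int cost_last_altref_pair_sketch(unsigned int base_cost,
                                                 aom_prob comp_ref_type_p,
                                                 aom_prob ref_comp_p,
                                                 aom_prob ref_comp_p1,
                                                 aom_prob bwdref_comp_p) {
  unsigned int c = base_cost;
  c += av1_cost_bit(comp_ref_type_p, 1); /* bi-directional pair (USE_UNI_COMP_REFS) */
  c += av1_cost_bit(ref_comp_p, 0);      /* forward ref is LAST or LAST2 */
  c += av1_cost_bit(ref_comp_p1, 1);     /* ... specifically LAST */
  c += av1_cost_bit(bwdref_comp_p, 1);   /* backward ref is ALTREF */
  return c;
}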
ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0); ref_costs_comp[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1); -#else +#else // !CONFIG_EXT_REFS ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0); ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1); #endif // CONFIG_EXT_REFS +#endif // CONFIG_EXT_COMP_REFS } else { +#if CONFIG_EXT_COMP_REFS + int ref0; + for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + ref_costs_comp[ref0][BWDREF_FRAME] = 512; + ref_costs_comp[ref0][ALTREF_FRAME] = 512; + } + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512; + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512; + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512; + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512; +#else // !CONFIG_EXT_COMP_REFS ref_costs_comp[LAST_FRAME] = 512; #if CONFIG_EXT_REFS ref_costs_comp[LAST2_FRAME] = 512; @@ -5609,6 +6686,7 @@ static void estimate_ref_frame_costs(const AV1_COMMON *cm, ref_costs_comp[ALTREF_FRAME] = 512; #endif // CONFIG_EXT_REFS ref_costs_comp[GOLDEN_FRAME] = 512; +#endif // CONFIG_EXT_COMP_REFS } } } @@ -5693,8 +6771,13 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, int sadpb = x->sadperbit16; MV mvp_full; #if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SINGLEREF + int ref = + has_second_ref(mbmi) ? mbmi->ref_frame[ref_idx] : mbmi->ref_frame[0]; +#else // !CONFIG_COMPOUND_SINGLEREF int ref = mbmi->ref_frame[ref_idx]; -#else +#endif // CONFIG_COMPOUND_SINGLEREF +#else // !CONFIG_EXT_INTER int ref = mbmi->ref_frame[0]; int ref_idx = 0; #endif // CONFIG_EXT_INTER @@ -5802,7 +6885,7 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv, &(x->best_mv.as_mv), 0); break; - default: assert("Invalid motion mode!\n"); + default: assert(0 && "Invalid motion mode!\n"); } #endif // CONFIG_MOTION_VAR @@ -5820,17 +6903,6 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, x->second_best_mv.as_int != x->best_mv.as_int; const int pw = block_size_wide[bsize]; const int ph = block_size_high[bsize]; - // Use up-sampled reference frames. - struct macroblockd_plane *const pd = &xd->plane[0]; - struct buf_2d backup_pred = pd->pre[ref_idx]; - const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref); - - // Set pred for Y plane - setup_pred_plane( - &pd->pre[ref_idx], bsize, upsampled_ref->y_buffer, - upsampled_ref->y_crop_width, upsampled_ref->y_crop_height, - upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3), NULL, - pd->subsampling_x, pd->subsampling_y); best_mv_var = cpi->find_fractional_mv_step( x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, @@ -5873,9 +6945,6 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, x->best_mv.as_mv = best_mv; } } - - // Restore the reference frames. 
- pd->pre[ref_idx] = backup_pred; } else { cpi->find_fractional_mv_step( x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, @@ -5891,13 +6960,12 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, break; case OBMC_CAUSAL: av1_find_best_obmc_sub_pixel_tree_up( - cpi, x, mi_row, mi_col, &x->best_mv.as_mv, &ref_mv, - cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], - cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], 0, - cpi->sf.use_upsampled_references); + x, &x->best_mv.as_mv, &ref_mv, cm->allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &dis, + &x->pred_sse[ref], 0, cpi->sf.use_upsampled_references); break; - default: assert("Invalid motion mode!\n"); + default: assert(0 && "Invalid motion mode!\n"); } #endif // CONFIG_MOTION_VAR } @@ -5936,15 +7004,12 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, const int ph = block_size_high[bsize]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; +#if CONFIG_COMPOUND_SINGLEREF + const int other_ref = + has_second_ref(mbmi) ? mbmi->ref_frame[!ref_idx] : mbmi->ref_frame[0]; +#else // !CONFIG_COMPOUND_SINGLEREF const int other_ref = mbmi->ref_frame[!ref_idx]; -#if CONFIG_DUAL_FILTER - InterpFilter interp_filter[2] = { - (ref_idx == 0) ? mbmi->interp_filter[2] : mbmi->interp_filter[0], - (ref_idx == 0) ? mbmi->interp_filter[3] : mbmi->interp_filter[1] - }; -#else - const InterpFilter interp_filter = mbmi->interp_filter; -#endif // CONFIG_DUAL_FILTER +#endif // CONFIG_COMPOUND_SINGLEREF struct scale_factors sf; #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION struct macroblockd_plane *const pd = &xd->plane[0]; @@ -5961,8 +7026,12 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, (void)block; #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - // This function should only ever be called for compound modes +// This function should only ever be called for compound modes +#if CONFIG_COMPOUND_SINGLEREF + assert(has_second_ref(mbmi) || is_inter_singleref_comp_mode(mbmi->mode)); +#else // !CONFIG_COMPOUND_SINGLEREF assert(has_second_ref(mbmi)); +#endif // CONFIG_COMPOUND_SINGLEREF struct buf_2d backup_yv12[MAX_MB_PLANE]; const YV12_BUFFER_CONFIG *const scaled_ref_frame = @@ -5991,7 +7060,7 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, struct buf_2d ref_yv12; const int plane = 0; - ConvolveParams conv_params = get_conv_params(0, plane); + ConvolveParams conv_params = get_conv_params(!ref_idx, 0, plane); #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION WarpTypesAllowed warp_types; #if CONFIG_GLOBAL_MOTION @@ -6010,7 +7079,7 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { av1_highbd_build_inter_predictor( ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, - 0, interp_filter, + 0, mbmi->interp_filter, #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION &warp_types, p_col, p_row, #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION @@ -6019,7 +7088,7 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, #endif // CONFIG_HIGHBITDEPTH av1_build_inter_predictor( ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, - &conv_params, interp_filter, + &conv_params, mbmi->interp_filter, #if CONFIG_GLOBAL_MOTION || 
CONFIG_WARPED_MOTION &warp_types, p_col, p_row, plane, !ref_idx, #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION @@ -6038,15 +7107,22 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, // Search for the best mv for one component of a compound, // given that the other component is fixed. -static void compound_single_motion_search( - const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *this_mv, - int mi_row, int mi_col, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int *rate_mv, const int block, int ref_idx) { +static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *this_mv, + int mi_row, int mi_col, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + int *rate_mv, int ref_idx) { const int pw = block_size_wide[bsize]; const int ph = block_size_high[bsize]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; +#if CONFIG_COMPOUND_SINGLEREF + const int ref = + has_second_ref(mbmi) ? mbmi->ref_frame[ref_idx] : mbmi->ref_frame[0]; +#else const int ref = mbmi->ref_frame[ref_idx]; +#endif // CONFIG_COMPOUND_SINGLEREF int_mv ref_mv = x->mbmi_ext->ref_mvs[ref][0]; struct macroblockd_plane *const pd = &xd->plane[0]; @@ -6054,9 +7130,16 @@ static void compound_single_motion_search( const YV12_BUFFER_CONFIG *const scaled_ref_frame = av1_get_scaled_ref_frame(cpi, ref); - // Check that this is either an interinter or an interintra block +// Check that this is either an interinter or an interintra block +#if CONFIG_COMPOUND_SINGLEREF assert(has_second_ref(mbmi) || + // or a single ref comp pred mode + is_inter_singleref_comp_mode(mbmi->mode) || (ref_idx == 0 && mbmi->ref_frame[1] == INTRA_FRAME)); +#else + assert(has_second_ref(mbmi) || + (ref_idx == 0 && mbmi->ref_frame[1] == INTRA_FRAME)); +#endif // CONFIG_COMPOUND_SINGLEREF if (scaled_ref_frame) { int i; @@ -6091,7 +7174,12 @@ static void compound_single_motion_search( best_mv->col >>= 3; best_mv->row >>= 3; - av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); +#if CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) + av1_set_mvcost(x, ref, 0, mbmi->ref_mv_idx); + else +#endif // CONFIG_COMPOUND_SINGLEREF + av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); // Small-range full-pixel motion search. bestsme = av1_refining_search_8p_c(x, sadpb, search_range, @@ -6112,43 +7200,11 @@ static void compound_single_motion_search( if (bestsme < INT_MAX) { int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; - if (cpi->sf.use_upsampled_references) { - // Use up-sampled reference frames. 
- struct buf_2d backup_pred = pd->pre[0]; - const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref); - - // Set pred for Y plane - setup_pred_plane(&pd->pre[0], bsize, upsampled_ref->y_buffer, - upsampled_ref->y_crop_width, - upsampled_ref->y_crop_height, upsampled_ref->y_stride, - (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x, - pd->subsampling_y); - -// If bsize < BLOCK_8X8, adjust pred pointer for this block -#if !CONFIG_CB4X4 - if (bsize < BLOCK_8X8) - pd->pre[0].buf = - &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, block, - pd->pre[0].stride)) - << 3]; -#endif // !CONFIG_CB4X4 - - bestsme = cpi->find_fractional_mv_step( - x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL, - x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, - mask_stride, ref_idx, pw, ph, 1); - - // Restore the reference frames. - pd->pre[0] = backup_pred; - } else { - (void)block; - bestsme = cpi->find_fractional_mv_step( - x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL, - x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, - mask_stride, ref_idx, pw, ph, 0); - } + bestsme = cpi->find_fractional_mv_step( + x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL, + x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, mask_stride, + ref_idx, pw, ph, cpi->sf.use_upsampled_references); } // Restore the pointer to the first (possibly scaled) prediction buffer. @@ -6165,7 +7221,12 @@ static void compound_single_motion_search( xd->plane[i].pre[ref_idx] = backup_yv12[i]; } - av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); +#if CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) + av1_set_mvcost(x, ref, 0, mbmi->ref_mv_idx); + else +#endif // CONFIG_COMPOUND_SINGLEREF + av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } @@ -6174,13 +7235,23 @@ static void compound_single_motion_search( // where the second prediction is also an inter mode. static void compound_single_motion_search_interinter( const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv, +#if CONFIG_COMPOUND_SINGLEREF + int_mv *frame_comp_mv, +#endif // CONFIG_COMPOUND_SINGLEREF int mi_row, int mi_col, const uint8_t *mask, int mask_stride, int *rate_mv, const int block, int ref_idx) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - // This function should only ever be called for compound modes +// This function should only ever be called for compound modes +#if CONFIG_COMPOUND_SINGLEREF + int is_singleref_comp_mode = + !has_second_ref(mbmi) && is_inter_singleref_comp_mode(mbmi->mode); + assert(has_second_ref(mbmi) || is_singleref_comp_mode); + if (is_singleref_comp_mode && ref_idx) assert(frame_comp_mv); +#else // !CONFIG_COMPOUND_SINGLEREF assert(has_second_ref(mbmi)); +#endif // CONFIG_COMPOUND_SINGLEREF // Prediction buffer from second frame. #if CONFIG_HIGHBITDEPTH @@ -6194,14 +7265,26 @@ static void compound_single_motion_search_interinter( DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]); #endif // CONFIG_HIGHBITDEPTH +#if CONFIG_COMPOUND_SINGLEREF + MV *this_mv = has_second_ref(mbmi) + ? &frame_mv[mbmi->ref_frame[ref_idx]].as_mv + : (ref_idx ? 
&frame_comp_mv[mbmi->ref_frame[0]].as_mv + : &frame_mv[mbmi->ref_frame[0]].as_mv); + const MV *other_mv = + has_second_ref(mbmi) + ? &frame_mv[mbmi->ref_frame[!ref_idx]].as_mv + : (ref_idx ? &frame_mv[mbmi->ref_frame[0]].as_mv + : &frame_comp_mv[mbmi->ref_frame[0]].as_mv); +#else // !CONFIG_COMPOUND_SINGLEREF MV *this_mv = &frame_mv[mbmi->ref_frame[ref_idx]].as_mv; const MV *other_mv = &frame_mv[mbmi->ref_frame[!ref_idx]].as_mv; +#endif // CONFIG_COMPOUND_SINGLEREF build_second_inter_pred(cpi, x, bsize, other_mv, mi_row, mi_col, block, ref_idx, second_pred); compound_single_motion_search(cpi, x, bsize, this_mv, mi_row, mi_col, - second_pred, mask, mask_stride, rate_mv, block, + second_pred, mask, mask_stride, rate_mv, ref_idx); } @@ -6220,21 +7303,40 @@ static void do_masked_motion_search_indexed( mask = av1_get_compound_type_mask(comp_data, sb_type); int_mv frame_mv[TOTAL_REFS_PER_FRAME]; +#if CONFIG_COMPOUND_SINGLEREF + int_mv frame_comp_mv[TOTAL_REFS_PER_FRAME]; +#endif // CONFIG_COMPOUND_SINGLEREF MV_REFERENCE_FRAME rf[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] }; assert(bsize >= BLOCK_8X8 || CONFIG_CB4X4); frame_mv[rf[0]].as_int = cur_mv[0].as_int; - frame_mv[rf[1]].as_int = cur_mv[1].as_int; +#if CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) + frame_comp_mv[rf[0]].as_int = cur_mv[1].as_int; + else +#endif // CONFIG_COMPOUND_SINGLEREF + frame_mv[rf[1]].as_int = cur_mv[1].as_int; if (which == 0 || which == 1) { - compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, mi_row, - mi_col, mask, mask_stride, rate_mv, - 0, which); + compound_single_motion_search_interinter( + cpi, x, bsize, frame_mv, +#if CONFIG_COMPOUND_SINGLEREF + has_second_ref(mbmi) ? NULL : frame_comp_mv, +#endif // CONFIG_COMPOUND_SINGLEREF + mi_row, mi_col, mask, mask_stride, rate_mv, 0, which); } else if (which == 2) { - joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, mask, - mask_stride, rate_mv, 0); + joint_motion_search(cpi, x, bsize, frame_mv, +#if CONFIG_COMPOUND_SINGLEREF + has_second_ref(mbmi) ? 
NULL : frame_comp_mv, +#endif // CONFIG_COMPOUND_SINGLEREF + mi_row, mi_col, NULL, mask, mask_stride, rate_mv, 0); } tmp_mv[0].as_int = frame_mv[rf[0]].as_int; - tmp_mv[1].as_int = frame_mv[rf[1]].as_int; +#if CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) + tmp_mv[1].as_int = frame_comp_mv[rf[0]].as_int; + else // comp ref +#endif // CONFIG_COMPOUND_SINGLEREF + tmp_mv[1].as_int = frame_mv[rf[1]].as_int; } #endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE #endif // CONFIG_EXT_INTER @@ -6483,7 +7585,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, sse = ROUND_POWER_OF_TWO(sse, bd_round); model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); - rd = RDCOST(x->rdmult, x->rddiv, rate, dist); + rd = RDCOST(x->rdmult, rate, dist); if (rd < best_rd) { *best_wedge_index = wedge_index; @@ -6544,7 +7646,7 @@ static int64_t pick_wedge_fixed_sign( sse = ROUND_POWER_OF_TWO(sse, bd_round); model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); - rd = RDCOST(x->rdmult, x->rddiv, rate, dist); + rd = RDCOST(x->rdmult, rate, dist); if (rd < best_rd) { *best_wedge_index = wedge_index; @@ -6646,7 +7748,7 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi, sse = ROUND_POWER_OF_TWO(sse, bd_round); model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); - rd0 = RDCOST(x->rdmult, x->rddiv, rate, dist); + rd0 = RDCOST(x->rdmult, rate, dist); if (rd0 < best_rd) { best_mask_type = cur_mask_type; @@ -6729,7 +7831,17 @@ static int interinter_compound_motion_search( #endif // CONFIG_COMPOUND_SEGMENT mbmi->interinter_compound_type }; - if (this_mode == NEW_NEWMV) { +#if CONFIG_COMPOUND_SINGLEREF + // NOTE: Mode is needed to identify the compound mode prediction, regardless + // of comp refs or single ref. + mbmi->mode = this_mode; +#endif // CONFIG_COMPOUND_SINGLEREF + + if (this_mode == NEW_NEWMV +#if CONFIG_COMPOUND_SINGLEREF + || this_mode == SR_NEW_NEWMV +#endif // CONFIG_COMPOUND_SINGLEREF + ) { do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize, mi_row, mi_col, tmp_mv, &tmp_rate_mv, 2); mbmi->mv[0].as_int = tmp_mv[0].as_int; @@ -6738,7 +7850,12 @@ static int interinter_compound_motion_search( do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize, mi_row, mi_col, tmp_mv, &tmp_rate_mv, 0); mbmi->mv[0].as_int = tmp_mv[0].as_int; - } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { + } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV +#if CONFIG_COMPOUND_SINGLEREF + // || this_mode == SR_NEAREST_NEWMV + || this_mode == SR_NEAR_NEWMV || this_mode == SR_ZERO_NEWMV +#endif // CONFIG_COMPOUND_SINGLEREF + ) { do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize, mi_row, mi_col, tmp_mv, &tmp_rate_mv, 1); mbmi->mv[1].as_int = tmp_mv[1].as_int; @@ -6763,7 +7880,7 @@ static int64_t build_and_cost_compound_type( const COMPOUND_TYPE compound_type = mbmi->interinter_compound_type; best_rd_cur = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1); - best_rd_cur += RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv, 0); + best_rd_cur += RDCOST(x->rdmult, rs2 + rate_mv, 0); if (have_newmv_in_inter_mode(this_mode) && use_masked_motion_search(compound_type)) { @@ -6772,7 +7889,7 @@ static int64_t build_and_cost_compound_type( av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb); - rd = RDCOST(x->rdmult, x->rddiv, rs2 + *out_rate_mv + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, rs2 + 
*out_rate_mv + rate_sum, dist_sum); if (rd >= best_rd_cur) { mbmi->mv[0].as_int = cur_mv[0].as_int; mbmi->mv[1].as_int = cur_mv[1].as_int; @@ -6788,7 +7905,7 @@ static int64_t build_and_cost_compound_type( rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, x->rddiv, rs2 + *out_rate_mv + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, rs2 + *out_rate_mv + rate_sum, dist_sum); best_rd_cur = rd; } else { @@ -6801,7 +7918,7 @@ static int64_t build_and_cost_compound_type( rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum); best_rd_cur = rd; } return best_rd_cur; @@ -6832,6 +7949,9 @@ typedef struct { static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, int_mv (*const mode_mv)[TOTAL_REFS_PER_FRAME], +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + int_mv (*const mode_comp_mv)[TOTAL_REFS_PER_FRAME], +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF const int mi_row, const int mi_col, int *const rate_mv, int_mv *const single_newmv, HandleInterModeArgs *const args) { @@ -6844,6 +7964,9 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME); #endif // CONFIG_EXT_INTER int_mv *const frame_mv = mode_mv[this_mode]; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + int_mv *const frame_comp_mv = mode_comp_mv[this_mode]; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] < 0 ? 
0 : mbmi->ref_frame[1] }; int i; @@ -6861,8 +7984,11 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { - joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, NULL, - 0, rate_mv, 0); + joint_motion_search(cpi, x, bsize, frame_mv, +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + NULL, // int_mv *frame_comp_mv +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + mi_row, mi_col, NULL, NULL, 0, rate_mv, 0); } else { *rate_mv = 0; for (i = 0; i < 2; ++i) { @@ -6877,8 +8003,12 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { frame_mv[refs[0]].as_int = mode_mv[compound_ref0_mode(this_mode)][refs[0]].as_int; - compound_single_motion_search_interinter( - cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 1); + compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, +#if CONFIG_COMPOUND_SINGLEREF + NULL, +#endif // CONFIG_COMPOUND_SINGLEREF + mi_row, mi_col, NULL, 0, + rate_mv, 0, 1); } else { av1_set_mvcost(x, refs[1], 1, mbmi->ref_mv_idx); *rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv, @@ -6891,8 +8021,12 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { frame_mv[refs[1]].as_int = mode_mv[compound_ref1_mode(this_mode)][refs[1]].as_int; - compound_single_motion_search_interinter( - cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 0); + compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, +#if CONFIG_COMPOUND_SINGLEREF + NULL, +#endif // CONFIG_COMPOUND_SINGLEREF + mi_row, mi_col, NULL, 0, + rate_mv, 0, 0); } else { av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); *rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv, @@ -6900,7 +8034,7 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } } -#else +#else // !CONFIG_EXT_INTER // Initialize mv using single prediction mode result. frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; @@ -6917,6 +8051,41 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, } } #endif // CONFIG_EXT_INTER +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + } else if (is_inter_singleref_comp_mode(this_mode)) { + // Single ref comp mode + const int mode0 = compound_ref0_mode(this_mode); + + single_newmv[refs[0]].as_int = args->single_newmv[refs[0]].as_int; + frame_mv[refs[0]].as_int = (mode0 == NEWMV) + ? 
single_newmv[refs[0]].as_int + : mode_mv[mode0][refs[0]].as_int; + assert(compound_ref1_mode(this_mode) == NEWMV); + frame_comp_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; + + if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { + if (this_mode == SR_NEW_NEWMV) { + joint_motion_search(cpi, x, bsize, frame_mv, frame_comp_mv, mi_row, + mi_col, NULL, NULL, 0, rate_mv, 0); + } else { + assert( // this_mode == SR_NEAREST_NEWMV || + this_mode == SR_NEAR_NEWMV || this_mode == SR_ZERO_NEWMV); + compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, + frame_comp_mv, mi_row, mi_col, + NULL, 0, rate_mv, 0, 1); + } + } else { + *rate_mv = 0; + av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); + if (mode0 == NEWMV) + *rate_mv += av1_mv_bit_cost(&frame_mv[refs[0]].as_mv, + &mbmi_ext->ref_mvs[refs[0]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + *rate_mv += av1_mv_bit_cost(&frame_comp_mv[refs[0]].as_mv, + &mbmi_ext->ref_mvs[refs[0]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF } else { #if CONFIG_EXT_INTER if (is_comp_interintra_pred) { @@ -6984,7 +8153,7 @@ int64_t interpolation_filter_search( av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist, skip_txfm_sb, skip_sse_sb); - *rd = RDCOST(x->rdmult, x->rddiv, *switchable_rate + tmp_rate, tmp_dist); + *rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist); if (assign_filter == SWITCHABLE) { // do interp_filter search @@ -7020,7 +8189,7 @@ int64_t interpolation_filter_search( av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist, &tmp_skip_sb, &tmp_skip_sse); - tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rs + tmp_rate, tmp_dist); + tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist); if (tmp_rd < *rd) { *rd = tmp_rd; @@ -7072,12 +8241,10 @@ static int64_t motion_mode_rd( int mi_col, HandleInterModeArgs *const args, const int64_t ref_best_rd, const int *refs, int rate_mv, #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + // only used when WARPED_MOTION is on? 
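Throughout these hunks the RD cost calls change from RDCOST(x->rdmult, x->rddiv, rate, dist) to RDCOST(x->rdmult, rate, dist); the distortion scaling previously passed as x->rddiv is now folded into the macro. Conceptually every such call still evaluates the Lagrangian J = D + lambda * R. The fragment below only illustrates that trade-off; it is not the literal macro, whose exact fixed-point shifts live in av1/encoder/rd.h:

/* Conceptual sketch only: what an RDCOST(rdmult, rate, dist) comparison is
 * trading off. rate is in the encoder's fixed-point bit units and rdmult
 * carries the Lagrange multiplier; the real macro applies matching
 * fixed-point shifts to both terms. */
static INLINE int64_t rd_tradeoff_sketch(int rdmult, int rate, int64_t dist) {
  const int64_t rate_term = (int64_t)rate * rdmult; /* lambda * R, scaled */
  return dist + (rate_term >> 9);                   /* illustrative shift only */
}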
int_mv *const single_newmv, #if CONFIG_EXT_INTER - int rate2_bmc_nocoeff, MB_MODE_INFO *best_bmc_mbmi, -#if CONFIG_MOTION_VAR - int rate_mv_bmc, -#endif // CONFIG_MOTION_VAR + int rate2_bmc_nocoeff, MB_MODE_INFO *best_bmc_mbmi, int rate_mv_bmc, #endif // CONFIG_EXT_INTER #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION int rs, int *skip_txfm_sb, int64_t *skip_sse_sb, BUFFER_SET *orig_dst) { @@ -7108,7 +8275,13 @@ static int64_t motion_mode_rd( #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION #if CONFIG_WARPED_MOTION +#if WARPED_MOTION_SORT_SAMPLES + int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE]; + int pts_mv0[SAMPLES_ARRAY_SIZE]; + int total_samples; +#else int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; +#endif // WARPED_MOTION_SORT_SAMPLES #endif // CONFIG_WARPED_MOTION #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION @@ -7118,18 +8291,39 @@ static int64_t motion_mode_rd( if (cm->interp_filter == SWITCHABLE) rd_stats->rate += rs; #if CONFIG_WARPED_MOTION aom_clear_system_state(); +#if WARPED_MOTION_SORT_SAMPLES + mbmi->num_proj_ref[0] = + findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0, pts_mv0); + total_samples = mbmi->num_proj_ref[0]; +#else mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); +#endif // WARPED_MOTION_SORT_SAMPLES #if CONFIG_EXT_INTER best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; #endif // CONFIG_EXT_INTER #endif // CONFIG_WARPED_MOTION #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION rate2_nocoeff = rd_stats->rate; +#if CONFIG_NCOBMC_ADAPT_WEIGHT + // We cannot estimate the rd cost for the motion mode NCOBMC_ADAPT_WEIGHT + // right now since it requires mvs from all neighboring blocks. We will + // check if this mode is beneficial after all the mv's in the current + // superblock are selected. 
+ last_motion_mode_allowed = motion_mode_allowed_wrapper(1, +#if CONFIG_GLOBAL_MOTION + 0, xd->global_motion, +#endif // CONFIG_GLOBAL_MOTION + mi); +#else last_motion_mode_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION +#if CONFIG_GLOBAL_MOTION 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION +#endif // CONFIG_GLOBAL_MOTION +#if CONFIG_WARPED_MOTION + xd, +#endif mi); +#endif // CONFIG_NCOBMC_ADAPT_WEIGHT base_mbmi = *mbmi; #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION @@ -7155,7 +8349,11 @@ static int64_t motion_mode_rd( *mbmi = *best_bmc_mbmi; mbmi->motion_mode = OBMC_CAUSAL; #endif // CONFIG_EXT_INTER - if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) { + if (!is_comp_pred && +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + !is_inter_singleref_comp_mode(this_mode) && +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + have_newmv_in_inter_mode(this_mode)) { int tmp_rate_mv = 0; single_motion_search(cpi, x, bsize, mi_row, mi_col, @@ -7195,6 +8393,9 @@ static int64_t motion_mode_rd( #if CONFIG_WARPED_MOTION if (mbmi->motion_mode == WARPED_CAUSAL) { +#if WARPED_MOTION_SORT_SAMPLES + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; +#endif // WARPED_MOTION_SORT_SAMPLES #if CONFIG_EXT_INTER *mbmi = *best_bmc_mbmi; mbmi->motion_mode = WARPED_CAUSAL; @@ -7210,6 +8411,19 @@ static int64_t motion_mode_rd( : cm->interp_filter; #endif // CONFIG_DUAL_FILTER +#if WARPED_MOTION_SORT_SAMPLES + memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); + memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); + // Rank the samples by motion vector difference + if (mbmi->num_proj_ref[0] > 1) { + mbmi->num_proj_ref[0] = sortSamples(pts_mv0, &mbmi->mv[0].as_mv, pts, + pts_inref, mbmi->num_proj_ref[0]); +#if CONFIG_EXT_INTER + best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; +#endif // CONFIG_EXT_INTER + } +#endif // WARPED_MOTION_SORT_SAMPLES + if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, &mbmi->wm_params[0], mi_row, mi_col)) { @@ -7218,9 +8432,16 @@ static int64_t motion_mode_rd( int tmp_rate_mv = 0; const int_mv mv0 = mbmi->mv[0]; WarpedMotionParams wm_params0 = mbmi->wm_params[0]; +#if WARPED_MOTION_SORT_SAMPLES + int num_proj_ref0 = mbmi->num_proj_ref[0]; // Refine MV in a small range. + av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0, + pts_mv0, total_samples); +#else + // Refine MV in a small range. av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts, pts_inref); +#endif // WARPED_MOTION_SORT_SAMPLES // Keep the refined MV and WM parameters. if (mv0.as_int != mbmi->mv[0].as_int) { @@ -7241,6 +8462,9 @@ static int64_t motion_mode_rd( tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } #if CONFIG_EXT_INTER +#if WARPED_MOTION_SORT_SAMPLES + best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; +#endif // WARPED_MOTION_SORT_SAMPLES tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv; #else tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; @@ -7255,6 +8479,9 @@ static int64_t motion_mode_rd( // Restore the old MV and WM parameters. 
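With WARPED_MOTION_SORT_SAMPLES the projection samples gathered by findSamples() are first ranked by how far each sample's MV deviates from the block's own MV (the sortSamples() call above), so that find_projection() fits the warp model to the most consistent samples, and the refined MV is only kept when it lowers the RD cost (see the restore just below). The fragment here is a rough sketch of that ranking idea; it assumes pts_mv stores one (row, col) MV per sample in the same two-ints-per-sample layout as pts/pts_inref, and it is not the real sortSamples() implementation:

/* Illustrative ranking only: order samples by the L1 distance between their
 * MV and the block MV, closest first, keeping the three parallel arrays in
 * step. Assumes (row, col) packing; the real routine may differ. */
static void rank_samples_by_mvd_sketch(int *pts_mv, int *pts, int *pts_inref,
                                       const MV *block_mv, int n) {
  for (int i = 1; i < n; ++i) {
    for (int j = i; j > 0; --j) {
      const int d_prev = abs(pts_mv[2 * (j - 1)] - block_mv->row) +
                         abs(pts_mv[2 * (j - 1) + 1] - block_mv->col);
      const int d_cur = abs(pts_mv[2 * j] - block_mv->row) +
                        abs(pts_mv[2 * j + 1] - block_mv->col);
      if (d_cur >= d_prev) break;
      for (int k = 0; k < 2; ++k) { /* swap sample j and j - 1 in all arrays */
        int t = pts_mv[2 * j + k];
        pts_mv[2 * j + k] = pts_mv[2 * (j - 1) + k];
        pts_mv[2 * (j - 1) + k] = t;
        t = pts[2 * j + k];
        pts[2 * j + k] = pts[2 * (j - 1) + k];
        pts[2 * (j - 1) + k] = t;
        t = pts_inref[2 * j + k];
        pts_inref[2 * j + k] = pts_inref[2 * (j - 1) + k];
        pts_inref[2 * (j - 1) + k] = t;
      }
    }
  }
}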
mbmi->mv[0] = mv0; mbmi->wm_params[0] = wm_params0; +#if WARPED_MOTION_SORT_SAMPLES + mbmi->num_proj_ref[0] = num_proj_ref0; +#endif // WARPED_MOTION_SORT_SAMPLES } } @@ -7328,8 +8555,8 @@ static int64_t motion_mode_rd( av1_merge_rd_stats(rd_stats, rd_stats_y); - rdcosty = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist); - rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse)); + rdcosty = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, 0, rd_stats->sse)); /* clang-format off */ #if CONFIG_VAR_TX is_cost_valid_uv = @@ -7365,12 +8592,11 @@ static int64_t motion_mode_rd( mbmi->skip = 0; // here mbmi->skip temporarily plays a role as what this_skip2 does } else if (!xd->lossless[mbmi->segment_id] && - (RDCOST(x->rdmult, x->rddiv, + (RDCOST(x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + av1_cost_bit(av1_get_skip_prob(cm, xd), 0), rd_stats->dist) >= - RDCOST(x->rdmult, x->rddiv, - av1_cost_bit(av1_get_skip_prob(cm, xd), 1), + RDCOST(x->rdmult, av1_cost_bit(av1_get_skip_prob(cm, xd), 1), rd_stats->sse))) { rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); @@ -7427,7 +8653,7 @@ static int64_t motion_mode_rd( #endif // CONFIG_GLOBAL_MOTION #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - tmp_rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist); + tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); if (mbmi->motion_mode == SIMPLE_TRANSLATION || (tmp_rd < best_rd)) { best_mbmi = *mbmi; best_rd = tmp_rd; @@ -7466,11 +8692,17 @@ static int64_t motion_mode_rd( return 0; } -static int64_t handle_inter_mode( - const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, - int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], int mi_row, - int mi_col, HandleInterModeArgs *args, const int64_t ref_best_rd) { +static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, RD_STATS *rd_stats, + RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, + int *disable_skip, + int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + int_mv (*mode_comp_mv)[TOTAL_REFS_PER_FRAME], +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + int mi_row, int mi_col, + HandleInterModeArgs *args, + const int64_t ref_best_rd) { const AV1_COMMON *cm = &cpi->common; (void)cm; MACROBLOCKD *xd = &x->e_mbd; @@ -7479,7 +8711,14 @@ static int64_t handle_inter_mode( MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const int is_comp_pred = has_second_ref(mbmi); const int this_mode = mbmi->mode; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + const int is_singleref_comp_mode = is_inter_singleref_comp_mode(this_mode); +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF int_mv *frame_mv = mode_mv[this_mode]; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // The comp mv for the compound mode in single ref + int_mv *frame_comp_mv = mode_comp_mv[this_mode]; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF int i; int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 
0 : mbmi->ref_frame[1]) }; @@ -7487,7 +8726,7 @@ static int64_t handle_inter_mode( int rate_mv = 0; #if CONFIG_EXT_INTER int pred_exists = 1; -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT +#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA const int bw = block_size_wide[bsize]; #endif // ONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT int_mv single_newmv[TOTAL_REFS_PER_FRAME]; @@ -7511,9 +8750,7 @@ static int64_t handle_inter_mode( #if CONFIG_EXT_INTER int rate2_bmc_nocoeff; MB_MODE_INFO best_bmc_mbmi; -#if CONFIG_MOTION_VAR int rate_mv_bmc; -#endif // CONFIG_MOTION_VAR #endif // CONFIG_EXT_INTER #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION int64_t rd = INT64_MAX; @@ -7523,6 +8760,11 @@ static int64_t handle_inter_mode( int skip_txfm_sb = 0; int64_t skip_sse_sb = INT64_MAX; int16_t mode_ctx; +#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_MOTION_VAR + // dummy fillers + mbmi->ncobmc_mode[0] = NO_OVERLAP; + mbmi->ncobmc_mode[1] = NO_OVERLAP; +#endif #if CONFIG_EXT_INTER #if CONFIG_INTERINTRA @@ -7546,7 +8788,11 @@ static int64_t handle_inter_mode( #endif // CONFIG_EXT_INTER #if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SINGLEREF + if (is_comp_pred || is_singleref_comp_mode) +#else // !CONFIG_COMPOUND_SINGLEREF if (is_comp_pred) +#endif // CONFIG_COMPOUND_SINGLEREF mode_ctx = mbmi_ext->compound_mode_context[refs[0]]; else #endif // CONFIG_EXT_INTER @@ -7572,12 +8818,22 @@ static int64_t handle_inter_mode( if (frame_mv[refs[0]].as_int == INVALID_MV || frame_mv[refs[1]].as_int == INVALID_MV) return INT64_MAX; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + } else if (is_singleref_comp_mode) { + if (frame_mv[refs[0]].as_int == INVALID_MV || + frame_comp_mv[refs[0]].as_int == INVALID_MV) + return INT64_MAX; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF } mbmi->motion_mode = SIMPLE_TRANSLATION; if (have_newmv_in_inter_mode(this_mode)) { - const int64_t ret_val = handle_newmv(cpi, x, bsize, mode_mv, mi_row, mi_col, - &rate_mv, single_newmv, args); + const int64_t ret_val = + handle_newmv(cpi, x, bsize, mode_mv, +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + mode_comp_mv, +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + mi_row, mi_col, &rate_mv, single_newmv, args); if (ret_val != 0) return ret_val; else @@ -7591,6 +8847,16 @@ static int64_t handle_inter_mode( mbmi->mv[i].as_int = cur_mv[i].as_int; } +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!is_comp_pred && is_singleref_comp_mode) { + cur_mv[1] = frame_comp_mv[refs[0]]; + // Clip "next_nearest" so that it does not extend to far out of image + if (this_mode != NEWMV) clamp_mv2(&cur_mv[1].as_mv, xd); + if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX; + mbmi->mv[1].as_int = cur_mv[1].as_int; + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + #if CONFIG_EXT_INTER if (this_mode == NEAREST_NEARESTMV) #else @@ -7614,7 +8880,13 @@ static int64_t handle_inter_mode( #if CONFIG_EXT_INTER if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) { - if (this_mode == NEAREST_NEWMV) { +#if CONFIG_COMPOUND_SINGLEREF + if (this_mode == NEAREST_NEWMV || // this_mode == SR_NEAREST_NEWMV || + this_mode == SR_NEAREST_NEARMV) +#else // !CONFIG_COMPOUND_SINGLEREF + if (this_mode == NEAREST_NEWMV) +#endif // CONFIG_COMPOUND_SINGLEREF + { cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv); @@ -7635,7 +8907,11 @@ static int64_t handle_inter_mode( if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { int ref_mv_idx = 
mbmi->ref_mv_idx + 1; - if (this_mode == NEAR_NEWMV || this_mode == NEAR_NEARMV) { + if (this_mode == NEAR_NEWMV || +#if CONFIG_COMPOUND_SINGLEREF + this_mode == SR_NEAR_NEWMV || +#endif // CONFIG_COMPOUND_SINGLEREF + this_mode == NEAR_NEARMV) { cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv); @@ -7644,8 +8920,17 @@ static int64_t handle_inter_mode( mbmi->mv[0].as_int = cur_mv[0].as_int; } - if (this_mode == NEW_NEARMV || this_mode == NEAR_NEARMV) { - cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; + if (this_mode == NEW_NEARMV || +#if CONFIG_COMPOUND_SINGLEREF + this_mode == SR_NEAREST_NEARMV || +#endif // CONFIG_COMPOUND_SINGLEREF + this_mode == NEAR_NEARMV) { +#if CONFIG_COMPOUND_SINGLEREF + if (this_mode == SR_NEAREST_NEARMV) + cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; + else +#endif // CONFIG_COMPOUND_SINGLEREF + cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv); clamp_mv2(&cur_mv[1].as_mv, xd); @@ -7653,7 +8938,7 @@ static int64_t handle_inter_mode( mbmi->mv[1].as_int = cur_mv[1].as_int; } } -#else +#else // !CONFIG_EXT_INTER if (this_mode == NEARMV && is_comp_pred) { uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { @@ -7706,7 +8991,7 @@ static int64_t handle_inter_mode( rd_stats->rate += cost_mv_ref(cpi, this_mode, mode_ctx); } - if (RDCOST(x->rdmult, x->rddiv, rd_stats->rate, 0) > ref_best_rd && + if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && #if CONFIG_EXT_INTER mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV #else @@ -7725,13 +9010,16 @@ static int64_t handle_inter_mode( best_bmc_mbmi = *mbmi; rate2_bmc_nocoeff = rd_stats->rate; if (cm->interp_filter == SWITCHABLE) rate2_bmc_nocoeff += rs; -#if CONFIG_MOTION_VAR rate_mv_bmc = rate_mv; -#endif // CONFIG_MOTION_VAR #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION #if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT - if (is_comp_pred) { +#if CONFIG_COMPOUND_SINGLEREF + if (is_comp_pred || is_singleref_comp_mode) +#else + if (is_comp_pred) +#endif // CONFIG_COMPOUND_SINGLEREF + { int rate_sum, rs2; int64_t dist_sum; int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX; @@ -7741,8 +9029,8 @@ static int64_t handle_inter_mode( int tmp_skip_txfm_sb; int64_t tmp_skip_sse_sb; int compound_type_cost[COMPOUND_TYPES]; - uint8_t pred0[2 * MAX_SB_SQUARE]; - uint8_t pred1[2 * MAX_SB_SQUARE]; + DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]); uint8_t *preds0[1] = { pred0 }; uint8_t *preds1[1] = { pred1 }; int strides[1] = { bw }; @@ -7761,6 +9049,17 @@ static int64_t handle_inter_mode( best_compound_data.seg_mask = tmp_mask_buf; #endif // CONFIG_COMPOUND_SEGMENT +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // TODO(zoeliu): To further check whether the following setups are needed. + // Single ref compound mode: Prepare the 2nd ref frame predictor the same as + // the 1st one. 
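The SR_* cases handled above follow the same naming convention as the two-reference compound modes: the first half of the name says how the first MV is obtained and the second half how the second MV is obtained, except that both predictions come from the same reference frame (NEWMV halves are searched, the other halves come from the ref MV stack). A hypothetical helper spelling out that decomposition; it is not in the patch and simply mirrors how compound_ref0_mode()/compound_ref1_mode() are used in these hunks:

/* Hypothetical, for illustration only: per-MV component of each single-ref
 * compound mode. Both components predict from mbmi->ref_frame[0]. */
static void sr_mode_components_sketch(PREDICTION_MODE m, PREDICTION_MODE *first,
                                      PREDICTION_MODE *second) {
  switch (m) {
    case SR_NEAREST_NEARMV: *first = NEARESTMV; *second = NEARMV; break;
    case SR_NEAR_NEWMV:     *first = NEARMV;    *second = NEWMV;  break;
    case SR_ZERO_NEWMV:     *first = ZEROMV;    *second = NEWMV;  break;
    case SR_NEW_NEWMV:      *first = NEWMV;     *second = NEWMV;  break;
    default: assert(0 && "not a single-ref compound mode");
  }
}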
+ if (!is_comp_pred && is_singleref_comp_mode) { + xd->block_refs[1] = xd->block_refs[0]; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[1] = xd->plane[i].pre[0]; + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (masked_compound_used) { av1_cost_tokens(compound_type_cost, cm->fc->compound_type_prob[bsize], av1_compound_type_tree); @@ -7773,7 +9072,7 @@ static int64_t handle_inter_mode( for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) { if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break; - if (!is_interinter_compound_used(cur_type, bsize)) break; + if (!is_interinter_compound_used(cur_type, bsize)) continue; tmp_rate_mv = rate_mv; best_rd_cur = INT64_MAX; mbmi->interinter_compound_type = cur_type; @@ -7792,8 +9091,7 @@ static int64_t handle_inter_mode( &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) - best_rd_cur = - RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv + rate_sum, dist_sum); + best_rd_cur = RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum); best_rd_compound = best_rd_cur; break; #if CONFIG_WEDGE @@ -7923,8 +9221,7 @@ static int64_t handle_inter_mode( av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb); - rd = - RDCOST(x->rdmult, x->rddiv, tmp_rate_mv + rate_sum + rmode, dist_sum); + rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); if (rd < best_interintra_rd) { best_interintra_rd = rd; best_interintra_mode = mbmi->interintra_mode; @@ -7939,7 +9236,7 @@ static int64_t handle_inter_mode( rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, x->rddiv, rate_mv + rmode + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum, dist_sum); best_interintra_rd = rd; if (ref_best_rd < INT64_MAX && best_interintra_rd > 2 * ref_best_rd) { @@ -7953,8 +9250,7 @@ static int64_t handle_inter_mode( int_mv tmp_mv; int rwedge = av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 0); if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, x->rddiv, rmode + rate_mv + rwedge + rate_sum, - dist_sum); + rd = RDCOST(x->rdmult, rmode + rate_mv + rwedge + rate_sum, dist_sum); best_interintra_rd_nowedge = best_interintra_rd; // Disable wedge search if source variance is small @@ -7968,7 +9264,7 @@ static int64_t handle_inter_mode( pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); best_interintra_rd_wedge += - RDCOST(x->rdmult, x->rddiv, rmode + rate_mv + rwedge, 0); + RDCOST(x->rdmult, rmode + rate_mv + rwedge, 0); // Refine motion vector. 
if (have_newmv_in_inter_mode(this_mode)) { // get negative of mask @@ -7977,14 +9273,14 @@ static int64_t handle_inter_mode( tmp_mv.as_int = x->mbmi_ext->ref_mvs[refs[0]][0].as_int; compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row, mi_col, intrapred, mask, bw, - &tmp_rate_mv, 0, 0); + &tmp_rate_mv, 0); mbmi->mv[0].as_int = tmp_mv.as_int; av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb); - rd = RDCOST(x->rdmult, x->rddiv, - rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, + dist_sum); if (rd >= best_interintra_rd_wedge) { tmp_mv.as_int = cur_mv[0].as_int; tmp_rate_mv = rate_mv; @@ -8000,8 +9296,8 @@ static int64_t handle_inter_mode( estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, x->rddiv, - rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, + dist_sum); best_interintra_rd_wedge = rd; if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { mbmi->use_wedge_interintra = 1; @@ -8042,7 +9338,7 @@ static int64_t handle_inter_mode( av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist, &skip_txfm_sb, &skip_sse_sb); - rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); + rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist); } #endif // CONFIG_EXT_INTER @@ -8097,10 +9393,7 @@ static int64_t handle_inter_mode( #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION single_newmv, #if CONFIG_EXT_INTER - rate2_bmc_nocoeff, &best_bmc_mbmi, -#if CONFIG_MOTION_VAR - rate_mv_bmc, -#endif // CONFIG_MOTION_VAR + rate2_bmc_nocoeff, &best_bmc_mbmi, rate_mv_bmc, #endif // CONFIG_EXT_INTER #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION rs, &skip_txfm_sb, &skip_sse_sb, &orig_dst); @@ -8118,11 +9411,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; const TileInfo *tile = &xd->tile; -#if CONFIG_EC_ADAPT FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; -#else - FRAME_CONTEXT *const ec_ctx = cm->fc; -#endif // CONFIG_EC_ADAPT MODE_INFO *const mi = xd->mi[0]; const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE); const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE); @@ -8222,7 +9511,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, #endif mbmi->use_intrabc = 1; mbmi->mode = DC_PRED; - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; mbmi->mv[0].as_mv = dv; #if CONFIG_DUAL_FILTER for (int idx = 0; idx < 4; ++idx) mbmi->interp_filter[idx] = BILINEAR; @@ -8233,12 +9522,12 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, x->skip = 0; av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); + assert(x->mvcost == x->mv_cost_stack[0]); + // TODO(aconverse@google.com): The full motion field defining discount + // in MV_COST_WEIGHT is too large. Explore other values. 
int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, x->nmvjointcost, - x->mvcost, MV_COST_WEIGHT); - const PREDICTION_MODE A = av1_above_block_mode(mi, xd->above_mi, 0); - const PREDICTION_MODE L = av1_left_block_mode(mi, xd->left_mi, 0); - const int rate_mode = cpi->y_mode_costs[A][L][DC_PRED] + - av1_cost_bit(ec_ctx->intrabc_prob, 1); + x->mvcost, MV_COST_WEIGHT_SUB); + const int rate_mode = av1_cost_bit(ec_ctx->intrabc_prob, 1); RD_STATS rd_stats, rd_stats_uv; av1_subtract_plane(x, bsize, 0); @@ -8267,8 +9556,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, rdc_noskip.rate = rate_mode + rate_mv + rd_stats.rate + av1_cost_bit(skip_prob, 0); rdc_noskip.dist = rd_stats.dist; - rdc_noskip.rdcost = - RDCOST(x->rdmult, x->rddiv, rdc_noskip.rate, rdc_noskip.dist); + rdc_noskip.rdcost = RDCOST(x->rdmult, rdc_noskip.rate, rdc_noskip.dist); if (rdc_noskip.rdcost < best_rd) { best_rd = rdc_noskip.rdcost; best_mbmi = *mbmi; @@ -8282,7 +9570,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, av1_init_rd_stats(&rdc_skip); rdc_skip.rate = rate_mode + rate_mv + av1_cost_bit(skip_prob, 1); rdc_skip.dist = rd_stats.sse; - rdc_skip.rdcost = RDCOST(x->rdmult, x->rddiv, rdc_skip.rate, rdc_skip.dist); + rdc_skip.rdcost = RDCOST(x->rdmult, rdc_skip.rate, rdc_skip.dist); if (rdc_skip.rdcost < best_rd) { best_rd = rdc_skip.rdcost; best_mbmi = *mbmi; @@ -8302,6 +9590,7 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; struct macroblockd_plane *const pd = xd->plane; int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; int y_skip = 0, uv_skip = 0; @@ -8310,11 +9599,11 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, const int unify_bsize = CONFIG_CB4X4; ctx->skip = 0; - xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME; - xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; #if CONFIG_INTRABC - xd->mi[0]->mbmi.use_intrabc = 0; - xd->mi[0]->mbmi.mv[0].as_int = 0; + mbmi->use_intrabc = 0; + mbmi->mv[0].as_int = 0; #endif // CONFIG_INTRABC const int64_t intra_yrd = @@ -8325,9 +9614,29 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, &dist_y, &y_skip, best_rd); if (intra_yrd < best_rd) { - max_uv_tx_size = uv_txsize_lookup[bsize][xd->mi[0]->mbmi.tx_size] - [pd[1].subsampling_x][pd[1].subsampling_y]; - init_sbuv_mode(&xd->mi[0]->mbmi); +#if CONFIG_CFL + // Perform one extra txfm_rd_in_plane() call, this time with the best value + // so we can store reconstructed luma values + RD_STATS this_rd_stats; + +#if CONFIG_CB4X4 + // Don't store the luma value if no chroma is associated. + // Don't worry, we will store this reconstructed luma in the following + // encode dry-run the chroma plane will never know. 
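Earlier in this hunk the intrabc candidate is scored twice: once without skip (coefficient rate plus reconstruction distortion) and once with skip (no coefficient rate, plain SSE as distortion), and the cheaper alternative is allowed to update best_rd. A compact sketch of that comparison with every component passed in explicitly; the helper itself is illustrative, not part of the patch:

/* Illustrative only: the two RD alternatives evaluated for an intrabc
 * candidate. skip0_cost/skip1_cost stand for av1_cost_bit(skip_prob, 0/1). */
static int64_t intrabc_skip_choice_sketch(int rdmult, int rate_mode,
                                          int rate_mv, int rate_coeffs,
                                          int skip0_cost, int skip1_cost,
                                          int64_t dist, int64_t sse) {
  const int64_t rd_noskip =
      RDCOST(rdmult, rate_mode + rate_mv + rate_coeffs + skip0_cost, dist);
  const int64_t rd_skip =
      RDCOST(rdmult, rate_mode + rate_mv + skip1_cost, sse);
  return AOMMIN(rd_noskip, rd_skip);
}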
+ x->cfl_store_y = !x->skip_chroma_rd; +#else + x->cfl_store_y = 1; +#endif + + txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y, + mbmi->sb_type, mbmi->tx_size, + cpi->sf.use_fast_coef_costing); + + x->cfl_store_y = 0; +#endif + max_uv_tx_size = uv_txsize_lookup[bsize][mbmi->tx_size][pd[1].subsampling_x] + [pd[1].subsampling_y]; + init_sbuv_mode(mbmi); #if CONFIG_CB4X4 if (!x->skip_chroma_rd) rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv, @@ -8346,8 +9655,8 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, rate_y + rate_uv + av1_cost_bit(av1_get_skip_prob(cm, xd), 0); rd_cost->dist = dist_y + dist_uv; } - rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 rd_cost->dist_y = dist_y; #endif } else { @@ -8360,7 +9669,7 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, if (rd_pick_intrabc_mode_sb(cpi, x, rd_cost, bsize, best_rd) < best_rd) { ctx->skip = x->skip; // FIXME where is the proper place to set this?! assert(rd_cost->rate != INT_MAX); - rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); + rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); } #endif if (rd_cost->rate == INT_MAX) return; @@ -8494,7 +9803,8 @@ static void pick_filter_intra_interframe( const AV1_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_uv_intra, int *rate_uv_tokenonly, int64_t *dist_uv, int *skip_uv, - PREDICTION_MODE *mode_uv, FILTER_INTRA_MODE_INFO *filter_intra_mode_info_uv, + UV_PREDICTION_MODE *mode_uv, + FILTER_INTRA_MODE_INFO *filter_intra_mode_info_uv, #if CONFIG_EXT_INTRA int8_t *uv_angle_delta, #endif // CONFIG_EXT_INTRA @@ -8531,7 +9841,7 @@ static void pick_filter_intra_interframe( // TODO(huisu): use skip_mask for further speedup. 
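The CONFIG_CFL hunk above runs one extra luma transform-RD pass purely so the reconstructed luma pixels are cached for chroma-from-luma prediction. Below is a minimal sketch of that flag-guarded pattern; the struct and function names are invented stand-ins for the encoder state (x->cfl_store_y, txfm_rd_in_plane), and the CfL formula in the comment is only a rough description.

#include <stdbool.h>

struct cfl_sketch {
  bool store_y;        /* stands in for x->cfl_store_y      */
  bool skip_chroma_rd; /* stands in for x->skip_chroma_rd   */
};

/* Stand-in for txfm_rd_in_plane(..., AOM_PLANE_Y, ...): when store_y is set,
 * the reconstructed luma would be kept so chroma can later be predicted from
 * it (roughly pred_uv = dc_pred + alpha * subsampled luma AC). */
static void luma_txfm_rd_sketch(struct cfl_sketch *ctx) { (void)ctx; }

static void store_luma_for_cfl_sketch(struct cfl_sketch *ctx) {
  ctx->store_y = !ctx->skip_chroma_rd; /* no cache if no chroma is coded */
  luma_txfm_rd_sketch(ctx);
  ctx->store_y = false;                /* never leave the flag set */
}
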
(void)skip_mask; mbmi->mode = DC_PRED; - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; if (!rd_pick_filter_intra_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y, @@ -8600,7 +9910,8 @@ static void pick_filter_intra_interframe( rate2 += write_uniform_cost( FILTER_INTRA_MODES, mbmi->filter_intra_mode_info.filter_intra_mode[0]); #if CONFIG_EXT_INTRA - if (av1_is_directional_mode(mbmi->uv_mode, bsize)) { + if (av1_is_directional_mode(mbmi->uv_mode, bsize) && + av1_use_angle_delta(bsize)) { rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, MAX_ANGLE_DELTA + mbmi->angle_delta[1]); } @@ -8628,7 +9939,7 @@ static void pick_filter_intra_interframe( } else { rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); } - this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + this_rd = RDCOST(x->rdmult, rate2, distortion2); if (this_rd < *best_intra_rd) { *best_intra_rd = this_rd; @@ -8693,6 +10004,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, unsigned char segment_id = mbmi->segment_id; int comp_pred, i, k; int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + int_mv frame_comp_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]; int_mv single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } }; #if CONFIG_EXT_INTER @@ -8722,7 +10036,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, int best_mode_skippable = 0; int midx, best_mode_index = -1; unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME]; +#if CONFIG_EXT_COMP_REFS + unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME][TOTAL_REFS_PER_FRAME]; +#else unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME]; +#endif // CONFIG_EXT_COMP_REFS aom_prob comp_mode_p; int64_t best_intra_rd = INT64_MAX; unsigned int best_pred_sse = UINT_MAX; @@ -8730,7 +10048,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, int rate_uv_intra[TX_SIZES_ALL], rate_uv_tokenonly[TX_SIZES_ALL]; int64_t dist_uvs[TX_SIZES_ALL]; int skip_uvs[TX_SIZES_ALL]; - PREDICTION_MODE mode_uv[TX_SIZES_ALL]; + UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL]; #if CONFIG_PALETTE PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL]; #endif // CONFIG_PALETTE @@ -8747,7 +10065,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]]; int best_skip2 = 0; - uint8_t ref_frame_skip_mask[2] = { 0 }; + uint16_t ref_frame_skip_mask[2] = { 0 }; uint32_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 }; #if CONFIG_EXT_INTER && CONFIG_INTERINTRA MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME; @@ -8850,6 +10168,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, *returnrate_nocoef = INT_MAX; #endif // CONFIG_SUPERTX +#if CONFIG_SPEED_REFS + memset(x->mbmi_ext->ref_mvs, 0, sizeof(x->mbmi_ext->ref_mvs)); +#endif // CONFIG_SPEED_REFS + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; x->mbmi_ext->mode_context[ref_frame] = 0; @@ -8873,6 +10195,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif // CONFIG_GLOBAL_MOTION #if CONFIG_EXT_INTER frame_mv[NEW_NEWMV][ref_frame].as_int = INVALID_MV; +#if CONFIG_COMPOUND_SINGLEREF + 
frame_mv[SR_NEW_NEWMV][ref_frame].as_int = INVALID_MV; + frame_comp_mv[SR_NEW_NEWMV][ref_frame].as_int = INVALID_MV; +#endif // CONFIG_COMPOUND_SINGLEREF #if CONFIG_GLOBAL_MOTION frame_mv[ZERO_ZEROMV][ref_frame].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame], @@ -8934,6 +10260,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, // Skip checking missing references in both single and compound reference // modes. Note that a mode will be skipped iff both reference frames // are masked out. +#if CONFIG_EXT_COMP_REFS + ref_frame_skip_mask[0] |= (1 << ref_frame); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; +#else // !CONFIG_EXT_COMP_REFS #if CONFIG_EXT_REFS if (ref_frame == BWDREF_FRAME || ref_frame == ALTREF_FRAME) { ref_frame_skip_mask[0] |= (1 << ref_frame); @@ -8945,6 +10275,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #if CONFIG_EXT_REFS } #endif // CONFIG_EXT_REFS +#endif // CONFIG_EXT_COMP_REFS } else { for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { // Skip fixed mv modes for poor references @@ -9000,6 +10331,12 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARESTMV); if (frame_mv[NEAR_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int) mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARMV); +#if CONFIG_COMPOUND_SINGLEREF + if (frame_mv[SR_NEAREST_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int || + frame_comp_mv[SR_NEAREST_NEARMV][ALTREF_FRAME].as_int != + zeromv.as_int) + mode_skip_mask[ALTREF_FRAME] |= (1 << SR_NEAREST_NEARMV); +#endif // CONFIG_COMPOUND_SINGLEREF #endif // CONFIG_EXT_INTER } } @@ -9077,7 +10414,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, int compmode_cost = 0; int rate2 = 0, rate_y = 0, rate_uv = 0; int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 int64_t distortion2_y = 0; int64_t total_sse_y = INT64_MAX; #endif @@ -9106,6 +10443,13 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int; frame_mv[this_mode][second_ref_frame].as_int = frame_mv[compound_ref1_mode(this_mode)][second_ref_frame].as_int; +#if CONFIG_COMPOUND_SINGLEREF + } else if (is_inter_singleref_comp_mode(this_mode)) { + frame_mv[this_mode][ref_frame].as_int = + frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int; + frame_comp_mv[this_mode][ref_frame].as_int = + frame_mv[compound_ref1_mode(this_mode)][ref_frame].as_int; +#endif // CONFIG_COMPOUND_SINGLEREF } #endif // CONFIG_EXT_INTER @@ -9154,6 +10498,34 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, (ref_frame_skip_mask[1] & (1 << AOMMAX(0, second_ref_frame)))) continue; +#if CONFIG_EXT_COMP_REFS +// TODO(zoeliu): Following toggle between #if 0/1 and the bug will manifest +// itself. 
+#if 0 + if (!(cpi->ref_frame_flags & flag_list[ref_frame]) || + (second_ref_frame > INTRA_FRAME && + (!(cpi->ref_frame_flags & flag_list[second_ref_frame])))) + printf("Frame=%d, bsize=%d, (mi_row,mi_col)=(%d,%d), ref_frame=%d, " + "second_ref_frame=%d\n", cm->current_video_frame, bsize, mi_row, + mi_col, ref_frame, second_ref_frame); + + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + if (second_ref_frame > INTRA_FRAME && + (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))) + continue; +#endif // 0 + +#if !USE_UNI_COMP_REFS + // NOTE(zoeliu): Temporarily disable uni-directional comp refs + if (second_ref_frame > INTRA_FRAME) { + if (!((ref_frame < BWDREF_FRAME) ^ (second_ref_frame < BWDREF_FRAME))) + continue; + } + assert(second_ref_frame <= INTRA_FRAME || + ((ref_frame < BWDREF_FRAME) ^ (second_ref_frame < BWDREF_FRAME))); +#endif // !USE_UNI_COMP_REFS +#endif // CONFIG_EXT_COMP_REFS + if (mode_skip_mask[ref_frame] & (1 << this_mode)) continue; // Test best rd so far against threshold for trying this mode. @@ -9239,7 +10611,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, } mbmi->mode = this_mode; - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = ref_frame; mbmi->ref_frame[1] = second_ref_frame; #if CONFIG_PALETTE @@ -9267,6 +10639,15 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; } +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // Single ref compound mode + if (!comp_pred && is_inter_singleref_comp_mode(mbmi->mode)) { + xd->block_refs[1] = xd->block_refs[0]; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[1] = xd->plane[i].pre[0]; + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + #if CONFIG_EXT_INTER && CONFIG_INTERINTRA mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); #endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA @@ -9277,7 +10658,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, struct macroblockd_plane *const pd = &xd->plane[1]; #if CONFIG_EXT_INTRA is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize); - if (is_directional_mode) { + if (is_directional_mode && av1_use_angle_delta(bsize)) { int rate_dummy; int64_t model_rd = INT64_MAX; if (!angle_stats_ready) { @@ -9390,10 +10771,13 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (av1_is_intra_filter_switchable(p_angle)) rate2 += cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter]; #endif // CONFIG_INTRA_INTERP - rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[0]); + if (av1_use_angle_delta(bsize)) { + rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, + MAX_ANGLE_DELTA + mbmi->angle_delta[0]); + } } - if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) { + if (av1_is_directional_mode(mbmi->uv_mode, bsize) && + av1_use_angle_delta(bsize)) { rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, MAX_ANGLE_DELTA + mbmi->angle_delta[1]); } @@ -9409,7 +10793,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mbmi->filter_intra_mode_info.filter_intra_mode[0]); } } - if (mbmi->uv_mode == DC_PRED) { + if (mbmi->uv_mode == UV_DC_PRED) { rate2 += av1_cost_bit(cpi->common.fc->filter_intra_probs[1], mbmi->filter_intra_mode_info.use_filter_intra_mode[1]); @@ -9422,7 +10806,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (mbmi->mode 
!= DC_PRED && mbmi->mode != TM_PRED) rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 if (bsize < BLOCK_8X8) distortion2_y = distortion_y; #endif } else { @@ -9481,6 +10865,27 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; } } +#if CONFIG_COMPOUND_SINGLEREF + } else if (is_inter_singleref_comp_mode(mbmi->mode)) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { + // TODO(zoeliu): To further investigate which ref_mv_idx should be + // chosen for the mode of SR_NEAR_NEWMV. + int ref_mv_idx = 0; + // Special case: SR_NEAR_NEWMV mode use + // 1 + mbmi->ref_mv_idx (like NEARMV) instead of + // mbmi->ref_mv_idx (like NEWMV) + if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx = 1; + + if (compound_ref0_mode(mbmi->mode) == NEWMV || + compound_ref1_mode(mbmi->mode) == NEWMV) { + int_mv this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; + clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, + xd->n8_h << MI_SIZE_LOG2, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; + } + } +#endif // CONFIG_COMPOUND_SINGLEREF } else { #endif // CONFIG_EXT_INTER if (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1) { @@ -9500,6 +10905,19 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, { RD_STATS rd_stats, rd_stats_y, rd_stats_uv; av1_init_rd_stats(&rd_stats); +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + // While av1 master uses rd_stats_y.rate through out the codebase, + // which is set when handle_inter_moden is called, the daala-dist code + // in rd_pick_partition() for cb4x4 and sub8x8 blocks need to know + // .dist_y which comes from rd_stats_y.dist and rd_stats_y.sse. + // The problem is rd_stats_y.dist and rd_stats_y.sse are sometimes not + // initialized when rd_stats.skip = 1, + // then instead rd_stats.dist and rd_stats.sse have the + // combined luma and chroma dist and sse. + // This can be seen inside motion_mode_rd(), which is called by + // handle_inter_mode(). + if (bsize < BLOCK_8X8) av1_init_rd_stats(&rd_stats_y); +#endif rd_stats.rate = rate2; // Point to variables that are maintained between loop iterations @@ -9510,6 +10928,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif // CONFIG_EXT_INTER this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip, frame_mv, +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + frame_comp_mv, +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF mi_row, mi_col, &args, best_rd); rate2 = rd_stats.rate; @@ -9518,23 +10939,39 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, total_sse = rd_stats.sse; rate_y = rd_stats_y.rate; rate_uv = rd_stats_uv.rate; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 - if (bsize < BLOCK_8X8) distortion2_y = rd_stats_y.dist; +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) { + if (rd_stats_y.rate != INT_MAX) { + assert(rd_stats_y.sse < INT64_MAX); + assert(rd_stats_y.dist < INT64_MAX); + } + total_sse_y = rd_stats_y.sse; + distortion2_y = rd_stats_y.dist; + } #endif } // TODO(jingning): This needs some refactoring to improve code quality // and reduce redundant steps. 
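The CONFIG_DIST_8X8 comment above boils down to an initialisation hazard: when a block ends up coded as skip, only the combined luma+chroma stats are written, so the luma-only stats must be reset before the call or dist_y is later read uninitialised. A stand-alone sketch of that reset (the struct and names are illustrative, not the real RD_STATS):

#include <stdint.h>

typedef struct {
  int rate;
  int64_t dist, sse;
  int skip;
} rd_stats_sketch;

/* Reset to the "nothing coded yet" state before any call that may take the
 * skip path and leave the luma-only fields untouched. */
static void init_rd_stats_sketch(rd_stats_sketch *s) {
  s->rate = 0;
  s->dist = 0;
  s->sse = 0;
  s->skip = 1;
}
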
#if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SINGLEREF + if ((have_nearmv_in_inter_mode(mbmi->mode) && + mbmi_ext->ref_mv_count[ref_frame_type] > 2) || + ((mbmi->mode == NEWMV || mbmi->mode == SR_NEW_NEWMV || + mbmi->mode == NEW_NEWMV) && + mbmi_ext->ref_mv_count[ref_frame_type] > 1)) +#else // !CONFIG_COMPOUND_SINGLEREF if ((have_nearmv_in_inter_mode(mbmi->mode) && mbmi_ext->ref_mv_count[ref_frame_type] > 2) || ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) && - mbmi_ext->ref_mv_count[ref_frame_type] > 1)) { -#else + mbmi_ext->ref_mv_count[ref_frame_type] > 1)) +#endif // CONFIG_COMPOUND_SINGLEREF +#else // !CONFIG_EXT_INTER if ((mbmi->mode == NEARMV && mbmi_ext->ref_mv_count[ref_frame_type] > 2) || - (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1)) { -#endif + (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1)) +#endif // CONFIG_EXT_INTER + { int_mv backup_mv = frame_mv[NEARMV][ref_frame]; MB_MODE_INFO backup_mbmi = *mbmi; int backup_skip = x->skip; @@ -9560,18 +10997,16 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, rate2 += (rate2 < INT_MAX ? cpi->drl_mode_cost0[drl_ctx][0] : 0); if (this_rd < INT64_MAX) { - if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < - RDCOST(x->rdmult, x->rddiv, 0, total_sse)) - tmp_ref_rd = - RDCOST(x->rdmult, x->rddiv, - rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 0), - distortion2); + if (RDCOST(x->rdmult, rate_y + rate_uv, distortion2) < + RDCOST(x->rdmult, 0, total_sse)) + tmp_ref_rd = RDCOST( + x->rdmult, rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 0), + distortion2); else - tmp_ref_rd = - RDCOST(x->rdmult, x->rddiv, - rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - - rate_y - rate_uv, - total_sse); + tmp_ref_rd = RDCOST( + x->rdmult, rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - + rate_y - rate_uv, + total_sse); } #if CONFIG_VAR_TX for (i = 0; i < MAX_MB_PLANE; ++i) @@ -9587,6 +11022,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, RD_STATS tmp_rd_stats, tmp_rd_stats_y, tmp_rd_stats_uv; av1_invalid_rd_stats(&tmp_rd_stats); + x->skip = 0; mbmi->ref_mv_idx = 1 + ref_idx; @@ -9627,6 +11063,34 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, xd->n8_h << MI_SIZE_LOG2, xd); mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; } +#if CONFIG_COMPOUND_SINGLEREF + } else if (is_inter_singleref_comp_mode(mbmi->mode)) { + int ref_mv_idx = mbmi->ref_mv_idx; + // Special case: SR_NEAR_NEWMV mode use + // 1 + mbmi->ref_mv_idx (like NEARMV) instead of + // mbmi->ref_mv_idx (like NEWMV) + if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx = 1 + mbmi->ref_mv_idx; + + // TODO(zoeliu): For the mode of SR_NEAREST_NEWMV, as it only runs + // the "if", not the "else if", + // mbmi_ext->ref_mvs[mbmi->ref_frame[0]] takes the + // value for "NEWMV", instead of "NEARESTMV". 
+ if (compound_ref0_mode(mbmi->mode) == NEWMV || + compound_ref1_mode(mbmi->mode) == NEWMV) { + int_mv this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; + clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, + xd->n8_h << MI_SIZE_LOG2, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; + } else if (compound_ref0_mode(mbmi->mode) == NEARESTMV || + compound_ref1_mode(mbmi->mode) == NEARESTMV) { + int_mv this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; + clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, + xd->n8_h << MI_SIZE_LOG2, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; + } +#endif // CONFIG_COMPOUND_SINGLEREF } else { #endif // CONFIG_EXT_INTER for (ref = 0; ref < 1 + comp_pred; ++ref) { @@ -9657,16 +11121,28 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, frame_mv[NEARMV][ref_frame] = cur_mv; av1_init_rd_stats(&tmp_rd_stats); - +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + // With the same reason as 'rd_stats_y' passed to above + // handle_inter_mode(), tmp_rd_stats_y.dist and + // tmp_rd_stats_y.sse are sometimes not initialized, esp. when + // tmp_rd_stats.skip = 1 and tmp_rd_stats.dist and .sse + // represent combined luma and chroma .dist and .sse, + // we should initialized tmp_rd_stats_y. + if (bsize < BLOCK_8X8) av1_init_rd_stats(&tmp_rd_stats_y); +#endif // Point to variables that are not maintained between iterations args.single_newmv = dummy_single_newmv; #if CONFIG_EXT_INTER args.single_newmv_rate = dummy_single_newmv_rate; args.modelled_rd = NULL; #endif // CONFIG_EXT_INTER - tmp_alt_rd = handle_inter_mode( - cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, &tmp_rd_stats_uv, - &dummy_disable_skip, frame_mv, mi_row, mi_col, &args, best_rd); + tmp_alt_rd = handle_inter_mode(cpi, x, bsize, &tmp_rd_stats, + &tmp_rd_stats_y, &tmp_rd_stats_uv, + &dummy_disable_skip, frame_mv, +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + frame_comp_mv, +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + mi_row, mi_col, &args, best_rd); // Prevent pointers from escaping local scope args.single_newmv = NULL; #if CONFIG_EXT_INTER @@ -9696,25 +11172,22 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (tmp_alt_rd < INT64_MAX) { #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - tmp_alt_rd = RDCOST(x->rdmult, x->rddiv, tmp_rd_stats.rate, - tmp_rd_stats.dist); + tmp_alt_rd = + RDCOST(x->rdmult, tmp_rd_stats.rate, tmp_rd_stats.dist); #else - if (RDCOST(x->rdmult, x->rddiv, - tmp_rd_stats_y.rate + tmp_rd_stats_uv.rate, + if (RDCOST(x->rdmult, tmp_rd_stats_y.rate + tmp_rd_stats_uv.rate, tmp_rd_stats.dist) < - RDCOST(x->rdmult, x->rddiv, 0, tmp_rd_stats.sse)) - tmp_alt_rd = - RDCOST(x->rdmult, x->rddiv, - tmp_rd_stats.rate + - av1_cost_bit(av1_get_skip_prob(cm, xd), 0), - tmp_rd_stats.dist); + RDCOST(x->rdmult, 0, tmp_rd_stats.sse)) + tmp_alt_rd = RDCOST( + x->rdmult, tmp_rd_stats.rate + + av1_cost_bit(av1_get_skip_prob(cm, xd), 0), + tmp_rd_stats.dist); else - tmp_alt_rd = - RDCOST(x->rdmult, x->rddiv, - tmp_rd_stats.rate + - av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - - tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate, - tmp_rd_stats.sse); + tmp_alt_rd = RDCOST( + x->rdmult, tmp_rd_stats.rate + + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - + tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate, + tmp_rd_stats.sse); #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION } @@ -9730,8 +11203,12 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, tmp_ref_rd = tmp_alt_rd; 
backup_mbmi = *mbmi; backup_skip = x->skip; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 if (bsize < BLOCK_8X8) { + if (tmp_rd_stats_y.rate != INT_MAX) { + assert(tmp_rd_stats_y.sse < INT64_MAX); + assert(tmp_rd_stats_y.dist < INT64_MAX); + } total_sse_y = tmp_rd_stats_y.sse; distortion2_y = tmp_rd_stats_y.dist; } @@ -9774,19 +11251,33 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, // Estimate the reference frame signaling cost and add it // to the rolling cost variable. if (comp_pred) { +#if CONFIG_EXT_COMP_REFS + rate2 += ref_costs_comp[ref_frame][second_ref_frame]; +#else // !CONFIG_EXT_COMP_REFS rate2 += ref_costs_comp[ref_frame]; #if CONFIG_EXT_REFS rate2 += ref_costs_comp[second_ref_frame]; #endif // CONFIG_EXT_REFS +#endif // CONFIG_EXT_COMP_REFS } else { rate2 += ref_costs_single[ref_frame]; } +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // Add the cost to signal single/comp mode in single ref. + if (!comp_pred && cm->reference_mode != COMPOUND_REFERENCE) { + aom_prob singleref_comp_mode_p = av1_get_inter_mode_prob(cm, xd); + rate2 += av1_cost_bit(singleref_comp_mode_p, + is_inter_singleref_comp_mode(mbmi->mode)); + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - if (ref_frame == INTRA_FRAME) { + if (ref_frame == INTRA_FRAME) #else - if (!disable_skip) { + if (!disable_skip) #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + { if (skippable) { // Back out the coefficient coding costs rate2 -= (rate_y + rate_uv); @@ -9795,9 +11286,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, // Cost the skip mb case rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); } else if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) { - if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + rate_skip0, - distortion2) < - RDCOST(x->rdmult, x->rddiv, rate_skip1, total_sse)) { + if (RDCOST(x->rdmult, rate_y + rate_uv + rate_skip0, distortion2) < + RDCOST(x->rdmult, rate_skip1, total_sse)) { // Add in the cost of the no skip flag. rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); } else { @@ -9809,8 +11299,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, this_skip2 = 1; rate_y = 0; rate_uv = 0; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 - if (bsize < BLOCK_8X8) distortion2_y = total_sse_y; +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) { + assert(total_sse_y < INT64_MAX); + distortion2_y = total_sse_y; + } #endif } } else { @@ -9819,11 +11312,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, } // Calculate the final RD estimate for this mode. 
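Before the final RD estimate is formed below, the block above decides whether to signal skip: either pay for the coefficients plus a "no skip" flag, or drop them, take the SSE as the distortion and pay only for the "skip" flag. A self-contained sketch of that choice (the helper names and the lambda shift are placeholders, not the real RDCOST):

#include <stdint.h>

static int64_t rd_sketch(int rdmult, int rate, int64_t dist) {
  return dist + (((int64_t)rate * rdmult) >> 9); /* shift is illustrative */
}

/* Returns 1 when signalling skip is cheaper; the caller would then zero out
 * rate_y/rate_uv and use the SSE as the distortion, as done above. */
static int choose_skip_sketch(int rdmult, int rate_coeffs, int rate_skip0,
                              int rate_skip1, int64_t dist, int64_t sse) {
  const int64_t rd_code = rd_sketch(rdmult, rate_coeffs + rate_skip0, dist);
  const int64_t rd_skip = rd_sketch(rdmult, rate_skip1, sse);
  return rd_skip <= rd_code;
}
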
- this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + this_rd = RDCOST(x->rdmult, rate2, distortion2); #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION } else { this_skip2 = mbmi->skip; - this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + this_rd = RDCOST(x->rdmult, rate2, distortion2); if (this_skip2) { rate_y = 0; rate_uv = 0; @@ -9831,6 +11324,12 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION } +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if ((bsize < BLOCK_8X8) && (rate2 != INT_MAX)) { + assert(distortion2_y < INT64_MAX); + } +#endif + if (ref_frame == INTRA_FRAME) { // Keep record of best intra rd if (this_rd < best_intra_rd) { @@ -9875,12 +11374,18 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd), mbmi->ref_frame[0] != INTRA_FRAME); #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#if CONFIG_WARPED_MOTION + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); +#endif #if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION MODE_INFO *const mi = xd->mi[0]; const MOTION_MODE motion_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION +#if CONFIG_GLOBAL_MOTION 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION +#endif // CONFIG_GLOBAL_MOTION +#if CONFIG_WARPED_MOTION + xd, +#endif mi); if (motion_allowed == WARPED_CAUSAL) *returnrate_nocoef -= cpi->motion_mode_cost[bsize][mbmi->motion_mode]; @@ -9901,8 +11406,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd), this_skip2 || skippable); best_rate_uv = rate_uv; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 - if (bsize < BLOCK_8X8) rd_cost->dist_y = distortion2_y; +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) { + assert(distortion2_y < INT64_MAX); + rd_cost->dist_y = distortion2_y; + } #endif #if CONFIG_VAR_TX for (i = 0; i < MAX_MB_PLANE; ++i) @@ -9911,7 +11419,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif // CONFIG_VAR_TX } } - +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if ((bsize < BLOCK_8X8) && (rd_cost->rate != INT_MAX)) { + assert(rd_cost->dist_y < INT64_MAX); + } +#endif /* keep record of best compound/single-only prediction */ if (!disable_skip && ref_frame != INTRA_FRAME) { int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; @@ -9924,8 +11436,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, hybrid_rate = rate2 + compmode_cost; } - single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2); - hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2); + single_rd = RDCOST(x->rdmult, single_rate, distortion2); + hybrid_rd = RDCOST(x->rdmult, hybrid_rate, distortion2); if (!comp_pred) { if (single_rd < best_pred_rd[SINGLE_REFERENCE]) @@ -9963,6 +11475,15 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; } +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // Single ref compound mode + if (!has_second_ref(mbmi) && is_inter_singleref_comp_mode(mbmi->mode)) { + xd->block_refs[1] = xd->block_refs[0]; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[1] = xd->plane[i].pre[0]; + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (is_inter_mode(mbmi->mode)) { av1_build_inter_predictors_sb(cm, xd, mi_row, 
mi_col, NULL, bsize); #if CONFIG_MOTION_VAR @@ -9996,9 +11517,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); } - if (RDCOST(x->rdmult, x->rddiv, rd_stats_y.rate + rd_stats_uv.rate, + if (RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate, (rd_stats_y.dist + rd_stats_uv.dist)) > - RDCOST(x->rdmult, x->rddiv, 0, (rd_stats_y.sse + rd_stats_uv.sse))) { + RDCOST(x->rdmult, 0, (rd_stats_y.sse + rd_stats_uv.sse))) { skip_blk = 1; rd_stats_y.rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); rd_stats_uv.rate = 0; @@ -10009,8 +11530,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, rd_stats_y.rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); } - if (RDCOST(x->rdmult, x->rddiv, best_rate_y + best_rate_uv, rd_cost->dist) > - RDCOST(x->rdmult, x->rddiv, rd_stats_y.rate + rd_stats_uv.rate, + if (RDCOST(x->rdmult, best_rate_y + best_rate_uv, rd_cost->dist) > + RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate, (rd_stats_y.dist + rd_stats_uv.dist))) { #if CONFIG_VAR_TX int idx, idy; @@ -10031,15 +11552,24 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, rd_cost->rate += (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv); rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 - if (bsize < BLOCK_8X8) rd_cost->dist_y = rd_stats_y.dist; -#endif - rd_cost->rdcost = - RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); + rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); best_skip2 = skip_blk; +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) { + assert(rd_cost->rate != INT_MAX); + assert(rd_cost->dist_y < INT64_MAX); + rd_cost->dist_y = rd_stats_y.dist; + } +#endif } } +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if ((bsize < BLOCK_8X8) && (rd_cost->rate != INT_MAX)) { + assert(rd_cost->dist_y < INT64_MAX); + } +#endif + #if CONFIG_PALETTE // Only try palette mode when the best mode so far is an intra mode. if (try_palette && !is_inter_mode(best_mbmode.mode)) { @@ -10058,7 +11588,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, MB_MODE_INFO best_mbmi_palette = best_mbmode; mbmi->mode = DC_PRED; - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; rate_overhead_palette = rd_pick_palette_intra_sby( @@ -10119,7 +11649,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif // CONFIG_SUPERTX rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); } - this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + this_rd = RDCOST(x->rdmult, rate2, distortion2); if (this_rd < best_rd) { best_mode_index = 3; mbmi->mv[0].as_int = 0; @@ -10165,10 +11695,14 @@ PALETTE_EXIT: } #endif // CONFIG_FILTER_INTRA - // The inter modes' rate costs are not calculated precisely in some cases. - // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and - // ZEROMV. Here, checks are added for those cases, and the mode decisions - // are corrected. +// The inter modes' rate costs are not calculated precisely in some cases. +// Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and +// ZEROMV. Here, checks are added for those cases, and the mode decisions +// are corrected. 
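The comment above is the rationale for the block that follows: because inter-mode rate costs are only approximate, the search can pick NEWMV even though the winning motion vector is identical to the NEARESTMV, NEARMV or ZEROMV candidate, so the mode label is corrected afterwards to the cheaper signalling. A single-reference sketch of that relabelling (types and names are illustrative):

#include <stdint.h>

typedef struct { int32_t as_int; } mv_sketch;

enum mode_sketch { NEWMV_S, NEARESTMV_S, NEARMV_S, ZEROMV_S };

static enum mode_sketch correct_newmv_sketch(mv_sketch best, mv_sketch nearest,
                                             mv_sketch nearmv, mv_sketch zero) {
  if (best.as_int == nearest.as_int) return NEARESTMV_S;
  if (best.as_int == nearmv.as_int) return NEARMV_S;
  if (best.as_int == zero.as_int) return ZEROMV_S;
  return NEWMV_S;
}
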
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +// NOTE: For SR_NEW_NEWMV, no need to check as the two mvs from the same ref +// are surely different from each other. +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF if (best_mbmode.mode == NEWMV #if CONFIG_EXT_INTER || best_mbmode.mode == NEW_NEWMV @@ -10248,8 +11782,9 @@ PALETTE_EXIT: } if (nearestmv[0].as_int == best_mbmode.mv[0].as_int && - nearestmv[1].as_int == best_mbmode.mv[1].as_int) { + nearestmv[1].as_int == best_mbmode.mv[1].as_int) #if CONFIG_EXT_INTER + { best_mbmode.mode = NEAREST_NEARESTMV; } else { int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) @@ -10274,6 +11809,7 @@ PALETTE_EXIT: best_mbmode.mode = ZERO_ZEROMV; } #else + { best_mbmode.mode = NEARESTMV; } else if (best_mbmode.mv[0].as_int == zeromv[0].as_int && best_mbmode.mv[1].as_int == zeromv[1].as_int) { @@ -10287,11 +11823,18 @@ PALETTE_EXIT: // using a mode which can support ref_mv_idx if (best_mbmode.ref_mv_idx != 0 && #if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SINGLEREF + !(best_mbmode.mode == NEWMV || best_mbmode.mode == SR_NEW_NEWMV || + best_mbmode.mode == NEW_NEWMV || + have_nearmv_in_inter_mode(best_mbmode.mode))) +#else // !CONFIG_COMPOUND_SINGLEREF !(best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV || - have_nearmv_in_inter_mode(best_mbmode.mode))) { -#else - !(best_mbmode.mode == NEARMV || best_mbmode.mode == NEWMV)) { -#endif + have_nearmv_in_inter_mode(best_mbmode.mode))) +#endif // CONFIG_COMPOUND_SINGLEREF +#else // !CONFIG_EXT_INTER + !(best_mbmode.mode == NEARMV || best_mbmode.mode == NEWMV)) +#endif // CONFIG_EXT_INTER + { best_mbmode.ref_mv_idx = 0; } @@ -10377,11 +11920,12 @@ PALETTE_EXIT: ) { #if CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR // Correct the motion mode for ZEROMV - const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed( -#if SEPARATE_GLOBAL_MOTION - 0, xd->global_motion, -#endif // SEPARATE_GLOBAL_MOTION - xd->mi[0]); + const MOTION_MODE last_motion_mode_allowed = + motion_mode_allowed(0, xd->global_motion, +#if CONFIG_WARPED_MOTION + xd, +#endif + xd->mi[0]); if (mbmi->motion_mode > last_motion_mode_allowed) mbmi->motion_mode = last_motion_mode_allowed; #endif // CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR @@ -10445,7 +11989,11 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, int i; int64_t best_pred_diff[REFERENCE_MODES]; unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME]; +#if CONFIG_EXT_COMP_REFS + unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME][TOTAL_REFS_PER_FRAME]; +#else unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME]; +#endif // CONFIG_EXT_COMP_REFS aom_prob comp_mode_p; InterpFilter best_filter = SWITCHABLE; int64_t this_rd = INT64_MAX; @@ -10476,7 +12024,7 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, #endif // CONFIG_FILTER_INTRA mbmi->mode = ZEROMV; mbmi->motion_mode = SIMPLE_TRANSLATION; - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = LAST_FRAME; mbmi->ref_frame[1] = NONE_FRAME; #if CONFIG_GLOBAL_MOTION @@ -10501,7 +12049,17 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, #if CONFIG_WARPED_MOTION if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) { int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; +#if WARPED_MOTION_SORT_SAMPLES + int pts_mv[SAMPLES_ARRAY_SIZE]; + mbmi->num_proj_ref[0] = + findSamples(cm, xd, mi_row, mi_col, pts, pts_inref, pts_mv); + // Rank the samples by motion vector difference + if (mbmi->num_proj_ref[0] > 1) + mbmi->num_proj_ref[0] = sortSamples(pts_mv, 
&mbmi->mv[0].as_mv, pts, + pts_inref, mbmi->num_proj_ref[0]); +#else mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); +#endif // WARPED_MOTION_SORT_SAMPLES } #endif @@ -10548,12 +12106,12 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, // Estimate the reference frame signaling cost and add it // to the rolling cost variable. rate2 += ref_costs_single[LAST_FRAME]; - this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + this_rd = RDCOST(x->rdmult, rate2, distortion2); rd_cost->rate = rate2; rd_cost->dist = distortion2; rd_cost->rdcost = this_rd; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 if (bsize < BLOCK_8X8) rd_cost->dist_y = distortion2; #endif if (this_rd >= best_rd_so_far) { @@ -10646,7 +12204,8 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, // handle above row if (xd->up_available) { - const int overlap = num_4x4_blocks_high_lookup[bsize] * 2; + const int overlap = + AOMMIN(block_size_high[bsize] >> 1, block_size_high[BLOCK_64X64] >> 1); const int miw = AOMMIN(xd->n8_w, cm->mi_cols - mi_col); const int mi_row_offset = -1; const uint8_t *const mask1d = av1_get_obmc_mask(overlap); @@ -10666,7 +12225,9 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, &xd->mi[mi_col_offset + 1 + mi_row_offset * xd->mi_stride]->mbmi; #endif const BLOCK_SIZE a_bsize = AOMMAX(above_mbmi->sb_type, BLOCK_8X8); - const int mi_step = AOMMIN(xd->n8_w, mi_size_wide[a_bsize]); + const int above_step = + AOMMIN(mi_size_wide[a_bsize], mi_size_wide[BLOCK_64X64]); + const int mi_step = AOMMIN(xd->n8_w, above_step); const int neighbor_bw = mi_step * MI_SIZE; if (is_neighbor_overlappable(above_mbmi)) { @@ -10725,7 +12286,8 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, // handle left column if (xd->left_available) { - const int overlap = num_4x4_blocks_wide_lookup[bsize] * 2; + const int overlap = + AOMMIN(block_size_wide[bsize] >> 1, block_size_wide[BLOCK_64X64] >> 1); const int mih = AOMMIN(xd->n8_h, cm->mi_rows - mi_row); const int mi_col_offset = -1; const uint8_t *const mask1d = av1_get_obmc_mask(overlap); @@ -10746,7 +12308,9 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, &xd->mi[mi_col_offset + (mi_row_offset + 1) * xd->mi_stride]->mbmi; #endif const BLOCK_SIZE l_bsize = AOMMAX(left_mbmi->sb_type, BLOCK_8X8); - const int mi_step = AOMMIN(xd->n8_h, mi_size_high[l_bsize]); + const int left_step = + AOMMIN(mi_size_high[l_bsize], mi_size_high[BLOCK_64X64]); + const int mi_step = AOMMIN(xd->n8_h, left_step); const int neighbor_bh = mi_step * MI_SIZE; if (is_neighbor_overlappable(left_mbmi)) { @@ -10854,8 +12418,23 @@ void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); av1_subtract_plane(x, bsize, 0); +#if CONFIG_VAR_TX + if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { + select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + } else { + int idx, idy; + super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + for (idy = 0; idy < xd->n8_h; ++idy) + for (idx = 0; idx < xd->n8_w; ++idx) + mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; + memset(x->blk_skip[0], rd_stats_y.skip, + sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); + } + inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); +#else super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); super_block_uvrd(cpi, x, 
&rd_stats_uv, bsize, INT64_MAX); +#endif assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX); if (rd_stats_y.skip && rd_stats_uv.skip) { rd_stats_y.rate = rate_skip1; @@ -10863,10 +12442,10 @@ void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, rd_stats_y.dist = rd_stats_y.sse; rd_stats_uv.dist = rd_stats_uv.sse; skip_blk = 0; - } else if (RDCOST(x->rdmult, x->rddiv, + } else if (RDCOST(x->rdmult, (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0), (rd_stats_y.dist + rd_stats_uv.dist)) > - RDCOST(x->rdmult, x->rddiv, rate_skip1, + RDCOST(x->rdmult, rate_skip1, (rd_stats_y.sse + rd_stats_uv.sse))) { rd_stats_y.rate = rate_skip1; rd_stats_uv.rate = 0; @@ -10879,18 +12458,33 @@ void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, } backup_skip = skip_blk; backup_mbmi = *mbmi; - rd_causal = RDCOST(x->rdmult, x->rddiv, (rd_stats_y.rate + rd_stats_uv.rate), + rd_causal = RDCOST(x->rdmult, (rd_stats_y.rate + rd_stats_uv.rate), (rd_stats_y.dist + rd_stats_uv.dist)); - rd_causal += RDCOST(x->rdmult, x->rddiv, - av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 0), 0); + rd_causal += + RDCOST(x->rdmult, av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 0), 0); // Check non-causal mode mbmi->motion_mode = OBMC_CAUSAL; av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col); av1_subtract_plane(x, bsize, 0); +#if CONFIG_VAR_TX + if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { + select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + } else { + int idx, idy; + super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + for (idy = 0; idy < xd->n8_h; ++idy) + for (idx = 0; idx < xd->n8_w; ++idx) + mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; + memset(x->blk_skip[0], rd_stats_y.skip, + sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); + } + inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); +#else super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); +#endif assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX); if (rd_stats_y.skip && rd_stats_uv.skip) { rd_stats_y.rate = rate_skip1; @@ -10898,10 +12492,10 @@ void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, rd_stats_y.dist = rd_stats_y.sse; rd_stats_uv.dist = rd_stats_uv.sse; skip_blk = 0; - } else if (RDCOST(x->rdmult, x->rddiv, + } else if (RDCOST(x->rdmult, (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0), (rd_stats_y.dist + rd_stats_uv.dist)) > - RDCOST(x->rdmult, x->rddiv, rate_skip1, + RDCOST(x->rdmult, rate_skip1, (rd_stats_y.sse + rd_stats_uv.sse))) { rd_stats_y.rate = rate_skip1; rd_stats_uv.rate = 0; @@ -10914,9 +12508,8 @@ void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, } if (rd_causal > - RDCOST(x->rdmult, x->rddiv, - rd_stats_y.rate + rd_stats_uv.rate + - av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1), + RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate + + av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1), (rd_stats_y.dist + rd_stats_uv.dist))) { x->skip = skip_blk; } else { |