From 7369c7d7a5eed32963d8af37658286617919f91c Mon Sep 17 00:00:00 2001 From: trav90 Date: Thu, 18 Oct 2018 06:04:57 -0500 Subject: Update aom to commit id f5bdeac22930ff4c6b219be49c843db35970b918 --- third_party/aom/av1/encoder/aq_cyclicrefresh.c | 5 +- third_party/aom/av1/encoder/av1_quantize.c | 45 +- third_party/aom/av1/encoder/av1_quantize.h | 2 - third_party/aom/av1/encoder/bgsprite.c | 748 +++++ third_party/aom/av1/encoder/bgsprite.h | 30 + third_party/aom/av1/encoder/bitstream.c | 2604 ++++++++------- third_party/aom/av1/encoder/bitstream.h | 9 +- third_party/aom/av1/encoder/block.h | 14 +- third_party/aom/av1/encoder/context_tree.c | 26 +- third_party/aom/av1/encoder/context_tree.h | 4 +- third_party/aom/av1/encoder/cost.c | 18 + third_party/aom/av1/encoder/cost.h | 10 + third_party/aom/av1/encoder/dct.c | 606 +++- third_party/aom/av1/encoder/encodeframe.c | 1362 +++++--- third_party/aom/av1/encoder/encodeframe.h | 2 +- third_party/aom/av1/encoder/encodemb.c | 978 ++---- third_party/aom/av1/encoder/encodemb.h | 15 +- third_party/aom/av1/encoder/encodemv.c | 143 +- third_party/aom/av1/encoder/encodemv.h | 5 +- third_party/aom/av1/encoder/encoder.c | 1211 +++---- third_party/aom/av1/encoder/encoder.h | 119 +- third_party/aom/av1/encoder/encodetxb.c | 345 +- third_party/aom/av1/encoder/encodetxb.h | 17 +- third_party/aom/av1/encoder/ethread.c | 14 +- third_party/aom/av1/encoder/firstpass.c | 134 +- third_party/aom/av1/encoder/firstpass.h | 23 +- third_party/aom/av1/encoder/global_motion.c | 26 +- third_party/aom/av1/encoder/global_motion.h | 3 +- third_party/aom/av1/encoder/hybrid_fwd_txfm.c | 421 +-- third_party/aom/av1/encoder/hybrid_fwd_txfm.h | 17 +- third_party/aom/av1/encoder/mcomp.c | 245 +- third_party/aom/av1/encoder/mcomp.h | 17 +- third_party/aom/av1/encoder/palette.c | 48 - third_party/aom/av1/encoder/palette.h | 8 - third_party/aom/av1/encoder/pickcdef.c | 102 +- third_party/aom/av1/encoder/picklpf.c | 111 +- third_party/aom/av1/encoder/picklpf.h | 5 + third_party/aom/av1/encoder/pickrst.c | 75 +- third_party/aom/av1/encoder/ransac.c | 18 +- third_party/aom/av1/encoder/ratectrl.c | 69 +- third_party/aom/av1/encoder/ratectrl.h | 5 + third_party/aom/av1/encoder/rd.c | 196 +- third_party/aom/av1/encoder/rd.h | 150 +- third_party/aom/av1/encoder/rdopt.c | 3419 ++++++++++++++------ third_party/aom/av1/encoder/rdopt.h | 25 +- third_party/aom/av1/encoder/segmentation.c | 14 +- third_party/aom/av1/encoder/speed_features.c | 34 +- third_party/aom/av1/encoder/speed_features.h | 31 +- third_party/aom/av1/encoder/temporal_filter.c | 83 +- third_party/aom/av1/encoder/temporal_filter.h | 6 +- third_party/aom/av1/encoder/tokenize.c | 107 +- third_party/aom/av1/encoder/tokenize.h | 3 + .../aom/av1/encoder/x86/av1_highbd_quantize_avx2.c | 143 + .../aom/av1/encoder/x86/av1_highbd_quantize_sse4.c | 11 +- .../aom/av1/encoder/x86/av1_quantize_avx2.c | 289 ++ third_party/aom/av1/encoder/x86/dct_intrin_sse2.c | 60 +- .../aom/av1/encoder/x86/error_intrin_avx2.c | 25 +- .../aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c | 18 +- .../aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c | 73 +- 59 files changed, 9091 insertions(+), 5255 deletions(-) create mode 100644 third_party/aom/av1/encoder/bgsprite.c create mode 100644 third_party/aom/av1/encoder/bgsprite.h create mode 100644 third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_quantize_avx2.c (limited to 'third_party/aom/av1/encoder') diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c 
b/third_party/aom/av1/encoder/aq_cyclicrefresh.c index b2b410617..05aa28c9f 100644 --- a/third_party/aom/av1/encoder/aq_cyclicrefresh.c +++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c @@ -352,10 +352,7 @@ void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) { // For video conference clips, if the background has high motion in current // frame because of the camera movement, set this frame as the golden frame. // Use 70% and 5% as the thresholds for golden frame refreshing. - // Also, force this frame as a golden update frame if this frame will change - // the resolution (av1_resize_pending != 0). - if (av1_resize_pending(cpi) || - (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1)) { + if (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1) { av1_cyclic_refresh_set_golden_update(cpi); rc->frames_till_gf_update_due = rc->baseline_gf_interval; diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c index 63727df1f..dd53d4223 100644 --- a/third_party/aom/av1/encoder/av1_quantize.c +++ b/third_party/aom/av1/encoder/av1_quantize.c @@ -845,7 +845,6 @@ void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } #endif // CONFIG_NEW_QUANT -#if CONFIG_HIGHBITDEPTH void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, @@ -899,14 +898,29 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, switch (qparam->log_scale) { case 0: - aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr, - pd->dequant, eob_ptr, sc->scan, sc->iscan + if (LIKELY(n_coeffs >= 8)) { + aom_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, + p->round, p->quant, p->quant_shift, qcoeff_ptr, + dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan, + sc->iscan #if CONFIG_AOM_QM - , - qm_ptr, iqm_ptr + , + qm_ptr, iqm_ptr #endif - ); + ); + } else { + // TODO(luoyi): Need SIMD (e.g. 
sse2) for smaller block size + // quantization + aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, skip_block, p->zbin, + p->round, p->quant, p->quant_shift, qcoeff_ptr, + dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan, + sc->iscan +#if CONFIG_AOM_QM + , + qm_ptr, iqm_ptr +#endif + ); + } break; case 1: aom_highbd_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, @@ -936,7 +950,6 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, } } -#if CONFIG_HIGHBITDEPTH static INLINE void highbd_quantize_dc( const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, @@ -958,14 +971,13 @@ static INLINE void highbd_quantize_dc( const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int64_t tmp = abs_coeff + round_ptr[0]; - const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> (16 - log_scale)); + const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale)); qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / (1 << log_scale); if (abs_qcoeff) eob = 0; } *eob_ptr = eob + 1; } -#endif // CONFIG_HIGHBITDEPTH void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, @@ -1504,9 +1516,7 @@ void av1_highbd_quantize_dc_nuq_facade( } } #endif // CONFIG_NEW_QUANT -#endif // CONFIG_HIGHBITDEPTH -#if CONFIG_HIGHBITDEPTH void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -1547,15 +1557,14 @@ void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, #endif const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + round_ptr[rc != 0]; + const int64_t tmp = abs_coeff + (round_ptr[rc != 0] >> log_scale); #if CONFIG_AOM_QM - const uint32_t abs_qcoeff = - (uint32_t)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS)); + const int abs_qcoeff = + (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS)); qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / scale; #else - const uint32_t abs_qcoeff = - (uint32_t)((tmp * quant_ptr[rc != 0]) >> shift); + const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> shift); qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / scale; #endif @@ -1565,8 +1574,6 @@ void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, *eob_ptr = eob + 1; } -#endif // CONFIG_HIGHBITDEPTH - static void invert_quant(int16_t *quant, int16_t *shift, int d) { uint32_t t; int l, m; diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h index 4bc9cccc2..e5fc8b528 100644 --- a/third_party/aom/av1/encoder/av1_quantize.h +++ b/third_party/aom/av1/encoder/av1_quantize.h @@ -146,7 +146,6 @@ void av1_quantize_dc_nuq_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const QUANT_PARAM *qparam); #endif // CONFIG_NEW_QUANT -#if CONFIG_HIGHBITDEPTH void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, @@ -190,7 +189,6 @@ void av1_highbd_quantize_dc_nuq_facade( tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); #endif // CONFIG_NEW_QUANT -#endif // 
CONFIG_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/bgsprite.c b/third_party/aom/av1/encoder/bgsprite.c new file mode 100644 index 000000000..64deade06 --- /dev/null +++ b/third_party/aom/av1/encoder/bgsprite.c @@ -0,0 +1,748 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#define _POSIX_C_SOURCE 200112L // rand_r() +#include +#include +#include +#include +#include +#include + +#include "av1/encoder/bgsprite.h" + +#include "aom_mem/aom_mem.h" +#include "./aom_scale_rtcd.h" +#include "av1/common/mv.h" +#include "av1/common/warped_motion.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/global_motion.h" +#include "av1/encoder/mathutils.h" +#include "av1/encoder/temporal_filter.h" + +/* Blending Modes: + * 0 = Median + * 1 = Mean + */ +#define BGSPRITE_BLENDING_MODE 1 + +/* Interpolation for panorama alignment sampling: + * 0 = Nearest neighbor + * 1 = Bilinear + */ +#define BGSPRITE_INTERPOLATION 0 + +#define TRANSFORM_MAT_DIM 3 + +typedef struct { +#if CONFIG_HIGHBITDEPTH + uint16_t y; + uint16_t u; + uint16_t v; +#else + uint8_t y; + uint8_t u; + uint8_t v; +#endif // CONFIG_HIGHBITDEPTH +} YuvPixel; + +// Maps to convert from matrix form to param vector form. +static const int params_to_matrix_map[] = { 2, 3, 0, 4, 5, 1, 6, 7 }; +static const int matrix_to_params_map[] = { 2, 5, 0, 1, 3, 4, 6, 7 }; + +// Convert the parameter array to a 3x3 matrix form. +static void params_to_matrix(const double *const params, double *target) { + for (int i = 0; i < MAX_PARAMDIM - 1; i++) { + assert(params_to_matrix_map[i] < MAX_PARAMDIM - 1); + target[i] = params[params_to_matrix_map[i]]; + } + target[8] = 1; +} + +// Convert a 3x3 matrix to a parameter array form. +static void matrix_to_params(const double *const matrix, double *target) { + for (int i = 0; i < MAX_PARAMDIM - 1; i++) { + assert(matrix_to_params_map[i] < MAX_PARAMDIM - 1); + target[i] = matrix[matrix_to_params_map[i]]; + } +} + +// Do matrix multiplication on params. +static void multiply_params(double *const m1, double *const m2, + double *target) { + double m1_matrix[MAX_PARAMDIM]; + double m2_matrix[MAX_PARAMDIM]; + double result[MAX_PARAMDIM]; + + params_to_matrix(m1, m1_matrix); + params_to_matrix(m2, m2_matrix); + multiply_mat(m2_matrix, m1_matrix, result, TRANSFORM_MAT_DIM, + TRANSFORM_MAT_DIM, TRANSFORM_MAT_DIM); + matrix_to_params(result, target); +} + +// Finds x and y limits of a single transformed image. +// Width and height are the size of the input video. +static void find_frame_limit(int width, int height, + const double *const transform, int *x_min, + int *x_max, int *y_min, int *y_max) { + double transform_matrix[MAX_PARAMDIM]; + double xy_matrix[3] = { 0, 0, 1 }; + double uv_matrix[3] = { 0 }; +// Macro used to update frame limits based on transformed coordinates. 
+#define UPDATELIMITS(u, v, x_min, x_max, y_min, y_max) \ + { \ + if ((int)ceil(u) > *x_max) { \ + *x_max = (int)ceil(u); \ + } \ + if ((int)floor(u) < *x_min) { \ + *x_min = (int)floor(u); \ + } \ + if ((int)ceil(v) > *y_max) { \ + *y_max = (int)ceil(v); \ + } \ + if ((int)floor(v) < *y_min) { \ + *y_min = (int)floor(v); \ + } \ + } + + params_to_matrix(transform, transform_matrix); + xy_matrix[0] = 0; + xy_matrix[1] = 0; + multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM, + TRANSFORM_MAT_DIM, 1); + *x_max = (int)ceil(uv_matrix[0]); + *x_min = (int)floor(uv_matrix[0]); + *y_max = (int)ceil(uv_matrix[1]); + *y_min = (int)floor(uv_matrix[1]); + + xy_matrix[0] = width; + xy_matrix[1] = 0; + multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM, + TRANSFORM_MAT_DIM, 1); + UPDATELIMITS(uv_matrix[0], uv_matrix[1], x_min, x_max, y_min, y_max); + + xy_matrix[0] = width; + xy_matrix[1] = height; + multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM, + TRANSFORM_MAT_DIM, 1); + UPDATELIMITS(uv_matrix[0], uv_matrix[1], x_min, x_max, y_min, y_max); + + xy_matrix[0] = 0; + xy_matrix[1] = height; + multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM, + TRANSFORM_MAT_DIM, 1); + UPDATELIMITS(uv_matrix[0], uv_matrix[1], x_min, x_max, y_min, y_max); + +#undef UPDATELIMITS +} + +// Finds x and y limits for arrays. Also finds the overall max and minimums +static void find_limits(int width, int height, const double **const params, + int num_frames, int *x_min, int *x_max, int *y_min, + int *y_max, int *pano_x_min, int *pano_x_max, + int *pano_y_min, int *pano_y_max) { + *pano_x_max = INT_MIN; + *pano_x_min = INT_MAX; + *pano_y_max = INT_MIN; + *pano_y_min = INT_MAX; + for (int i = 0; i < num_frames; ++i) { + find_frame_limit(width, height, (const double *const)params[i], &x_min[i], + &x_max[i], &y_min[i], &y_max[i]); + if (x_max[i] > *pano_x_max) { + *pano_x_max = x_max[i]; + } + if (x_min[i] < *pano_x_min) { + *pano_x_min = x_min[i]; + } + if (y_max[i] > *pano_y_max) { + *pano_y_max = y_max[i]; + } + if (y_min[i] < *pano_y_min) { + *pano_y_min = y_min[i]; + } + } +} + +// Inverts a 3x3 matrix that is in the parameter form. +static void invert_params(const double *const params, double *target) { + double temp[MAX_PARAMDIM] = { 0 }; + params_to_matrix(params, temp); + + // Find determinant of matrix (expansion by minors). + const double det = temp[0] * ((temp[4] * temp[8]) - (temp[5] * temp[7])) - + temp[1] * ((temp[3] * temp[8]) - (temp[5] * temp[6])) + + temp[2] * ((temp[3] * temp[7]) - (temp[4] * temp[6])); + assert(det != 0); + + // inverse is transpose of cofactor * 1/det. + double inverse[MAX_PARAMDIM] = { 0 }; + inverse[0] = (temp[4] * temp[8] - temp[7] * temp[5]) / det; + inverse[1] = (temp[2] * temp[7] - temp[1] * temp[8]) / det; + inverse[2] = (temp[1] * temp[5] - temp[2] * temp[4]) / det; + inverse[3] = (temp[5] * temp[6] - temp[3] * temp[8]) / det; + inverse[4] = (temp[0] * temp[8] - temp[2] * temp[6]) / det; + inverse[5] = (temp[3] * temp[2] - temp[0] * temp[5]) / det; + inverse[6] = (temp[3] * temp[7] - temp[6] * temp[4]) / det; + inverse[7] = (temp[6] * temp[1] - temp[0] * temp[7]) / det; + inverse[8] = (temp[0] * temp[4] - temp[3] * temp[1]) / det; + + matrix_to_params(inverse, target); +} + +#if BGSPRITE_BLENDING_MODE == 0 +// swaps two YuvPixels. +static void swap_yuv(YuvPixel *a, YuvPixel *b) { + const YuvPixel temp = *b; + *b = *a; + *a = temp; +} + +// Partitions array to find pivot index in qselect. 
+static int partition(YuvPixel arr[], int left, int right, int pivot_idx) { + YuvPixel pivot = arr[pivot_idx]; + + // Move pivot to the end. + swap_yuv(&arr[pivot_idx], &arr[right]); + + int p_idx = left; + for (int i = left; i < right; ++i) { + if (arr[i].y <= pivot.y) { + swap_yuv(&arr[i], &arr[p_idx]); + p_idx++; + } + } + + swap_yuv(&arr[p_idx], &arr[right]); + + return p_idx; +} + +// Returns the kth element in array, partially sorted in place (quickselect). +static YuvPixel qselect(YuvPixel arr[], int left, int right, int k) { + if (left >= right) { + return arr[left]; + } + unsigned int seed = (int)time(NULL); + int pivot_idx = left + rand_r(&seed) % (right - left + 1); + pivot_idx = partition(arr, left, right, pivot_idx); + + if (k == pivot_idx) { + return arr[k]; + } else if (k < pivot_idx) { + return qselect(arr, left, pivot_idx - 1, k); + } else { + return qselect(arr, pivot_idx + 1, right, k); + } +} +#endif // BGSPRITE_BLENDING_MODE == 0 + +// Stitches images together to create ARF and stores it in 'panorama'. +static void stitch_images(YV12_BUFFER_CONFIG **const frames, + const int num_frames, const int center_idx, + const double **const params, const int *const x_min, + const int *const x_max, const int *const y_min, + const int *const y_max, int pano_x_min, + int pano_x_max, int pano_y_min, int pano_y_max, + YV12_BUFFER_CONFIG *panorama) { + const int width = pano_x_max - pano_x_min + 1; + const int height = pano_y_max - pano_y_min + 1; + + // Create temp_pano[y][x][num_frames] stack of pixel values + YuvPixel ***temp_pano = aom_malloc(height * sizeof(*temp_pano)); + for (int i = 0; i < height; ++i) { + temp_pano[i] = aom_malloc(width * sizeof(**temp_pano)); + for (int j = 0; j < width; ++j) { + temp_pano[i][j] = aom_malloc(num_frames * sizeof(***temp_pano)); + } + } + // Create count[y][x] to count how many values in stack for median filtering + int **count = aom_malloc(height * sizeof(*count)); + for (int i = 0; i < height; ++i) { + count[i] = aom_calloc(width, sizeof(**count)); // counts initialized to 0 + } + + // Re-sample images onto panorama (pre-median filtering). + const int x_offset = -pano_x_min; + const int y_offset = -pano_y_min; + const int frame_width = frames[0]->y_width; + const int frame_height = frames[0]->y_height; + for (int i = 0; i < num_frames; ++i) { + // Find transforms from panorama coordinate system back to single image + // coordinate system for sampling. + int transformed_width = x_max[i] - x_min[i] + 1; + int transformed_height = y_max[i] - y_min[i] + 1; + + double transform_matrix[MAX_PARAMDIM]; + double transform_params[MAX_PARAMDIM - 1]; + invert_params(params[i], transform_params); + params_to_matrix(transform_params, transform_matrix); + +#if CONFIG_HIGHBITDEPTH + const uint16_t *y_buffer16 = CONVERT_TO_SHORTPTR(frames[i]->y_buffer); + const uint16_t *u_buffer16 = CONVERT_TO_SHORTPTR(frames[i]->u_buffer); + const uint16_t *v_buffer16 = CONVERT_TO_SHORTPTR(frames[i]->v_buffer); +#endif // CONFIG_HIGHBITDEPTH + + for (int y = 0; y < transformed_height; ++y) { + for (int x = 0; x < transformed_width; ++x) { + // Do transform. + double xy_matrix[3] = { x + x_min[i], y + y_min[i], 1 }; + double uv_matrix[3] = { 0 }; + multiply_mat(transform_matrix, xy_matrix, uv_matrix, TRANSFORM_MAT_DIM, + TRANSFORM_MAT_DIM, 1); + + // Coordinates used for nearest neighbor interpolation. 
+ int image_x = (int)round(uv_matrix[0]); + int image_y = (int)round(uv_matrix[1]); + + // Temporary values for bilinear interpolation + double interpolated_yvalue = 0.0; + double interpolated_uvalue = 0.0; + double interpolated_vvalue = 0.0; + double interpolated_fraction = 0.0; + int interpolation_count = 0; + +#if BGSPRITE_INTERPOLATION == 1 + // Coordintes used for bilinear interpolation. + double x_base; + double y_base; + double x_decimal = modf(uv_matrix[0], &x_base); + double y_decimal = modf(uv_matrix[1], &y_base); + + if ((x_decimal > 0.2 && x_decimal < 0.8) || + (y_decimal > 0.2 && y_decimal < 0.8)) { + for (int u = 0; u < 2; ++u) { + for (int v = 0; v < 2; ++v) { + int interp_x = (int)x_base + u; + int interp_y = (int)y_base + v; + if (interp_x >= 0 && interp_x < frame_width && interp_y >= 0 && + interp_y < frame_height) { + interpolation_count++; + + interpolated_fraction += + fabs(u - x_decimal) * fabs(v - y_decimal); + int ychannel_idx = interp_y * frames[i]->y_stride + interp_x; + int uvchannel_idx = (interp_y >> frames[i]->subsampling_y) * + frames[i]->uv_stride + + (interp_x >> frames[i]->subsampling_x); +#if CONFIG_HIGHBITDEPTH + if (frames[i]->flags & YV12_FLAG_HIGHBITDEPTH) { + interpolated_yvalue += (1 - fabs(u - x_decimal)) * + (1 - fabs(v - y_decimal)) * + y_buffer16[ychannel_idx]; + interpolated_uvalue += (1 - fabs(u - x_decimal)) * + (1 - fabs(v - y_decimal)) * + u_buffer16[uvchannel_idx]; + interpolated_vvalue += (1 - fabs(u - x_decimal)) * + (1 - fabs(v - y_decimal)) * + v_buffer16[uvchannel_idx]; + } else { +#endif // CONFIG_HIGHBITDEPTH + interpolated_yvalue += (1 - fabs(u - x_decimal)) * + (1 - fabs(v - y_decimal)) * + frames[i]->y_buffer[ychannel_idx]; + interpolated_uvalue += (1 - fabs(u - x_decimal)) * + (1 - fabs(v - y_decimal)) * + frames[i]->u_buffer[uvchannel_idx]; + interpolated_vvalue += (1 - fabs(u - x_decimal)) * + (1 - fabs(v - y_decimal)) * + frames[i]->v_buffer[uvchannel_idx]; +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + } + } + } + } +#endif // BGSPRITE_INTERPOLATION == 1 + + if (BGSPRITE_INTERPOLATION && interpolation_count > 2) { + if (interpolation_count != 4) { + interpolated_yvalue /= interpolated_fraction; + interpolated_uvalue /= interpolated_fraction; + interpolated_vvalue /= interpolated_fraction; + } + int pano_x = x + x_min[i] + x_offset; + int pano_y = y + y_min[i] + y_offset; + +#if CONFIG_HIGHBITDEPTH + if (frames[i]->flags & YV12_FLAG_HIGHBITDEPTH) { + temp_pano[pano_y][pano_x][count[pano_y][pano_x]].y = + (uint16_t)interpolated_yvalue; + temp_pano[pano_y][pano_x][count[pano_y][pano_x]].u = + (uint16_t)interpolated_uvalue; + temp_pano[pano_y][pano_x][count[pano_y][pano_x]].v = + (uint16_t)interpolated_vvalue; + } else { +#endif // CONFIG_HIGHBITDEPTH + temp_pano[pano_y][pano_x][count[pano_y][pano_x]].y = + (uint8_t)interpolated_yvalue; + temp_pano[pano_y][pano_x][count[pano_y][pano_x]].u = + (uint8_t)interpolated_uvalue; + temp_pano[pano_y][pano_x][count[pano_y][pano_x]].v = + (uint8_t)interpolated_vvalue; +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + ++count[pano_y][pano_x]; + } else if (image_x >= 0 && image_x < frame_width && image_y >= 0 && + image_y < frame_height) { + // Place in panorama stack. 
+ int pano_x = x + x_min[i] + x_offset; + int pano_y = y + y_min[i] + y_offset; + + int ychannel_idx = image_y * frames[i]->y_stride + image_x; + int uvchannel_idx = + (image_y >> frames[i]->subsampling_y) * frames[i]->uv_stride + + (image_x >> frames[i]->subsampling_x); +#if CONFIG_HIGHBITDEPTH + if (frames[i]->flags & YV12_FLAG_HIGHBITDEPTH) { + temp_pano[pano_y][pano_x][count[pano_y][pano_x]].y = + y_buffer16[ychannel_idx]; + temp_pano[pano_y][pano_x][count[pano_y][pano_x]].u = + u_buffer16[uvchannel_idx]; + temp_pano[pano_y][pano_x][count[pano_y][pano_x]].v = + v_buffer16[uvchannel_idx]; + } else { +#endif // CONFIG_HIGHBITDEPTH + temp_pano[pano_y][pano_x][count[pano_y][pano_x]].y = + frames[i]->y_buffer[ychannel_idx]; + temp_pano[pano_y][pano_x][count[pano_y][pano_x]].u = + frames[i]->u_buffer[uvchannel_idx]; + temp_pano[pano_y][pano_x][count[pano_y][pano_x]].v = + frames[i]->v_buffer[uvchannel_idx]; +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + ++count[pano_y][pano_x]; + } + } + } + } + +#if BGSPRITE_BLENDING_MODE == 1 + // Apply mean filtering and store result in temp_pano[y][x][0]. + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + if (count[y][x] == 0) { + // Just make the pixel black. + // TODO(toddnguyen): Color the pixel with nearest neighbor + } else { + // Find + uint32_t y_sum = 0; + uint32_t u_sum = 0; + uint32_t v_sum = 0; + for (int i = 0; i < count[y][x]; ++i) { + y_sum += temp_pano[y][x][i].y; + u_sum += temp_pano[y][x][i].u; + v_sum += temp_pano[y][x][i].v; + } + + const uint32_t unsigned_count = (uint32_t)count[y][x]; + +#if CONFIG_HIGHBITDEPTH + if (panorama->flags & YV12_FLAG_HIGHBITDEPTH) { + temp_pano[y][x][0].y = (uint16_t)OD_DIVU(y_sum, unsigned_count); + temp_pano[y][x][0].u = (uint16_t)OD_DIVU(u_sum, unsigned_count); + temp_pano[y][x][0].v = (uint16_t)OD_DIVU(v_sum, unsigned_count); + } else { +#endif // CONFIG_HIGHBITDEPTH + temp_pano[y][x][0].y = (uint8_t)OD_DIVU(y_sum, unsigned_count); + temp_pano[y][x][0].u = (uint8_t)OD_DIVU(u_sum, unsigned_count); + temp_pano[y][x][0].v = (uint8_t)OD_DIVU(v_sum, unsigned_count); +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + } + } + } +#else + // Apply median filtering using quickselect. + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + if (count[y][x] == 0) { + // Just make the pixel black. + // TODO(toddnguyen): Color the pixel with nearest neighbor + } else { + // Find + const int median_idx = (int)floor(count[y][x] / 2); + YuvPixel median = + qselect(temp_pano[y][x], 0, count[y][x] - 1, median_idx); + + // Make the median value the 0th index for UV subsampling later + temp_pano[y][x][0] = median; + assert(median.y == temp_pano[y][x][0].y && + median.u == temp_pano[y][x][0].u && + median.v == temp_pano[y][x][0].v); + } + } + } +#endif // BGSPRITE_BLENDING_MODE == 1 + + // NOTE(toddnguyen): Right now the ARF in the cpi struct is fixed size at + // the same size as the frames. For now, we crop the generated panorama. + // assert(panorama->y_width < width && panorama->y_height < height); + const int crop_x_offset = x_min[center_idx] + x_offset; + const int crop_y_offset = y_min[center_idx] + y_offset; + +#if CONFIG_HIGHBITDEPTH + if (panorama->flags & YV12_FLAG_HIGHBITDEPTH) { + // Use median Y value. 
+ uint16_t *pano_y_buffer16 = CONVERT_TO_SHORTPTR(panorama->y_buffer); + for (int y = 0; y < panorama->y_height; ++y) { + for (int x = 0; x < panorama->y_width; ++x) { + const int ychannel_idx = y * panorama->y_stride + x; + if (count[y + crop_y_offset][x + crop_x_offset] > 0) { + pano_y_buffer16[ychannel_idx] = + temp_pano[y + crop_y_offset][x + crop_x_offset][0].y; + } else { + pano_y_buffer16[ychannel_idx] = 0; + } + } + } + + // UV subsampling with median UV values + uint16_t *pano_u_buffer16 = CONVERT_TO_SHORTPTR(panorama->u_buffer); + uint16_t *pano_v_buffer16 = CONVERT_TO_SHORTPTR(panorama->v_buffer); + + for (int y = 0; y < panorama->uv_height; ++y) { + for (int x = 0; x < panorama->uv_width; ++x) { + uint32_t avg_count = 0; + uint32_t u_sum = 0; + uint32_t v_sum = 0; + + // Look at surrounding pixels for subsampling + for (int s_x = 0; s_x < panorama->subsampling_x + 1; ++s_x) { + for (int s_y = 0; s_y < panorama->subsampling_y + 1; ++s_y) { + int y_sample = crop_y_offset + (y << panorama->subsampling_y) + s_y; + int x_sample = crop_x_offset + (x << panorama->subsampling_x) + s_x; + if (y_sample > 0 && y_sample < height && x_sample > 0 && + x_sample < width && count[y_sample][x_sample] > 0) { + u_sum += temp_pano[y_sample][x_sample][0].u; + v_sum += temp_pano[y_sample][x_sample][0].v; + avg_count++; + } + } + } + + const int uvchannel_idx = y * panorama->uv_stride + x; + if (avg_count != 0) { + pano_u_buffer16[uvchannel_idx] = (uint16_t)OD_DIVU(u_sum, avg_count); + pano_v_buffer16[uvchannel_idx] = (uint16_t)OD_DIVU(v_sum, avg_count); + } else { + pano_u_buffer16[uvchannel_idx] = 0; + pano_v_buffer16[uvchannel_idx] = 0; + } + } + } + } else { +#endif // CONFIG_HIGHBITDEPTH + // Use median Y value. + for (int y = 0; y < panorama->y_height; ++y) { + for (int x = 0; x < panorama->y_width; ++x) { + const int ychannel_idx = y * panorama->y_stride + x; + if (count[y + crop_y_offset][x + crop_x_offset] > 0) { + panorama->y_buffer[ychannel_idx] = + temp_pano[y + crop_y_offset][x + crop_x_offset][0].y; + } else { + panorama->y_buffer[ychannel_idx] = 0; + } + } + } + + // UV subsampling with median UV values + for (int y = 0; y < panorama->uv_height; ++y) { + for (int x = 0; x < panorama->uv_width; ++x) { + uint16_t avg_count = 0; + uint16_t u_sum = 0; + uint16_t v_sum = 0; + + // Look at surrounding pixels for subsampling + for (int s_x = 0; s_x < panorama->subsampling_x + 1; ++s_x) { + for (int s_y = 0; s_y < panorama->subsampling_y + 1; ++s_y) { + int y_sample = crop_y_offset + (y << panorama->subsampling_y) + s_y; + int x_sample = crop_x_offset + (x << panorama->subsampling_x) + s_x; + if (y_sample > 0 && y_sample < height && x_sample > 0 && + x_sample < width && count[y_sample][x_sample] > 0) { + u_sum += temp_pano[y_sample][x_sample][0].u; + v_sum += temp_pano[y_sample][x_sample][0].v; + avg_count++; + } + } + } + + const int uvchannel_idx = y * panorama->uv_stride + x; + if (avg_count != 0) { + panorama->u_buffer[uvchannel_idx] = + (uint8_t)OD_DIVU(u_sum, avg_count); + panorama->v_buffer[uvchannel_idx] = + (uint8_t)OD_DIVU(v_sum, avg_count); + } else { + panorama->u_buffer[uvchannel_idx] = 0; + panorama->v_buffer[uvchannel_idx] = 0; + } + } + } +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + aom_free(temp_pano[i][j]); + } + aom_free(temp_pano[i]); + aom_free(count[i]); + } + aom_free(count); + aom_free(temp_pano); +} + +int av1_background_sprite(AV1_COMP *cpi, int distance) { + YV12_BUFFER_CONFIG 
*frames[MAX_LAG_BUFFERS] = { NULL }; + static const double identity_params[MAX_PARAMDIM - 1] = { + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0 + }; + + const int frames_after_arf = + av1_lookahead_depth(cpi->lookahead) - distance - 1; + int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1; + int frames_bwd; + + // Define the forward and backwards filter limits for this arnr group. + if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf; + if (frames_fwd > distance) frames_fwd = distance; + frames_bwd = frames_fwd; + +#if CONFIG_EXT_REFS + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) { + cpi->alt_ref_buffer = av1_lookahead_peek(cpi->lookahead, distance)->img; + cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 1; + frames_fwd = 0; + frames_bwd = 0; + } else { + cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 0; + } +#endif // CONFIG_EXT_REFS + + const int start_frame = distance + frames_fwd; + const int frames_to_stitch = frames_bwd + 1 + frames_fwd; + + // Get frames to be included in background sprite. + for (int frame = 0; frame < frames_to_stitch; ++frame) { + const int which_buffer = start_frame - frame; + struct lookahead_entry *buf = + av1_lookahead_peek(cpi->lookahead, which_buffer); + frames[frames_to_stitch - 1 - frame] = &buf->img; + } + + YV12_BUFFER_CONFIG temp_bg; + memset(&temp_bg, 0, sizeof(temp_bg)); + aom_alloc_frame_buffer(&temp_bg, frames[0]->y_width, frames[0]->y_height, + frames[0]->subsampling_x, frames[0]->subsampling_y, +#if CONFIG_HIGHBITDEPTH + frames[0]->flags & YV12_FLAG_HIGHBITDEPTH, +#endif + frames[0]->border, 0); + aom_yv12_copy_frame(frames[0], &temp_bg); + temp_bg.bit_depth = frames[0]->bit_depth; + + // Allocate empty arrays for parameters between frames. + double **params = aom_malloc(frames_to_stitch * sizeof(*params)); + for (int i = 0; i < frames_to_stitch; ++i) { + params[i] = aom_malloc(sizeof(identity_params)); + memcpy(params[i], identity_params, sizeof(identity_params)); + } + + // Use global motion to find affine transformations between frames. + // params[i] will have the transform from frame[i] to frame[i-1]. + // params[0] will have the identity matrix because it has no previous frame. + TransformationType model = AFFINE; + int inliers_by_motion[RANSAC_NUM_MOTIONS]; + for (int frame = 0; frame < frames_to_stitch - 1; ++frame) { + const int global_motion_ret = compute_global_motion_feature_based( + model, frames[frame + 1], frames[frame], +#if CONFIG_HIGHBITDEPTH + cpi->common.bit_depth, +#endif // CONFIG_HIGHBITDEPTH + inliers_by_motion, params[frame + 1], RANSAC_NUM_MOTIONS); + + // Quit if global motion had an error. + if (global_motion_ret == 0) { + for (int i = 0; i < frames_to_stitch; ++i) { + aom_free(params[i]); + } + aom_free(params); + return 1; + } + } + + // Compound the transformation parameters. + for (int i = 1; i < frames_to_stitch; ++i) { + multiply_params(params[i - 1], params[i], params[i]); + } + + // Compute frame limits for final stitched images. 
+ int pano_x_max = INT_MIN; + int pano_x_min = INT_MAX; + int pano_y_max = INT_MIN; + int pano_y_min = INT_MAX; + int *x_max = aom_malloc(frames_to_stitch * sizeof(*x_max)); + int *x_min = aom_malloc(frames_to_stitch * sizeof(*x_min)); + int *y_max = aom_malloc(frames_to_stitch * sizeof(*y_max)); + int *y_min = aom_malloc(frames_to_stitch * sizeof(*y_min)); + + find_limits(cpi->initial_width, cpi->initial_height, + (const double **const)params, frames_to_stitch, x_min, x_max, + y_min, y_max, &pano_x_min, &pano_x_max, &pano_y_min, &pano_y_max); + + // Center panorama on the ARF. + const int center_idx = frames_bwd; + assert(center_idx >= 0 && center_idx < frames_to_stitch); + + // Recompute transformations to adjust to center image. + // Invert center image's transform. + double inverse[MAX_PARAMDIM - 1] = { 0 }; + invert_params(params[center_idx], inverse); + + // Multiply the inverse to all transformation parameters. + for (int i = 0; i < frames_to_stitch; ++i) { + multiply_params(inverse, params[i], params[i]); + } + + // Recompute frame limits for new adjusted center. + find_limits(cpi->initial_width, cpi->initial_height, + (const double **const)params, frames_to_stitch, x_min, x_max, + y_min, y_max, &pano_x_min, &pano_x_max, &pano_y_min, &pano_y_max); + + // Stitch Images. + stitch_images(frames, frames_to_stitch, center_idx, + (const double **const)params, x_min, x_max, y_min, y_max, + pano_x_min, pano_x_max, pano_y_min, pano_y_max, &temp_bg); + + // Apply temporal filter. + av1_temporal_filter(cpi, &temp_bg, distance); + + // Free memory. + aom_free_frame_buffer(&temp_bg); + for (int i = 0; i < frames_to_stitch; ++i) { + aom_free(params[i]); + } + aom_free(params); + aom_free(x_max); + aom_free(x_min); + aom_free(y_max); + aom_free(y_min); + + return 0; +} diff --git a/third_party/aom/av1/encoder/bgsprite.h b/third_party/aom/av1/encoder/bgsprite.h new file mode 100644 index 000000000..711b00e40 --- /dev/null +++ b/third_party/aom/av1/encoder/bgsprite.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_BGSPRITE_H_ +#define AV1_ENCODER_BGSPRITE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/encoder.h" + +// Creates alternate reference frame staring from source image + frames up to +// 'distance' past source frame. +// Returns 0 on success and 1 on failure. 
+int av1_background_sprite(AV1_COMP *cpi, int distance); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AV1_ENCODER_BGSPRITE_H_ diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c index f8378b14d..2e0abc186 100644 --- a/third_party/aom/av1/encoder/bitstream.c +++ b/third_party/aom/av1/encoder/bitstream.c @@ -26,7 +26,6 @@ #if CONFIG_CDEF #include "av1/common/cdef.h" -#include "av1/common/clpf.h" #endif // CONFIG_CDEF #include "av1/common/entropy.h" #include "av1/common/entropymode.h" @@ -61,21 +60,12 @@ #include "av1/encoder/pvq_encoder.h" #endif -static struct av1_token intra_mode_encodings[INTRA_MODES]; -static struct av1_token switchable_interp_encodings[SWITCHABLE_FILTERS]; -static struct av1_token partition_encodings[PARTITION_TYPES]; -#if CONFIG_EXT_INTER -static const struct av1_token - inter_compound_mode_encodings[INTER_COMPOUND_MODES] = { - { 2, 2 }, { 12, 4 }, { 52, 6 }, { 53, 6 }, - { 54, 6 }, { 55, 6 }, { 0, 1 }, { 7, 3 } - }; -#endif // CONFIG_EXT_INTER -#if CONFIG_PALETTE -static struct av1_token palette_size_encodings[PALETTE_SIZES]; -static struct av1_token palette_color_index_encodings[PALETTE_SIZES] - [PALETTE_COLORS]; -#endif // CONFIG_PALETTE +#define ENC_MISMATCH_DEBUG 0 + +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +static struct av1_token + inter_singleref_comp_mode_encodings[INTER_SINGLEREF_COMP_MODES]; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF #if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE static INLINE void write_uniform(aom_writer *w, int n, int v) { @@ -97,9 +87,6 @@ static struct av1_token ext_tx_intra_encodings[EXT_TX_SETS_INTRA][TX_TYPES]; #else static struct av1_token ext_tx_encodings[TX_TYPES]; #endif // CONFIG_EXT_TX -#if CONFIG_GLOBAL_MOTION -static struct av1_token global_motion_types_encodings[GLOBAL_TRANS_TYPES]; -#endif // CONFIG_GLOBAL_MOTION #if CONFIG_EXT_INTRA #if CONFIG_INTRA_INTERP static struct av1_token intra_filter_encodings[INTRA_FILTERS]; @@ -114,7 +101,9 @@ static struct av1_token compound_type_encodings[COMPOUND_TYPES]; #endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE #endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -static struct av1_token motion_mode_encodings[MOTION_MODES]; +#if CONFIG_NCOBMC_ADAPT_WEIGHT +static struct av1_token ncobmc_mode_encodings[MAX_NCOBMC_MODES]; +#endif #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION #if CONFIG_LOOP_RESTORATION static struct av1_token switchable_restore_encodings[RESTORE_SWITCHABLE_TYPES]; @@ -129,9 +118,9 @@ static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst, int *const tile_col_size_bytes); void av1_encode_token_init(void) { -#if CONFIG_EXT_TX || CONFIG_PALETTE +#if CONFIG_EXT_TX int s; -#endif // CONFIG_EXT_TX || CONFIG_PALETTE +#endif // CONFIG_EXT_TX #if CONFIG_EXT_TX for (s = 1; s < EXT_TX_SETS_INTER; ++s) { av1_tokens_from_tree(ext_tx_inter_encodings[s], av1_ext_tx_inter_tree[s]); @@ -142,17 +131,6 @@ void av1_encode_token_init(void) { #else av1_tokens_from_tree(ext_tx_encodings, av1_ext_tx_tree); #endif // CONFIG_EXT_TX - av1_tokens_from_tree(intra_mode_encodings, av1_intra_mode_tree); - av1_tokens_from_tree(switchable_interp_encodings, av1_switchable_interp_tree); - av1_tokens_from_tree(partition_encodings, av1_partition_tree); - -#if CONFIG_PALETTE - av1_tokens_from_tree(palette_size_encodings, av1_palette_size_tree); - for (s = 0; s < PALETTE_SIZES; ++s) { - av1_tokens_from_tree(palette_color_index_encodings[s], - av1_palette_color_index_tree[s]); - 
} -#endif // CONFIG_PALETTE #if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP av1_tokens_from_tree(intra_filter_encodings, av1_intra_filter_tree); @@ -161,17 +139,19 @@ void av1_encode_token_init(void) { #if CONFIG_INTERINTRA av1_tokens_from_tree(interintra_mode_encodings, av1_interintra_mode_tree); #endif // CONFIG_INTERINTRA +#if CONFIG_COMPOUND_SINGLEREF + av1_tokens_from_tree(inter_singleref_comp_mode_encodings, + av1_inter_singleref_comp_mode_tree); +#endif // CONFIG_COMPOUND_SINGLEREF #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE av1_tokens_from_tree(compound_type_encodings, av1_compound_type_tree); #endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE #endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - av1_tokens_from_tree(motion_mode_encodings, av1_motion_mode_tree); +#if CONFIG_NCOBMC_ADAPT_WEIGHT + av1_tokens_from_tree(ncobmc_mode_encodings, av1_ncobmc_mode_tree); +#endif #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -#if CONFIG_GLOBAL_MOTION - av1_tokens_from_tree(global_motion_types_encodings, - av1_global_motion_types_tree); -#endif // CONFIG_GLOBAL_MOTION #if CONFIG_LOOP_RESTORATION av1_tokens_from_tree(switchable_restore_encodings, av1_switchable_restore_tree); @@ -195,10 +175,6 @@ void av1_encode_token_init(void) { #else av1_indices_from_tree(av1_ext_tx_ind, av1_ext_tx_inv, av1_ext_tx_tree); #endif - av1_indices_from_tree(av1_intra_mode_ind, av1_intra_mode_inv, - av1_intra_mode_tree); - av1_indices_from_tree(av1_inter_mode_ind, av1_inter_mode_inv, - av1_inter_mode_tree); } static void write_intra_mode_kf(const AV1_COMMON *cm, FRAME_CONTEXT *frame_ctx, @@ -214,65 +190,72 @@ static void write_intra_mode_kf(const AV1_COMMON *cm, FRAME_CONTEXT *frame_ctx, (void)cm; } -#if CONFIG_EXT_INTER && CONFIG_INTERINTRA -static void write_interintra_mode(aom_writer *w, INTERINTRA_MODE mode, - const aom_prob *probs) { - av1_write_token(w, av1_interintra_mode_tree, probs, - &interintra_mode_encodings[mode]); -} -#endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA - static void write_inter_mode(aom_writer *w, PREDICTION_MODE mode, FRAME_CONTEXT *ec_ctx, const int16_t mode_ctx) { const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; - const aom_prob newmv_prob = ec_ctx->newmv_prob[newmv_ctx]; - aom_write(w, mode != NEWMV, newmv_prob); +#if CONFIG_NEW_MULTISYMBOL + aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2); +#else + aom_write(w, mode != NEWMV, ec_ctx->newmv_prob[newmv_ctx]); +#endif if (mode != NEWMV) { - const int16_t zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; - const aom_prob zeromv_prob = ec_ctx->zeromv_prob[zeromv_ctx]; - if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) { assert(mode == ZEROMV); return; } - aom_write(w, mode != ZEROMV, zeromv_prob); + const int16_t zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; +#if CONFIG_NEW_MULTISYMBOL + aom_write_symbol(w, mode != ZEROMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2); +#else + aom_write(w, mode != ZEROMV, ec_ctx->zeromv_prob[zeromv_ctx]); +#endif if (mode != ZEROMV) { int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; - aom_prob refmv_prob; if (mode_ctx & (1 << SKIP_NEARESTMV_OFFSET)) refmv_ctx = 6; if (mode_ctx & (1 << SKIP_NEARMV_OFFSET)) refmv_ctx = 7; if (mode_ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) refmv_ctx = 8; - - refmv_prob = ec_ctx->refmv_prob[refmv_ctx]; - aom_write(w, mode != NEARESTMV, refmv_prob); +#if CONFIG_NEW_MULTISYMBOL + aom_write_symbol(w, mode != NEARESTMV, ec_ctx->refmv_cdf[refmv_ctx], 2); +#else + aom_write(w, mode != NEARESTMV, 
ec_ctx->refmv_prob[refmv_ctx]); +#endif } } } -static void write_drl_idx(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, +static void write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext, aom_writer *w) { uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); assert(mbmi->ref_mv_idx < 3); #if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SINGLEREF + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV || + mbmi->mode == SR_NEW_NEWMV) { +#else // !CONFIG_COMPOUND_SINGLEREF if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { -#else +#endif // CONFIG_COMPOUND_SINGLEREF +#else // !CONFIG_EXT_INTER if (mbmi->mode == NEWMV) { -#endif +#endif // CONFIG_EXT_INTER int idx; for (idx = 0; idx < 2; ++idx) { if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); - aom_prob drl_prob = cm->fc->drl_prob[drl_ctx]; - aom_write(w, mbmi->ref_mv_idx != idx, drl_prob); +#if CONFIG_NEW_MULTISYMBOL + aom_write_symbol(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_cdf[drl_ctx], + 2); +#else + aom_write(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_prob[drl_ctx]); +#endif if (mbmi->ref_mv_idx == idx) return; } } @@ -286,9 +269,12 @@ static void write_drl_idx(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx); - aom_prob drl_prob = cm->fc->drl_prob[drl_ctx]; - - aom_write(w, mbmi->ref_mv_idx != (idx - 1), drl_prob); +#if CONFIG_NEW_MULTISYMBOL + aom_write_symbol(w, mbmi->ref_mv_idx != (idx - 1), + ec_ctx->drl_cdf[drl_ctx], 2); +#else + aom_write(w, mbmi->ref_mv_idx != (idx - 1), ec_ctx->drl_prob[drl_ctx]); +#endif if (mbmi->ref_mv_idx == (idx - 1)) return; } } @@ -297,16 +283,28 @@ static void write_drl_idx(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, } #if CONFIG_EXT_INTER -static void write_inter_compound_mode(AV1_COMMON *cm, aom_writer *w, - PREDICTION_MODE mode, +static void write_inter_compound_mode(AV1_COMMON *cm, MACROBLOCKD *xd, + aom_writer *w, PREDICTION_MODE mode, const int16_t mode_ctx) { - const aom_prob *const inter_compound_probs = - cm->fc->inter_compound_mode_probs[mode_ctx]; - assert(is_inter_compound_mode(mode)); - av1_write_token(w, av1_inter_compound_mode_tree, inter_compound_probs, - &inter_compound_mode_encodings[INTER_COMPOUND_OFFSET(mode)]); + (void)cm; + aom_write_symbol(w, INTER_COMPOUND_OFFSET(mode), + xd->tile_ctx->inter_compound_mode_cdf[mode_ctx], + INTER_COMPOUND_MODES); +} + +#if CONFIG_COMPOUND_SINGLEREF +static void write_inter_singleref_comp_mode(MACROBLOCKD *xd, aom_writer *w, + PREDICTION_MODE mode, + const int16_t mode_ctx) { + assert(is_inter_singleref_comp_mode(mode)); + aom_cdf_prob *const inter_singleref_comp_cdf = + xd->tile_ctx->inter_singleref_comp_mode_cdf[mode_ctx]; + + aom_write_symbol(w, INTER_SINGLEREF_COMP_OFFSET(mode), + inter_singleref_comp_cdf, INTER_SINGLEREF_COMP_MODES); } +#endif // CONFIG_COMPOUND_SINGLEREF #endif // CONFIG_EXT_INTER static void encode_unsigned_max(struct aom_write_bit_buffer *wb, int data, @@ -314,11 +312,10 @@ static void encode_unsigned_max(struct aom_write_bit_buffer *wb, int data, aom_wb_write_literal(wb, data, get_unsigned_bits(max)); } -#if !CONFIG_EC_ADAPT || \ - (CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION || CONFIG_EXT_INTER) +#if CONFIG_NCOBMC_ADAPT_WEIGHT static void prob_diff_update(const aom_tree_index *tree, aom_prob probs[/*n - 1*/], - const unsigned int counts[/*n - 1*/], int n, + const 
unsigned int counts[/* n */], int n, int probwt, aom_writer *w) { int i; unsigned int branch_ct[32][2]; @@ -332,31 +329,15 @@ static void prob_diff_update(const aom_tree_index *tree, } #endif -#if CONFIG_EXT_INTER || !CONFIG_EC_ADAPT -static int prob_diff_update_savings(const aom_tree_index *tree, - aom_prob probs[/*n - 1*/], - const unsigned int counts[/*n - 1*/], int n, - int probwt) { - int i; - unsigned int branch_ct[32][2]; - int savings = 0; - - // Assuming max number of probabilities <= 32 - assert(n <= 32); - av1_tree_probs_from_distribution(tree, branch_ct, counts); - for (i = 0; i < n - 1; ++i) { - savings += - av1_cond_prob_diff_update_savings(&probs[i], branch_ct[i], probwt); - } - return savings; -} -#endif // CONFIG_EXT_INTER || !CONFIG_EC_ADAPT - #if CONFIG_VAR_TX -static void write_tx_size_vartx(const AV1_COMMON *cm, const MACROBLOCKD *xd, +static void write_tx_size_vartx(const AV1_COMMON *cm, MACROBLOCKD *xd, const MB_MODE_INFO *mbmi, TX_SIZE tx_size, int depth, int blk_row, int blk_col, aom_writer *w) { +#if CONFIG_NEW_MULTISYMBOL + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + (void)cm; +#endif const int tx_row = blk_row >> 1; const int tx_col = blk_col >> 1; const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); @@ -374,16 +355,31 @@ static void write_tx_size_vartx(const AV1_COMMON *cm, const MACROBLOCKD *xd, return; } +#if CONFIG_RECT_TX_EXT + if (tx_size == mbmi->inter_tx_size[tx_row][tx_col] || + mbmi->tx_size == quarter_txsize_lookup[mbmi->sb_type]) { +#else if (tx_size == mbmi->inter_tx_size[tx_row][tx_col]) { +#endif +#if CONFIG_NEW_MULTISYMBOL + aom_write_symbol(w, 0, ec_ctx->txfm_partition_cdf[ctx], 2); +#else aom_write(w, 0, cm->fc->txfm_partition_prob[ctx]); +#endif + txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); + // TODO(yuec): set correct txfm partition update for qttx } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsl = tx_size_wide_unit[sub_txs]; int i; +#if CONFIG_NEW_MULTISYMBOL + aom_write_symbol(w, 1, ec_ctx->txfm_partition_cdf[ctx], 2); +#else aom_write(w, 1, cm->fc->txfm_partition_prob[ctx]); +#endif if (tx_size == TX_8X8) { txfm_partition_update(xd->above_txfm_context + blk_col, @@ -401,6 +397,7 @@ static void write_tx_size_vartx(const AV1_COMMON *cm, const MACROBLOCKD *xd, } } +#if !CONFIG_NEW_MULTISYMBOL static void update_txfm_partition_probs(AV1_COMMON *cm, aom_writer *w, FRAME_COUNTS *counts, int probwt) { int k; @@ -408,18 +405,15 @@ static void update_txfm_partition_probs(AV1_COMMON *cm, aom_writer *w, av1_cond_prob_diff_update(w, &cm->fc->txfm_partition_prob[k], counts->txfm_partition[k], probwt); } +#endif // CONFIG_NEW_MULTISYMBOL #endif static void write_selected_tx_size(const AV1_COMMON *cm, const MACROBLOCKD *xd, aom_writer *w) { const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; const BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_EC_ADAPT FRAME_CONTEXT *ec_ctx = xd->tile_ctx; (void)cm; -#else - FRAME_CONTEXT *ec_ctx = cm->fc; -#endif // For sub8x8 blocks the tx_size symbol does not need to be sent #if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_EXT_TX) && CONFIG_RECT_TX if (bsize > BLOCK_4X4) { @@ -439,22 +433,19 @@ static void write_selected_tx_size(const AV1_COMMON *cm, const MACROBLOCKD *xd, aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], tx_size_cat + 2); -#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) if (is_quarter_tx_allowed(xd, mbmi, is_inter) && 
tx_size != coded_tx_size) aom_write(w, tx_size == quarter_txsize_lookup[bsize], cm->fc->quarter_tx_size_prob); -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#endif } } +#if !CONFIG_NEW_MULTISYMBOL static void update_inter_mode_probs(AV1_COMMON *cm, aom_writer *w, FRAME_COUNTS *counts) { int i; -#if CONFIG_TILE_GROUPS const int probwt = cm->num_tg; -#else - const int probwt = 1; -#endif for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) av1_cond_prob_diff_update(w, &cm->fc->newmv_prob[i], counts->newmv_mode[i], probwt); @@ -468,31 +459,7 @@ static void update_inter_mode_probs(AV1_COMMON *cm, aom_writer *w, av1_cond_prob_diff_update(w, &cm->fc->drl_prob[i], counts->drl_mode[i], probwt); } - -#if CONFIG_EXT_INTER -static void update_inter_compound_mode_probs(AV1_COMMON *cm, int probwt, - aom_writer *w) { - const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) - - av1_cost_zero(GROUP_DIFF_UPDATE_PROB); - int i; - int savings = 0; - int do_update = 0; - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { - savings += prob_diff_update_savings( - av1_inter_compound_mode_tree, cm->fc->inter_compound_mode_probs[i], - cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES, probwt); - } - do_update = savings > savings_thresh; - aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB); - if (do_update) { - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { - prob_diff_update( - av1_inter_compound_mode_tree, cm->fc->inter_compound_mode_probs[i], - cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES, probwt, w); - } - } -} -#endif // CONFIG_EXT_INTER +#endif static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, const MODE_INFO *mi, aom_writer *w) { @@ -500,35 +467,100 @@ static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd, return 1; } else { const int skip = mi->mbmi.skip; +#if CONFIG_NEW_MULTISYMBOL + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int ctx = av1_get_skip_context(xd); + aom_write_symbol(w, skip, ec_ctx->skip_cdfs[ctx], 2); +#else aom_write(w, skip, av1_get_skip_prob(cm, xd)); +#endif return skip; } } +static void write_is_inter(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, aom_writer *w, const int is_inter) { + if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { +#if CONFIG_NEW_MULTISYMBOL + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int ctx = av1_get_intra_inter_context(xd); + aom_write_symbol(w, is_inter, ec_ctx->intra_inter_cdf[ctx], 2); +#else + aom_write(w, is_inter, av1_get_intra_inter_prob(cm, xd)); +#endif + } +} + #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -static void write_motion_mode(const AV1_COMMON *cm, const MODE_INFO *mi, - aom_writer *w) { +static void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd, + const MODE_INFO *mi, aom_writer *w) { const MB_MODE_INFO *mbmi = &mi->mbmi; + +#if CONFIG_NCOBMC_ADAPT_WEIGHT + MOTION_MODE last_motion_mode_allowed = + motion_mode_allowed_wrapper(0, +#if CONFIG_GLOBAL_MOTION + 0, cm->global_motion, +#endif // CONFIG_GLOBAL_MOTION + mi); +#else MOTION_MODE last_motion_mode_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION +#if CONFIG_GLOBAL_MOTION 0, cm->global_motion, -#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION +#endif // CONFIG_GLOBAL_MOTION +#if CONFIG_WARPED_MOTION + xd, +#endif mi); - +#endif // CONFIG_NCOBMC_ADAPT_WEIGHT if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return; #if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION if (last_motion_mode_allowed == OBMC_CAUSAL) { +#if CONFIG_NEW_MULTISYMBOL 
+ aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL, + xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2); +#else aom_write(w, mbmi->motion_mode == OBMC_CAUSAL, cm->fc->obmc_prob[mbmi->sb_type]); +#endif } else { #endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - av1_write_token(w, av1_motion_mode_tree, - cm->fc->motion_mode_prob[mbmi->sb_type], - &motion_mode_encodings[mbmi->motion_mode]); + aom_write_symbol(w, mbmi->motion_mode, + xd->tile_ctx->motion_mode_cdf[mbmi->sb_type], + MOTION_MODES); #if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION } #endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION } + +#if CONFIG_NCOBMC_ADAPT_WEIGHT +static void write_ncobmc_mode(MACROBLOCKD *xd, const MODE_INFO *mi, + aom_writer *w) { + const MB_MODE_INFO *mbmi = &mi->mbmi; + ADAPT_OVERLAP_BLOCK ao_block = adapt_overlap_block_lookup[mbmi->sb_type]; + if (mbmi->motion_mode != NCOBMC_ADAPT_WEIGHT) return; + +#ifndef TRAINING_WEIGHTS + aom_write_symbol(w, mbmi->ncobmc_mode[0], + xd->tile_ctx->ncobmc_mode_cdf[ao_block], MAX_NCOBMC_MODES); + if (mi_size_wide[mbmi->sb_type] != mi_size_high[mbmi->sb_type]) { + aom_write_symbol(w, mbmi->ncobmc_mode[1], + xd->tile_ctx->ncobmc_mode_cdf[ao_block], MAX_NCOBMC_MODES); + } +#else + int block; + for (block = 0; block < 4; ++block) + aom_write_symbol(w, mbmi->ncobmc_mode[0][block], + xd->tile_ctx->ncobmc_mode_cdf[ao_block], MAX_NCOBMC_MODES); + if (mi_size_wide[mbmi->sb_type] != mi_size_high[mbmi->sb_type]) { + for (block = 0; block < 4; ++block) + aom_write_symbol(w, mbmi->ncobmc_mode[1][block], + xd->tile_ctx->ncobmc_mode_cdf[ao_block], + MAX_NCOBMC_MODES); + } +#endif +} +#endif #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION #if CONFIG_DELTA_Q @@ -538,13 +570,8 @@ static void write_delta_qindex(const AV1_COMMON *cm, const MACROBLOCKD *xd, int abs = sign ? -delta_qindex : delta_qindex; int rem_bits, thr; int smallval = abs < DELTA_Q_SMALL ? 1 : 0; -#if CONFIG_EC_ADAPT FRAME_CONTEXT *ec_ctx = xd->tile_ctx; (void)cm; -#else - FRAME_CONTEXT *ec_ctx = cm->fc; - (void)xd; -#endif aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1); @@ -560,25 +587,6 @@ static void write_delta_qindex(const AV1_COMMON *cm, const MACROBLOCKD *xd, } } -#if !CONFIG_EC_ADAPT -static void update_delta_q_probs(AV1_COMMON *cm, aom_writer *w, - FRAME_COUNTS *counts) { - int k; -#if CONFIG_TILE_GROUPS - const int probwt = cm->num_tg; -#else - const int probwt = 1; -#endif -#if CONFIG_EXT_DELTA_Q - if (!cm->delta_q_present_flag) return; -#endif // CONFIG_EXT_DELTA_Q - for (k = 0; k < DELTA_Q_PROBS; ++k) { - av1_cond_prob_diff_update(w, &cm->fc->delta_q_prob[k], counts->delta_q[k], - probwt); - } -} -#endif // CONFIG_EC_ADAPT - #if CONFIG_EXT_DELTA_Q static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd, int delta_lflevel, aom_writer *w) { @@ -586,13 +594,8 @@ static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd, int abs = sign ? -delta_lflevel : delta_lflevel; int rem_bits, thr; int smallval = abs < DELTA_LF_SMALL ? 
1 : 0; -#if CONFIG_EC_ADAPT FRAME_CONTEXT *ec_ctx = xd->tile_ctx; (void)cm; -#else - FRAME_CONTEXT *ec_ctx = cm->fc; - (void)xd; -#endif aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf, DELTA_LF_PROBS + 1); @@ -607,178 +610,32 @@ static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd, aom_write_bit(w, sign); } } - -#if !CONFIG_EC_ADAPT -static void update_delta_lf_probs(AV1_COMMON *cm, aom_writer *w, - FRAME_COUNTS *counts) { - int k; -#if CONFIG_TILE_GROUPS - const int probwt = cm->num_tg; -#else - const int probwt = 1; -#endif - if (!cm->delta_lf_present_flag) return; - for (k = 0; k < DELTA_LF_PROBS; ++k) { - av1_cond_prob_diff_update(w, &cm->fc->delta_lf_prob[k], counts->delta_lf[k], - probwt); - } -} -#endif // CONFIG_EC_ADAPT #endif // CONFIG_EXT_DELTA_Q #endif // CONFIG_DELTA_Q +#if !CONFIG_NEW_MULTISYMBOL static void update_skip_probs(AV1_COMMON *cm, aom_writer *w, FRAME_COUNTS *counts) { int k; -#if CONFIG_TILE_GROUPS const int probwt = cm->num_tg; -#else - const int probwt = 1; -#endif for (k = 0; k < SKIP_CONTEXTS; ++k) { av1_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k], probwt); } } - -#if !CONFIG_EC_ADAPT -static void update_switchable_interp_probs(AV1_COMMON *cm, aom_writer *w, - FRAME_COUNTS *counts) { - int j; - for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) { -#if CONFIG_TILE_GROUPS - const int probwt = cm->num_tg; -#else - const int probwt = 1; -#endif - prob_diff_update( - av1_switchable_interp_tree, cm->fc->switchable_interp_prob[j], - counts->switchable_interp[j], SWITCHABLE_FILTERS, probwt, w); - } -} -#endif - -#if !CONFIG_EC_ADAPT -#if CONFIG_EXT_TX -static void update_ext_tx_probs(AV1_COMMON *cm, aom_writer *w) { - const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) - - av1_cost_zero(GROUP_DIFF_UPDATE_PROB); - int i, j; - int s; -#if CONFIG_TILE_GROUPS - const int probwt = cm->num_tg; -#else - const int probwt = 1; -#endif - for (s = 1; s < EXT_TX_SETS_INTER; ++s) { - int savings = 0; - int do_update = 0; - for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { - if (!use_inter_ext_tx_for_txsize[s][i]) continue; - savings += prob_diff_update_savings( - av1_ext_tx_inter_tree[s], cm->fc->inter_ext_tx_prob[s][i], - cm->counts.inter_ext_tx[s][i], - num_ext_tx_set[ext_tx_set_type_inter[s]], probwt); - } - do_update = savings > savings_thresh; - aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB); - if (do_update) { - for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { - if (!use_inter_ext_tx_for_txsize[s][i]) continue; - prob_diff_update(av1_ext_tx_inter_tree[s], - cm->fc->inter_ext_tx_prob[s][i], - cm->counts.inter_ext_tx[s][i], - num_ext_tx_set[ext_tx_set_type_inter[s]], probwt, w); - } - } - } - - for (s = 1; s < EXT_TX_SETS_INTRA; ++s) { - int savings = 0; - int do_update = 0; - for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { - if (!use_intra_ext_tx_for_txsize[s][i]) continue; - for (j = 0; j < INTRA_MODES; ++j) - savings += prob_diff_update_savings( - av1_ext_tx_intra_tree[s], cm->fc->intra_ext_tx_prob[s][i][j], - cm->counts.intra_ext_tx[s][i][j], - num_ext_tx_set[ext_tx_set_type_intra[s]], probwt); - } - do_update = savings > savings_thresh; - aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB); - if (do_update) { - for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { - if (!use_intra_ext_tx_for_txsize[s][i]) continue; - for (j = 0; j < INTRA_MODES; ++j) - prob_diff_update(av1_ext_tx_intra_tree[s], - cm->fc->intra_ext_tx_prob[s][i][j], - cm->counts.intra_ext_tx[s][i][j], - num_ext_tx_set[ext_tx_set_type_intra[s]], probwt, w); - } - } - 
} -} - -#else -static void update_ext_tx_probs(AV1_COMMON *cm, aom_writer *w) { - const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) - - av1_cost_zero(GROUP_DIFF_UPDATE_PROB); - int i, j; - - int savings = 0; - int do_update = 0; -#if CONFIG_TILE_GROUPS - const int probwt = cm->num_tg; -#else - const int probwt = 1; #endif - for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { - for (j = 0; j < TX_TYPES; ++j) - savings += prob_diff_update_savings( - av1_ext_tx_tree, cm->fc->intra_ext_tx_prob[i][j], - cm->counts.intra_ext_tx[i][j], TX_TYPES, probwt); - } - do_update = savings > savings_thresh; - aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB); - if (do_update) { - for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { - for (j = 0; j < TX_TYPES; ++j) { - prob_diff_update(av1_ext_tx_tree, cm->fc->intra_ext_tx_prob[i][j], - cm->counts.intra_ext_tx[i][j], TX_TYPES, probwt, w); - } - } - } - savings = 0; - for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { - savings += - prob_diff_update_savings(av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[i], - cm->counts.inter_ext_tx[i], TX_TYPES, probwt); - } - do_update = savings > savings_thresh; - aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB); - if (do_update) { - for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { - prob_diff_update(av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[i], - cm->counts.inter_ext_tx[i], TX_TYPES, probwt, w); - } - } -} -#endif // CONFIG_EXT_TX -#endif // !CONFIG_EC_ADAPT #if CONFIG_PALETTE static void pack_palette_tokens(aom_writer *w, const TOKENEXTRA **tp, int n, int num) { - int i; const TOKENEXTRA *p = *tp; - - for (i = 0; i < num; ++i) { - av1_write_token( - w, av1_palette_color_index_tree[n - PALETTE_MIN_SIZE], p->context_tree, - &palette_color_index_encodings[n - PALETTE_MIN_SIZE][p->token]); + write_uniform(w, n, p->token); // The first color index. + ++p; + --num; + for (int i = 0; i < num; ++i) { + aom_write_symbol(w, p->token, p->palette_cdf, n); ++p; } - *tp = p; } #endif // CONFIG_PALETTE @@ -930,8 +787,16 @@ static void pack_pvq_tokens(aom_writer *w, MACROBLOCK *const x, int max_blocks_wide; int max_blocks_high; int step = (1 << tx_size); + +#if CONFIG_CHROMA_SUB8X8 const BLOCK_SIZE plane_bsize = - get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd); + AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); +#elif CONFIG_CB4X4 + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); +#else + const BLOCK_SIZE plane_bsize = + get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd); +#endif adapt = x->daala_enc.state.adapt; @@ -1030,7 +895,8 @@ static void pack_txb_tokens(aom_writer *w, uint16_t eob = x->mbmi_ext->eobs[plane][block]; TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block], x->mbmi_ext->dc_sign_ctx[plane][block] }; - av1_write_coeffs_txb(cm, xd, w, block, plane, tcoeff, eob, &txb_ctx); + av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, block, plane, tx_size, + tcoeff, eob, &txb_ctx); #else pack_pvq_tokens(w, x, xd, plane, bsize, tx_size); #endif @@ -1103,15 +969,30 @@ static void pack_txb_tokens(aom_writer *w, const TOKENEXTRA **tp, token_stats->cost += tmp_token_stats.cost; #endif } else { +#if CONFIG_RECT_TX_EXT + int is_qttx = plane_tx_size == quarter_txsize_lookup[plane_bsize]; + const TX_SIZE sub_txs = is_qttx ? 
plane_tx_size : sub_tx_size_map[tx_size]; +#else const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; +#endif const int bsl = tx_size_wide_unit[sub_txs]; int i; assert(bsl > 0); for (i = 0; i < 4; ++i) { +#if CONFIG_RECT_TX_EXT + int is_wide_tx = tx_size_wide_unit[sub_txs] > tx_size_high_unit[sub_txs]; + const int offsetr = + is_qttx ? (is_wide_tx ? i * tx_size_high_unit[sub_txs] : 0) + : blk_row + (i >> 1) * bsl; + const int offsetc = + is_qttx ? (is_wide_tx ? 0 : i * tx_size_wide_unit[sub_txs]) + : blk_col + (i & 0x01) * bsl; +#else const int offsetr = blk_row + (i >> 1) * bsl; const int offsetc = blk_col + (i & 0x01) * bsl; +#endif const int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; @@ -1136,6 +1017,14 @@ static void write_segment_id(aom_writer *w, const struct segmentation *seg, } } +#if CONFIG_NEW_MULTISYMBOL +#define WRITE_REF_BIT(bname, pname) \ + aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(cm, xd), 2) +#else +#define WRITE_REF_BIT(bname, pname) \ + aom_write(w, bname, av1_get_pred_prob_##pname(cm, xd)) +#endif + // This function encodes the reference frame static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd, aom_writer *w) { @@ -1153,66 +1042,183 @@ static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd, // does the feature use compound prediction or not // (if not specified at the frame/segment level) if (cm->reference_mode == REFERENCE_MODE_SELECT) { -#if SUB8X8_COMP_REF - aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd)); -#else +#if !SUB8X8_COMP_REF if (mbmi->sb_type != BLOCK_4X4) - aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd)); +#endif +#if CONFIG_NEW_MULTISYMBOL + aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(cm, xd), 2); +#else + aom_write(w, is_compound, av1_get_reference_mode_prob(cm, xd)); #endif } else { assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE)); } if (is_compound) { +#if CONFIG_EXT_COMP_REFS + const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) + ? 
UNIDIR_COMP_REFERENCE + : BIDIR_COMP_REFERENCE; +#if USE_UNI_COMP_REFS +#if CONFIG_VAR_REFS + if ((L_OR_L2(cm) || L3_OR_G(cm)) && BWD_OR_ALT(cm)) + if (L_AND_L2(cm) || L_AND_L3(cm) || L_AND_G(cm) || BWD_AND_ALT(cm)) +#endif // CONFIG_VAR_REFS + aom_write(w, comp_ref_type, av1_get_comp_reference_type_prob(cm, xd)); +#if CONFIG_VAR_REFS + else + assert(comp_ref_type == BIDIR_COMP_REFERENCE); + else + assert(comp_ref_type == UNIDIR_COMP_REFERENCE); +#endif // CONFIG_VAR_REFS +#else // !USE_UNI_COMP_REFS + // NOTE: uni-directional comp refs disabled + assert(comp_ref_type == BIDIR_COMP_REFERENCE); +#endif // USE_UNI_COMP_REFS + + if (comp_ref_type == UNIDIR_COMP_REFERENCE) { + const int bit = mbmi->ref_frame[0] == BWDREF_FRAME; +#if CONFIG_VAR_REFS + if ((L_AND_L2(cm) || L_AND_L3(cm) || L_AND_G(cm)) && BWD_AND_ALT(cm)) +#endif // CONFIG_VAR_REFS + aom_write(w, bit, av1_get_pred_prob_uni_comp_ref_p(cm, xd)); + + if (!bit) { + assert(mbmi->ref_frame[0] == LAST_FRAME); +#if CONFIG_VAR_REFS + if (L_AND_L2(cm) && (L_AND_L3(cm) || L_AND_G(cm))) { +#endif // CONFIG_VAR_REFS + const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME || + mbmi->ref_frame[1] == GOLDEN_FRAME; + aom_write(w, bit1, av1_get_pred_prob_uni_comp_ref_p1(cm, xd)); + + if (bit1) { +#if CONFIG_VAR_REFS + if (L_AND_L3(cm) && L_AND_G(cm)) { +#endif // CONFIG_VAR_REFS + const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME; + aom_write(w, bit2, av1_get_pred_prob_uni_comp_ref_p2(cm, xd)); +#if CONFIG_VAR_REFS + } +#endif // CONFIG_VAR_REFS + } +#if CONFIG_VAR_REFS + } +#endif // CONFIG_VAR_REFS + } else { + assert(mbmi->ref_frame[1] == ALTREF_FRAME); + } + + return; + } + + assert(comp_ref_type == BIDIR_COMP_REFERENCE); +#endif // CONFIG_EXT_COMP_REFS + #if CONFIG_EXT_REFS const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME || mbmi->ref_frame[0] == LAST3_FRAME); - const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME; -#else // CONFIG_EXT_REFS - const int bit = mbmi->ref_frame[0] == GOLDEN_FRAME; -#endif // CONFIG_EXT_REFS +#if CONFIG_VAR_REFS + // Test need to explicitly code (L,L2) vs (L3,G) branch node in tree + if (L_OR_L2(cm) && L3_OR_G(cm)) +#endif // CONFIG_VAR_REFS + WRITE_REF_BIT(bit, comp_ref_p); - aom_write(w, bit, av1_get_pred_prob_comp_ref_p(cm, xd)); - -#if CONFIG_EXT_REFS if (!bit) { - const int bit1 = mbmi->ref_frame[0] == LAST_FRAME; - aom_write(w, bit1, av1_get_pred_prob_comp_ref_p1(cm, xd)); +#if CONFIG_VAR_REFS + // Test need to explicitly code (L) vs (L2) branch node in tree + if (L_AND_L2(cm)) { +#endif // CONFIG_VAR_REFS + const int bit1 = mbmi->ref_frame[0] == LAST_FRAME; + WRITE_REF_BIT(bit1, comp_ref_p1); +#if CONFIG_VAR_REFS + } +#endif // CONFIG_VAR_REFS } else { - const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME; - aom_write(w, bit2, av1_get_pred_prob_comp_ref_p2(cm, xd)); +#if CONFIG_VAR_REFS + // Test need to explicitly code (L3) vs (G) branch node in tree + if (L3_AND_G(cm)) { +#endif // CONFIG_VAR_REFS + const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME; + WRITE_REF_BIT(bit2, comp_ref_p2); +#if CONFIG_VAR_REFS + } +#endif // CONFIG_VAR_REFS + } + +#if CONFIG_VAR_REFS + // Test need to explicitly code (BWD) vs (ALT) branch node in tree + if (BWD_AND_ALT(cm)) { +#endif // CONFIG_VAR_REFS + const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME; + WRITE_REF_BIT(bit_bwd, comp_bwdref_p); +#if CONFIG_VAR_REFS } - aom_write(w, bit_bwd, av1_get_pred_prob_comp_bwdref_p(cm, xd)); +#endif // CONFIG_VAR_REFS + +#else // !CONFIG_EXT_REFS + const int bit = mbmi->ref_frame[0] == GOLDEN_FRAME; + WRITE_REF_BIT(bit, 
comp_ref_p); #endif // CONFIG_EXT_REFS } else { #if CONFIG_EXT_REFS const int bit0 = (mbmi->ref_frame[0] == ALTREF_FRAME || mbmi->ref_frame[0] == BWDREF_FRAME); - aom_write(w, bit0, av1_get_pred_prob_single_ref_p1(cm, xd)); +#if CONFIG_VAR_REFS + // Test need to explicitly code (L,L2,L3,G) vs (BWD,ALT) branch node in + // tree + if ((L_OR_L2(cm) || L3_OR_G(cm)) && BWD_OR_ALT(cm)) +#endif // CONFIG_VAR_REFS + WRITE_REF_BIT(bit0, single_ref_p1); if (bit0) { - const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME; - aom_write(w, bit1, av1_get_pred_prob_single_ref_p2(cm, xd)); +#if CONFIG_VAR_REFS + // Test need to explicitly code (BWD) vs (ALT) branch node in tree + if (BWD_AND_ALT(cm)) { +#endif // CONFIG_VAR_REFS + const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME; + WRITE_REF_BIT(bit1, single_ref_p2); +#if CONFIG_VAR_REFS + } +#endif // CONFIG_VAR_REFS } else { const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME || mbmi->ref_frame[0] == GOLDEN_FRAME); - aom_write(w, bit2, av1_get_pred_prob_single_ref_p3(cm, xd)); +#if CONFIG_VAR_REFS + // Test need to explicitly code (L,L2) vs (L3,G) branch node in tree + if (L_OR_L2(cm) && L3_OR_G(cm)) +#endif // CONFIG_VAR_REFS + WRITE_REF_BIT(bit2, single_ref_p3); if (!bit2) { - const int bit3 = mbmi->ref_frame[0] != LAST_FRAME; - aom_write(w, bit3, av1_get_pred_prob_single_ref_p4(cm, xd)); +#if CONFIG_VAR_REFS + // Test need to explicitly code (L) vs (L2) branch node in tree + if (L_AND_L2(cm)) { +#endif // CONFIG_VAR_REFS + const int bit3 = mbmi->ref_frame[0] != LAST_FRAME; + WRITE_REF_BIT(bit3, single_ref_p4); +#if CONFIG_VAR_REFS + } +#endif // CONFIG_VAR_REFS } else { - const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME; - aom_write(w, bit4, av1_get_pred_prob_single_ref_p5(cm, xd)); +#if CONFIG_VAR_REFS + // Test need to explicitly code (L3) vs (G) branch node in tree + if (L3_AND_G(cm)) { +#endif // CONFIG_VAR_REFS + const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME; + WRITE_REF_BIT(bit4, single_ref_p5); +#if CONFIG_VAR_REFS + } +#endif // CONFIG_VAR_REFS } } -#else // CONFIG_EXT_REFS +#else // !CONFIG_EXT_REFS const int bit0 = mbmi->ref_frame[0] != LAST_FRAME; - aom_write(w, bit0, av1_get_pred_prob_single_ref_p1(cm, xd)); + WRITE_REF_BIT(bit0, single_ref_p1); if (bit0) { const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME; - aom_write(w, bit1, av1_get_pred_prob_single_ref_p2(cm, xd)); + WRITE_REF_BIT(bit1, single_ref_p2); } #endif // CONFIG_EXT_REFS } @@ -1250,7 +1256,7 @@ static void write_filter_intra_mode_info(const AV1_COMMON *const cm, (void)mi_col; #endif // CONFIG_CB4X4 - if (mbmi->uv_mode == DC_PRED + if (mbmi->uv_mode == UV_DC_PRED #if CONFIG_PALETTE && mbmi->palette_mode_info.palette_size[1] == 0 #endif // CONFIG_PALETTE @@ -1277,7 +1283,7 @@ static void write_intra_angle_info(const MACROBLOCKD *xd, #endif // CONFIG_INTRA_INTERP (void)ec_ctx; - if (bsize < BLOCK_8X8) return; + if (!av1_use_angle_delta(bsize)) return; if (av1_is_directional_mode(mbmi->mode, bsize)) { write_uniform(w, 2 * MAX_ANGLE_DELTA + 1, @@ -1292,7 +1298,7 @@ static void write_intra_angle_info(const MACROBLOCKD *xd, #endif // CONFIG_INTRA_INTERP } - if (av1_is_directional_mode(mbmi->uv_mode, bsize)) { + if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize)) { write_uniform(w, 2 * MAX_ANGLE_DELTA + 1, MAX_ANGLE_DELTA + mbmi->angle_delta[1]); } @@ -1303,11 +1309,7 @@ static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd, aom_writer *w) { AV1_COMMON *const cm = &cpi->common; const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; -#if 
CONFIG_EC_ADAPT FRAME_CONTEXT *ec_ctx = xd->tile_ctx; -#else - FRAME_CONTEXT *ec_ctx = cm->fc; -#endif if (!av1_is_interp_needed(xd)) { #if CONFIG_DUAL_FILTER @@ -1485,19 +1487,21 @@ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd, if (mbmi->mode == DC_PRED) { const int n = pmi->palette_size[0]; int palette_y_mode_ctx = 0; - if (above_mi) + if (above_mi) { palette_y_mode_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0); - if (left_mi) + } + if (left_mi) { palette_y_mode_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0); + } aom_write( w, n > 0, av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_y_mode_ctx]); if (n > 0) { - av1_write_token(w, av1_palette_size_tree, - av1_default_palette_y_size_prob[bsize - BLOCK_8X8], - &palette_size_encodings[n - PALETTE_MIN_SIZE]); + aom_write_symbol(w, n - PALETTE_MIN_SIZE, + xd->tile_ctx->palette_y_size_cdf[bsize - BLOCK_8X8], + PALETTE_SIZES); #if CONFIG_PALETTE_DELTA_ENCODING write_palette_colors_y(xd, pmi, cm->bit_depth, w); #else @@ -1506,18 +1510,17 @@ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd, aom_write_literal(w, pmi->palette_colors[i], cm->bit_depth); } #endif // CONFIG_PALETTE_DELTA_ENCODING - write_uniform(w, n, pmi->palette_first_color_idx[0]); } } - if (mbmi->uv_mode == DC_PRED) { + if (mbmi->uv_mode == UV_DC_PRED) { const int n = pmi->palette_size[1]; const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); aom_write(w, n > 0, av1_default_palette_uv_mode_prob[palette_uv_mode_ctx]); if (n > 0) { - av1_write_token(w, av1_palette_size_tree, - av1_default_palette_uv_size_prob[bsize - BLOCK_8X8], - &palette_size_encodings[n - PALETTE_MIN_SIZE]); + aom_write_symbol(w, n - PALETTE_MIN_SIZE, + xd->tile_ctx->palette_uv_size_cdf[bsize - BLOCK_8X8], + PALETTE_SIZES); #if CONFIG_PALETTE_DELTA_ENCODING write_palette_colors_uv(xd, pmi, cm->bit_depth, w); #else @@ -1532,7 +1535,6 @@ static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd, cm->bit_depth); } #endif // CONFIG_PALETTE_DELTA_ENCODING - write_uniform(w, n, pmi->palette_first_color_idx[1]); } } } @@ -1543,21 +1545,20 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, const int supertx_enabled, #endif #if CONFIG_TXK_SEL - int block, int plane, + int blk_row, int blk_col, int block, int plane, + TX_SIZE tx_size, #endif aom_writer *w) { MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; const int is_inter = is_inter_block(mbmi); +#if !CONFIG_TXK_SEL #if CONFIG_VAR_TX const TX_SIZE tx_size = is_inter ? 
mbmi->min_tx_size : mbmi->tx_size; #else const TX_SIZE tx_size = mbmi->tx_size; #endif // CONFIG_VAR_TX -#if CONFIG_EC_ADAPT +#endif // !CONFIG_TXK_SEL FRAME_CONTEXT *ec_ctx = xd->tile_ctx; -#else - FRAME_CONTEXT *ec_ctx = cm->fc; -#endif #if !CONFIG_TXK_SEL TX_TYPE tx_type = mbmi->tx_type; @@ -1565,7 +1566,8 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, // Only y plane's tx_type is transmitted if (plane > 0) return; PLANE_TYPE plane_type = get_plane_type(plane); - TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + TX_TYPE tx_type = + av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); #endif if (!FIXED_TX_TYPE) { @@ -1583,21 +1585,20 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { const int eset = get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used); + // eset == 0 should correspond to a set with only DCT_DCT and there + // is no need to send the tx_type + assert(eset > 0); if (is_inter) { assert(ext_tx_used_inter[eset][tx_type]); - if (eset > 0) { - aom_write_symbol(w, av1_ext_tx_inter_ind[eset][tx_type], - ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], - ext_tx_cnt_inter[eset]); - } + aom_write_symbol(w, av1_ext_tx_inter_ind[eset][tx_type], + ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], + ext_tx_cnt_inter[eset]); } else if (ALLOW_INTRA_EXT_TX) { assert(ext_tx_used_intra[eset][tx_type]); - if (eset > 0) { - aom_write_symbol( - w, av1_ext_tx_intra_ind[eset][tx_type], - ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode], - ext_tx_cnt_intra[eset]); - } + aom_write_symbol( + w, av1_ext_tx_intra_ind[eset][tx_type], + ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][mbmi->mode], + ext_tx_cnt_intra[eset]); } } #else @@ -1632,36 +1633,30 @@ static void write_intra_mode(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize, } static void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx, - PREDICTION_MODE uv_mode, PREDICTION_MODE y_mode, - aom_writer *w) { - aom_write_symbol(w, av1_intra_mode_ind[uv_mode], - frame_ctx->uv_mode_cdf[y_mode], INTRA_MODES); + UV_PREDICTION_MODE uv_mode, + PREDICTION_MODE y_mode, aom_writer *w) { + aom_write_symbol(w, av1_intra_mode_ind[get_uv_mode(uv_mode)], + frame_ctx->uv_mode_cdf[y_mode], UV_INTRA_MODES); } #if CONFIG_CFL -static void write_cfl_alphas(FRAME_CONTEXT *const frame_ctx, int skip, int ind, +static void write_cfl_alphas(FRAME_CONTEXT *const frame_ctx, int ind, const CFL_SIGN_TYPE signs[CFL_SIGNS], aom_writer *w) { - if (skip) { - assert(ind == 0); + // Check for uninitialized signs + if (cfl_alpha_codes[ind][CFL_PRED_U] == 0) assert(signs[CFL_PRED_U] == CFL_SIGN_POS); + if (cfl_alpha_codes[ind][CFL_PRED_V] == 0) assert(signs[CFL_PRED_V] == CFL_SIGN_POS); - } else { - // Check for uninitialized signs - if (cfl_alpha_codes[ind][CFL_PRED_U] == 0) - assert(signs[CFL_PRED_U] == CFL_SIGN_POS); - if (cfl_alpha_codes[ind][CFL_PRED_V] == 0) - assert(signs[CFL_PRED_V] == CFL_SIGN_POS); - - // Write a symbol representing a combination of alpha Cb and alpha Cr. - aom_write_symbol(w, ind, frame_ctx->cfl_alpha_cdf, CFL_ALPHABET_SIZE); - - // Signs are only signaled for nonzero codes. - if (cfl_alpha_codes[ind][CFL_PRED_U] != 0) - aom_write_bit(w, signs[CFL_PRED_U]); - if (cfl_alpha_codes[ind][CFL_PRED_V] != 0) - aom_write_bit(w, signs[CFL_PRED_V]); - } + + // Write a symbol representing a combination of alpha Cb and alpha Cr. 
+ aom_write_symbol(w, ind, frame_ctx->cfl_alpha_cdf, CFL_ALPHABET_SIZE); + + // Signs are only signaled for nonzero codes. + if (cfl_alpha_codes[ind][CFL_PRED_U] != 0) + aom_write_bit(w, signs[CFL_PRED_U]); + if (cfl_alpha_codes[ind][CFL_PRED_V] != 0) + aom_write_bit(w, signs[CFL_PRED_V]); } #endif @@ -1672,22 +1667,13 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, #endif aom_writer *w) { AV1_COMMON *const cm = &cpi->common; -#if CONFIG_DELTA_Q || CONFIG_EC_ADAPT MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; -#else - const MACROBLOCK *x = &cpi->td.mb; - const MACROBLOCKD *xd = &x->e_mbd; -#endif -#if CONFIG_EC_ADAPT FRAME_CONTEXT *ec_ctx = xd->tile_ctx; -#else - FRAME_CONTEXT *ec_ctx = cm->fc; -#endif const MODE_INFO *mi = xd->mi[0]; const struct segmentation *const seg = &cm->seg; - struct segmentation_probs *const segp = &cm->fc->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; const MB_MODE_INFO *const mbmi = &mi->mbmi; const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const PREDICTION_MODE mode = mbmi->mode; @@ -1708,8 +1694,13 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, if (seg->update_map) { if (seg->temporal_update) { const int pred_flag = mbmi->seg_id_predicted; +#if CONFIG_NEW_MULTISYMBOL + aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd); + aom_write_symbol(w, pred_flag, pred_cdf, 2); +#else aom_prob pred_prob = av1_get_pred_prob_seg_id(segp, xd); aom_write(w, pred_flag, pred_prob); +#endif if (!pred_flag) write_segment_id(w, seg, segp, segment_id); } else { write_segment_id(w, seg, segp, segment_id); @@ -1750,8 +1741,7 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, #if CONFIG_SUPERTX if (!supertx_enabled) #endif // CONFIG_SUPERTX - if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) - aom_write(w, is_inter, av1_get_intra_inter_prob(cm, xd)); + write_is_inter(cm, xd, mbmi->segment_id, w, is_inter); if (cm->tx_mode == TX_MODE_SELECT && #if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_RECT_TX) @@ -1779,6 +1769,15 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, for (idx = 0; idx < width; idx += bw) write_tx_size_vartx(cm, xd, mbmi, max_tx_size, height != width, idy, idx, w); +#if CONFIG_RECT_TX_EXT + if (is_quarter_tx_allowed(xd, mbmi, is_inter_block(mbmi)) && + quarter_txsize_lookup[bsize] != max_tx_size && + (mbmi->tx_size == quarter_txsize_lookup[bsize] || + mbmi->tx_size == max_tx_size)) { + aom_write(w, mbmi->tx_size != max_tx_size, + cm->fc->quarter_tx_size_prob); + } +#endif } else { set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, skip, xd); write_selected_tx_size(cm, xd, w); @@ -1813,9 +1812,8 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, #endif // CONFIG_CB4X4 #if CONFIG_CFL - if (mbmi->uv_mode == DC_PRED) { - write_cfl_alphas(ec_ctx, mbmi->skip, mbmi->cfl_alpha_idx, - mbmi->cfl_alpha_signs, w); + if (mbmi->uv_mode == UV_DC_PRED) { + write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); } #endif @@ -1838,11 +1836,25 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, int16_t mode_ctx; write_ref_frames(cm, xd, w); +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + // NOTE: Handle single ref comp mode + if (!is_compound) + aom_write(w, is_inter_singleref_comp_mode(mode), + av1_get_inter_mode_prob(cm, xd)); + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + #if CONFIG_EXT_INTER +#if 
CONFIG_COMPOUND_SINGLEREF + if (is_compound || is_inter_singleref_comp_mode(mode)) +#else // !CONFIG_COMPOUND_SINGLEREF if (is_compound) +#endif // CONFIG_COMPOUND_SINGLEREF mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; else #endif // CONFIG_EXT_INTER + mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame, bsize, -1); @@ -1851,18 +1863,25 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, if (bsize >= BLOCK_8X8 || unify_bsize) { #if CONFIG_EXT_INTER if (is_inter_compound_mode(mode)) - write_inter_compound_mode(cm, w, mode, mode_ctx); + write_inter_compound_mode(cm, xd, w, mode, mode_ctx); +#if CONFIG_COMPOUND_SINGLEREF + else if (is_inter_singleref_comp_mode(mode)) + write_inter_singleref_comp_mode(xd, w, mode, mode_ctx); +#endif // CONFIG_COMPOUND_SINGLEREF else if (is_inter_singleref_mode(mode)) #endif // CONFIG_EXT_INTER write_inter_mode(w, mode, ec_ctx, mode_ctx); #if CONFIG_EXT_INTER if (mode == NEWMV || mode == NEW_NEWMV || +#if CONFIG_COMPOUND_SINGLEREF + mbmi->mode == SR_NEW_NEWMV || +#endif // CONFIG_COMPOUND_SINGLEREF have_nearmv_in_inter_mode(mode)) -#else +#else // !CONFIG_EXT_INTER if (mode == NEARMV || mode == NEWMV) -#endif - write_drl_idx(cm, mbmi, mbmi_ext, w); +#endif // CONFIG_EXT_INTER + write_drl_idx(ec_ctx, mbmi, mbmi_ext, w); else assert(mbmi->ref_mv_idx == 0); } @@ -1873,6 +1892,10 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, #endif // !CONFIG_DUAL_FILTER && !CONFIG_WARPED_MOTION if (bsize < BLOCK_8X8 && !unify_bsize) { +#if CONFIG_COMPOUND_SINGLEREF + /// NOTE: Single ref comp mode does not support sub8x8. + assert(is_compound || !is_inter_singleref_comp_mode(mbmi->mode)); +#endif // CONFIG_COMPOUND_SINGLEREF const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; int idx, idy; @@ -1887,7 +1910,7 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, mbmi->ref_frame, bsize, j); #if CONFIG_EXT_INTER if (is_inter_compound_mode(b_mode)) - write_inter_compound_mode(cm, w, b_mode, mode_ctx); + write_inter_compound_mode(cm, xd, w, b_mode, mode_ctx); else if (is_inter_singleref_mode(b_mode)) #endif // CONFIG_EXT_INTER write_inter_mode(w, b_mode, ec_ctx, mode_ctx); @@ -1969,6 +1992,22 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv, nmvc, allow_hp); +#if CONFIG_COMPOUND_SINGLEREF + } else if ( // mode == SR_NEAREST_NEWMV || + mode == SR_NEAR_NEWMV || mode == SR_ZERO_NEWMV || + mode == SR_NEW_NEWMV) { + int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); + int nmv_ctx = + av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); + nmv_context *nmvc = &ec_ctx->nmvc[nmv_ctx]; + int_mv ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0]; + if (mode == SR_NEW_NEWMV) + av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, + allow_hp); + av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, + allow_hp); +#endif // CONFIG_COMPOUND_SINGLEREF #endif // CONFIG_EXT_INTER } } @@ -1981,13 +2020,23 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, cpi->common.allow_interintra_compound && is_interintra_allowed(mbmi)) { const int interintra = mbmi->ref_frame[1] == INTRA_FRAME; const int bsize_group = size_group_lookup[bsize]; +#if CONFIG_NEW_MULTISYMBOL + aom_write_symbol(w, interintra, ec_ctx->interintra_cdf[bsize_group], 2); +#else aom_write(w, 
interintra, cm->fc->interintra_prob[bsize_group]); +#endif if (interintra) { - write_interintra_mode(w, mbmi->interintra_mode, - cm->fc->interintra_mode_prob[bsize_group]); + aom_write_symbol(w, mbmi->interintra_mode, + ec_ctx->interintra_mode_cdf[bsize_group], + INTERINTRA_MODES); if (is_interintra_wedge_used(bsize)) { +#if CONFIG_NEW_MULTISYMBOL + aom_write_symbol(w, mbmi->use_wedge_interintra, + ec_ctx->wedge_interintra_cdf[bsize], 2); +#else aom_write(w, mbmi->use_wedge_interintra, cm->fc->wedge_interintra_prob[bsize]); +#endif if (mbmi->use_wedge_interintra) { aom_write_literal(w, mbmi->interintra_wedge_index, get_wedge_bits_lookup(bsize)); @@ -2005,21 +2054,28 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, #if CONFIG_EXT_INTER if (mbmi->ref_frame[1] != INTRA_FRAME) #endif // CONFIG_EXT_INTER - write_motion_mode(cm, mi, w); + write_motion_mode(cm, xd, mi, w); +#if CONFIG_NCOBMC_ADAPT_WEIGHT + write_ncobmc_mode(xd, mi, w); +#endif #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION #if CONFIG_EXT_INTER - if (cpi->common.reference_mode != SINGLE_REFERENCE && - is_inter_compound_mode(mbmi->mode) + if ( +#if CONFIG_COMPOUND_SINGLEREF + is_inter_anyref_comp_mode(mbmi->mode) && +#else // !CONFIG_COMPOUND_SINGLEREF + cpi->common.reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && +#endif // CONFIG_COMPOUND_SINGLEREF #if CONFIG_MOTION_VAR - && mbmi->motion_mode == SIMPLE_TRANSLATION + mbmi->motion_mode == SIMPLE_TRANSLATION && #endif // CONFIG_MOTION_VAR - && is_any_masked_compound_used(bsize)) { + is_any_masked_compound_used(bsize)) { #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE if (cm->allow_masked_compound) { - av1_write_token( - w, av1_compound_type_tree, cm->fc->compound_type_prob[bsize], - &compound_type_encodings[mbmi->interinter_compound_type]); + aom_write_symbol(w, mbmi->interinter_compound_type, + ec_ctx->compound_type_cdf[bsize], COMPOUND_TYPES); #if CONFIG_WEDGE if (mbmi->interinter_compound_type == COMPOUND_WEDGE) { aom_write_literal(w, mbmi->wedge_index, get_wedge_bits_lookup(bsize)); @@ -2061,8 +2117,9 @@ static void write_mb_modes_kf(AV1_COMMON *cm, #endif // CONFIG_INTRABC const int mi_row, const int mi_col, aom_writer *w) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const struct segmentation *const seg = &cm->seg; - struct segmentation_probs *const segp = &cm->fc->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; const MODE_INFO *const mi = xd->mi[0]; const MODE_INFO *const above_mi = xd->above_mi; const MODE_INFO *const left_mi = xd->left_mi; @@ -2076,12 +2133,6 @@ static void write_mb_modes_kf(AV1_COMMON *cm, (void)mi_row; (void)mi_col; -#if CONFIG_EC_ADAPT - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; -#else - FRAME_CONTEXT *ec_ctx = cm->fc; -#endif - if (seg->update_map) write_segment_id(w, seg, segp, mbmi->segment_id); #if CONFIG_DELTA_Q @@ -2110,18 +2161,17 @@ static void write_mb_modes_kf(AV1_COMMON *cm, write_skip(cm, xd, mbmi->segment_id, mi, w); #endif - if (cm->tx_mode == TX_MODE_SELECT && + int enable_tx_size = cm->tx_mode == TX_MODE_SELECT && #if CONFIG_CB4X4 && (CONFIG_VAR_TX || CONFIG_RECT_TX) #if CONFIG_RECT_TX - bsize > BLOCK_4X4 && + bsize > BLOCK_4X4 && #else - bsize >= BLOCK_8X8 && + bsize >= BLOCK_8X8 && #endif // CONFIG_RECT_TX #else - bsize >= BLOCK_8X8 && + bsize >= BLOCK_8X8 && #endif - !xd->lossless[mbmi->segment_id]) - write_selected_tx_size(cm, xd, w); + !xd->lossless[mbmi->segment_id]; #if CONFIG_INTRABC if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools) { @@ -2129,7 +2179,8 @@ static void 
write_mb_modes_kf(AV1_COMMON *cm, aom_write(w, use_intrabc, ec_ctx->intrabc_prob); if (use_intrabc) { assert(mbmi->mode == DC_PRED); - assert(mbmi->uv_mode == DC_PRED); + assert(mbmi->uv_mode == UV_DC_PRED); + if (enable_tx_size && !mbmi->skip) write_selected_tx_size(cm, xd, w); int_mv dv_ref = mbmi_ext->ref_mvs[INTRA_FRAME][0]; av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc); #if CONFIG_EXT_TX && !CONFIG_TXK_SEL @@ -2143,6 +2194,7 @@ static void write_mb_modes_kf(AV1_COMMON *cm, } } #endif // CONFIG_INTRABC + if (enable_tx_size) write_selected_tx_size(cm, xd, w); if (bsize >= BLOCK_8X8 || unify_bsize) { write_intra_mode_kf(cm, ec_ctx, mi, above_mi, left_mi, 0, mbmi->mode, w); @@ -2169,9 +2221,8 @@ static void write_mb_modes_kf(AV1_COMMON *cm, #endif // CONFIG_CB4X4 #if CONFIG_CFL - if (mbmi->uv_mode == DC_PRED) { - write_cfl_alphas(ec_ctx, mbmi->skip, mbmi->cfl_alpha_idx, - mbmi->cfl_alpha_signs, w); + if (mbmi->uv_mode == UV_DC_PRED) { + write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); } #endif @@ -2252,6 +2303,89 @@ static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, } #endif +#if ENC_MISMATCH_DEBUG +static void enc_dump_logs(AV1_COMP *cpi, int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + MODE_INFO *m; + xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); + m = xd->mi[0]; + if (is_inter_block(&m->mbmi)) { +#define FRAME_TO_CHECK 1 + if (cm->current_video_frame == FRAME_TO_CHECK /* && cm->show_frame == 1*/) { + const MB_MODE_INFO *const mbmi = &m->mbmi; + const BLOCK_SIZE bsize = mbmi->sb_type; + + int_mv mv[2]; + int is_comp_ref = has_second_ref(&m->mbmi); + int ref; + + for (ref = 0; ref < 1 + is_comp_ref; ++ref) + mv[ref].as_mv = m->mbmi.mv[ref].as_mv; + + if (!is_comp_ref) { +#if CONFIG_COMPOUND_SINGLEREF + if (is_inter_singleref_comp_mode(m->mbmi.mode)) + mv[1].as_mv = m->mbmi.mv[1].as_mv; + else +#endif // CONFIG_COMPOUND_SINGLEREF + mv[1].as_int = 0; + } + int interp_ctx[2] = { -1 }; + int interp_filter[2] = { cm->interp_filter }; + if (cm->interp_filter == SWITCHABLE) { + int dir; + for (dir = 0; dir < 2; ++dir) { + if (has_subpel_mv_component(xd->mi[0], xd, dir) || + (mbmi->ref_frame[1] > INTRA_FRAME && + has_subpel_mv_component(xd->mi[0], xd, dir + 2))) { + interp_ctx[dir] = av1_get_pred_context_switchable_interp(xd, dir); + interp_filter[dir] = mbmi->interp_filter[dir]; + } else { + interp_filter[dir] = EIGHTTAP_REGULAR; + } + } + } + + MACROBLOCK *const x = &cpi->td.mb; + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int16_t mode_ctx = av1_mode_context_analyzer( + mbmi_ext->mode_context, mbmi->ref_frame, bsize, -1); + const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; + int16_t zeromv_ctx = -1; + int16_t refmv_ctx = -1; + if (mbmi->mode != NEWMV) { + zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; + if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) { + assert(mbmi->mode == ZEROMV); + } + if (mbmi->mode != ZEROMV) { + refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; + if (mode_ctx & (1 << SKIP_NEARESTMV_OFFSET)) refmv_ctx = 6; + if (mode_ctx & (1 << SKIP_NEARMV_OFFSET)) refmv_ctx = 7; + if (mode_ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) refmv_ctx = 8; + } + } + + int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + printf( + "=== ENCODER ===: " + "Frame=%d, (mi_row,mi_col)=(%d,%d), mode=%d, bsize=%d, " + "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, " + "ref[1]=%d, 
motion_mode=%d, inter_mode_ctx=%d, mode_ctx=%d, " + "interp_ctx=(%d,%d), interp_filter=(%d,%d), newmv_ctx=%d, " + "zeromv_ctx=%d, refmv_ctx=%d\n", + cm->current_video_frame, mi_row, mi_col, mbmi->mode, bsize, + cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, mv[1].as_mv.row, + mv[1].as_mv.col, mbmi->ref_frame[0], mbmi->ref_frame[1], + mbmi->motion_mode, mbmi_ext->mode_context[ref_frame_type], mode_ctx, + interp_ctx[0], interp_ctx[1], interp_filter[0], interp_filter[1], + newmv_ctx, zeromv_ctx, refmv_ctx); + } + } +} +#endif // ENC_MISMATCH_DEBUG + static void write_mbmi_b(AV1_COMP *cpi, const TileInfo *const tile, aom_writer *w, #if CONFIG_SUPERTX @@ -2265,7 +2399,8 @@ static void write_mbmi_b(AV1_COMP *cpi, const TileInfo *const tile, xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); m = xd->mi[0]; - assert(m->mbmi.sb_type <= cm->sb_size); + assert(m->mbmi.sb_type <= cm->sb_size || + (m->mbmi.sb_type >= BLOCK_4X16 && m->mbmi.sb_type <= BLOCK_32X8)); bh = mi_size_high[m->mbmi.sb_type]; bw = mi_size_wide[m->mbmi.sb_type]; @@ -2291,36 +2426,22 @@ static void write_mbmi_b(AV1_COMP *cpi, const TileInfo *const tile, xd->left_txfm_context = xd->left_txfm_context_buffer + ((mi_row & MAX_MIB_MASK) << TX_UNIT_HIGH_LOG2); #endif -#if CONFIG_DUAL_FILTER +#if CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION // has_subpel_mv_component needs the ref frame buffers set up to look // up if they are scaled. has_subpel_mv_component is in turn needed by // write_switchable_interp_filter, which is called by pack_inter_mode_mvs. set_ref_ptrs(cm, xd, m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]); -#endif // CONFIG_DUAL_FILTER -#if 0 +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(&m->mbmi) && is_inter_singleref_comp_mode(m->mbmi.mode)) + xd->block_refs[1] = xd->block_refs[0]; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_DUAL_FILTER || CONFIG_WARPED_MOTION + +#if ENC_MISMATCH_DEBUG // NOTE(zoeliu): For debug - if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) { - const PREDICTION_MODE mode = m->mbmi.mode; - const int segment_id = m->mbmi.segment_id; - const BLOCK_SIZE bsize = m->mbmi.sb_type; - - // For sub8x8, simply dump out the first sub8x8 block info - const PREDICTION_MODE b_mode = - (bsize < BLOCK_8X8) ? m->bmi[0].as_mode : -1; - const int mv_x = (bsize < BLOCK_8X8) ? - m->bmi[0].as_mv[0].as_mv.row : m->mbmi.mv[0].as_mv.row; - const int mv_y = (bsize < BLOCK_8X8) ? 
- m->bmi[0].as_mv[0].as_mv.col : m->mbmi.mv[0].as_mv.col; - - printf("Before pack_inter_mode_mvs(): " - "Frame=%d, (mi_row,mi_col)=(%d,%d), " - "mode=%d, segment_id=%d, bsize=%d, b_mode=%d, " - "mv[0]=(%d, %d), ref[0]=%d, ref[1]=%d\n", - cm->current_video_frame, mi_row, mi_col, - mode, segment_id, bsize, b_mode, mv_x, mv_y, - m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]); - } -#endif // 0 + enc_dump_logs(cpi, mi_row, mi_col); +#endif // ENC_MISMATCH_DEBUG + pack_inter_mode_mvs(cpi, mi_row, mi_col, #if CONFIG_SUPERTX supertx_enabled, @@ -2335,7 +2456,8 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, int mi_col) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - MODE_INFO *const m = xd->mi[0]; + const int mi_offset = mi_row * cm->mi_stride + mi_col; + MODE_INFO *const m = *(cm->mi_grid_visible + mi_offset); MB_MODE_INFO *const mbmi = &m->mbmi; int plane; int bh, bw; @@ -2344,9 +2466,10 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, (void)tok; (void)tok_end; #endif - xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); + xd->mi = cm->mi_grid_visible + mi_offset; - assert(mbmi->sb_type <= cm->sb_size); + assert(mbmi->sb_type <= cm->sb_size || + (mbmi->sb_type >= BLOCK_4X16 && mbmi->sb_type <= BLOCK_32X8)); bh = mi_size_high[mbmi->sb_type]; bw = mi_size_wide[mbmi->sb_type]; @@ -2371,7 +2494,7 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows, &cols); assert(*tok < tok_end); - pack_palette_tokens(w, tok, palette_size_plane, rows * cols - 1); + pack_palette_tokens(w, tok, palette_size_plane, rows * cols); assert(*tok < tok_end + mbmi->skip); } } @@ -2382,7 +2505,7 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, const struct macroblockd_plane *const pd_y = &xd->plane[0]; const struct macroblockd_plane *const pd_c = &xd->plane[1]; const TX_SIZE tx_log2_y = mbmi->tx_size; - const TX_SIZE tx_log2_c = get_uv_tx_size(mbmi, pd_c); + const TX_SIZE tx_log2_c = av1_get_uv_tx_size(mbmi, pd_c); const int tx_sz_y = (1 << tx_log2_y); const int tx_sz_c = (1 << tx_log2_c); @@ -2469,13 +2592,11 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, #if CONFIG_VAR_TX const struct macroblockd_plane *const pd = &xd->plane[plane]; BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_CB4X4 -#if CONFIG_CHROMA_2X2 - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#else +#if CONFIG_CHROMA_SUB8X8 const BLOCK_SIZE plane_bsize = AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#endif +#elif CONFIG_CB4X4 + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); #else const BLOCK_SIZE plane_bsize = get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd); @@ -2489,6 +2610,15 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, TOKEN_STATS token_stats; init_token_stats(&token_stats); + const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd); + int mu_blocks_wide = + block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; + int mu_blocks_high = + block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; + + mu_blocks_wide = AOMMIN(num_4x4_w, mu_blocks_wide); + mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high); + if (is_inter_block(mbmi)) { const TX_SIZE max_tx_size = get_vartx_max_txsize(mbmi, plane_bsize); int block = 0; @@ -2496,19 +2626,27 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, tx_size_wide_unit[max_tx_size] * 
tx_size_high_unit[max_tx_size]; const int bkw = tx_size_wide_unit[max_tx_size]; const int bkh = tx_size_high_unit[max_tx_size]; - for (row = 0; row < num_4x4_h; row += bkh) { - for (col = 0; col < num_4x4_w; col += bkw) { - pack_txb_tokens(w, + for (row = 0; row < num_4x4_h; row += mu_blocks_high) { + const int unit_height = AOMMIN(mu_blocks_high + row, num_4x4_h); + for (col = 0; col < num_4x4_w; col += mu_blocks_wide) { + int blk_row, blk_col; + const int unit_width = AOMMIN(mu_blocks_wide + col, num_4x4_w); + for (blk_row = row; blk_row < unit_height; blk_row += bkh) { + for (blk_col = col; blk_col < unit_width; blk_col += bkw) { + pack_txb_tokens(w, #if CONFIG_LV_MAP - cm, + cm, #endif - tok, tok_end, + tok, tok_end, #if CONFIG_PVQ || CONFIG_LV_MAP - x, + x, #endif - xd, mbmi, plane, plane_bsize, cm->bit_depth, block, - row, col, max_tx_size, &token_stats); - block += step; + xd, mbmi, plane, plane_bsize, cm->bit_depth, + block, blk_row, blk_col, max_tx_size, + &token_stats); + block += step; + } + } } } #if CONFIG_RD_DEBUG @@ -2522,22 +2660,32 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, #if CONFIG_LV_MAP av1_write_coeffs_mb(cm, x, w, plane); #else - TX_SIZE tx = get_tx_size(plane, xd); + const TX_SIZE tx = av1_get_tx_size(plane, xd); const int bkw = tx_size_wide_unit[tx]; const int bkh = tx_size_high_unit[tx]; - for (row = 0; row < num_4x4_h; row += bkh) { - for (col = 0; col < num_4x4_w; col += bkw) { + int blk_row, blk_col; + + for (row = 0; row < num_4x4_h; row += mu_blocks_high) { + for (col = 0; col < num_4x4_w; col += mu_blocks_wide) { + const int unit_height = AOMMIN(mu_blocks_high + row, num_4x4_h); + const int unit_width = AOMMIN(mu_blocks_wide + col, num_4x4_w); + + for (blk_row = row; blk_row < unit_height; blk_row += bkh) { + for (blk_col = col; blk_col < unit_width; blk_col += bkw) { #if !CONFIG_PVQ - pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats); + pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, + &token_stats); #else - pack_pvq_tokens(w, x, xd, plane, bsize, tx); + pack_pvq_tokens(w, x, xd, plane, bsize, tx); #endif + } + } } } #endif // CONFIG_LV_MAP } #else - TX_SIZE tx = get_tx_size(plane, xd); + const TX_SIZE tx = av1_get_tx_size(plane, xd); TOKEN_STATS token_stats; #if !CONFIG_PVQ init_token_stats(&token_stats); @@ -2570,7 +2718,7 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, #endif // CONFIG_COEF_INTERLEAVE } -#if CONFIG_MOTION_VAR && CONFIG_NCOBMC +#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT) static void write_tokens_sb(AV1_COMP *cpi, const TileInfo *const tile, aom_writer *w, const TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, int mi_row, @@ -2656,7 +2804,7 @@ static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, supertx_enabled, #endif mi_row, mi_col); -#if CONFIG_MOTION_VAR && CONFIG_NCOBMC +#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT) (void)tok; (void)tok_end; #else @@ -2688,12 +2836,8 @@ static void write_partition(const AV1_COMMON *const cm, const aom_prob *const probs = cm->fc->partition_prob[ctx]; #endif -#if CONFIG_EC_ADAPT FRAME_CONTEXT *ec_ctx = xd->tile_ctx; (void)cm; -#else - FRAME_CONTEXT *ec_ctx = cm->fc; -#endif if (!is_partition_point) return; @@ -2738,6 +2882,10 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; const int hbs = mi_size_wide[bsize] / 2; +#if CONFIG_EXT_PARTITION_TYPES + 
const int quarter_step = mi_size_wide[bsize] / 4; + int i; +#endif const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); const BLOCK_SIZE subsize = get_subsize(bsize, partition); #if CONFIG_CB4X4 @@ -2843,6 +2991,24 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row + hbs, mi_col + hbs); break; + case PARTITION_HORZ_4: + for (i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= cm->mi_rows) break; + + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + this_mi_row, mi_col); + } + break; + case PARTITION_VERT_4: + for (i = 0; i < 4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= cm->mi_cols) break; + + write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, + mi_row, this_mi_col); + } + break; #endif // CONFIG_EXT_PARTITION_TYPES default: assert(0); } @@ -2865,17 +3031,15 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, assert(mbmi->segment_id_supertx < MAX_SEGMENTS); skip = write_skip(cm, xd, mbmi->segment_id_supertx, xd->mi[0], w); + + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + #if CONFIG_EXT_TX if (get_ext_tx_types(supertx_size, bsize, 1, cm->reduced_tx_set_used) > 1 && !skip) { const int eset = get_ext_tx_set(supertx_size, bsize, 1, cm->reduced_tx_set_used); if (eset > 0) { -#if CONFIG_EC_ADAPT - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; -#else - FRAME_CONTEXT *ec_ctx = cm->fc; -#endif aom_write_symbol(w, av1_ext_tx_inter_ind[eset][mbmi->tx_type], ec_ctx->inter_ext_tx_cdf[eset][supertx_size], ext_tx_cnt_inter[eset]); @@ -2883,9 +3047,8 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, } #else if (supertx_size < TX_32X32 && !skip) { - av1_write_token(w, av1_ext_tx_tree, - cm->fc->inter_ext_tx_prob[supertx_size], - &ext_tx_encodings[mbmi->tx_type]); + aom_write_symbol(w, mbmi->tx_type, ec_ctx->inter_ext_tx_cdf[supertx_size], + TX_TYPES); } #endif // CONFIG_EXT_TX @@ -2900,7 +3063,7 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, const int max_blocks_high = max_block_high(xd, plane_bsize, plane); int row, col; - TX_SIZE tx = get_tx_size(plane, xd); + const TX_SIZE tx = av1_get_tx_size(plane, xd); BLOCK_SIZE txb_size = txsize_to_bsize[tx]; const int stepr = tx_size_high_unit[txb_size]; @@ -2934,11 +3097,24 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, #endif // CONFIG_EXT_PARTITION_TYPES #if CONFIG_CDEF - if (bsize == cm->sb_size && !sb_all_skip(cm, mi_row, mi_col) && - cm->cdef_bits != 0) { - aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col] - ->mbmi.cdef_strength, - cm->cdef_bits); + if (bsize == cm->sb_size && cm->cdef_bits != 0 && !cm->all_lossless) { + int width_step = mi_size_wide[BLOCK_64X64]; + int height_step = mi_size_high[BLOCK_64X64]; + int width, height; + for (height = 0; (height < mi_size_high[cm->sb_size]) && + (mi_row + height < cm->mi_rows); + height += height_step) { + for (width = 0; (width < mi_size_wide[cm->sb_size]) && + (mi_col + width < cm->mi_cols); + width += width_step) { + if (!sb_all_skip(cm, mi_row + height, mi_col + width)) + aom_write_literal( + w, cm->mi_grid_visible[(mi_row + height) * cm->mi_stride + + (mi_col + width)] + ->mbmi.cdef_strength, + cm->cdef_bits); + } + } } #endif } @@ -2955,12 +3131,8 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile, int mi_row, 
mi_col; #if CONFIG_DEPENDENT_HORZTILES -#if CONFIG_TILE_GROUPS if (!cm->dependent_horz_tiles || mi_row_start == 0 || tile->tg_horz_boundary) { -#else - if (!cm->dependent_horz_tiles || mi_row_start == 0) { -#endif av1_zero_above_context(cm, mi_col_start, mi_col_end); } #else @@ -2986,7 +3158,7 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile, for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += cm->mib_size) { write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, 0, mi_row, mi_col, cm->sb_size); -#if CONFIG_MOTION_VAR && CONFIG_NCOBMC +#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT) write_tokens_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, cm->sb_size); #endif } @@ -3000,214 +3172,6 @@ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile, #endif } -#if !CONFIG_LV_MAP -#if !CONFIG_PVQ && !CONFIG_EC_ADAPT -static void build_tree_distribution(AV1_COMP *cpi, TX_SIZE tx_size, - av1_coeff_stats *coef_branch_ct, - av1_coeff_probs_model *coef_probs) { - av1_coeff_count *coef_counts = cpi->td.rd_counts.coef_counts[tx_size]; - unsigned int(*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] = - cpi->common.counts.eob_branch[tx_size]; - int i, j, k, l, m; -#if CONFIG_RECT_TX - assert(!is_rect_tx(tx_size)); -#endif // CONFIG_RECT_TX - - for (i = 0; i < PLANE_TYPES; ++i) { - for (j = 0; j < REF_TYPES; ++j) { - for (k = 0; k < COEF_BANDS; ++k) { - for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { - av1_tree_probs_from_distribution(av1_coef_tree, - coef_branch_ct[i][j][k][l], - coef_counts[i][j][k][l]); - coef_branch_ct[i][j][k][l][0][1] = - eob_branch_ct[i][j][k][l] - coef_branch_ct[i][j][k][l][0][0]; - for (m = 0; m < UNCONSTRAINED_NODES; ++m) - coef_probs[i][j][k][l][m] = - get_binary_prob(coef_branch_ct[i][j][k][l][m][0], - coef_branch_ct[i][j][k][l][m][1]); - } - } - } - } -} - -#if !CONFIG_EC_ADAPT -static void update_coef_probs_common(aom_writer *const bc, AV1_COMP *cpi, - TX_SIZE tx_size, - av1_coeff_stats *frame_branch_ct, - av1_coeff_probs_model *new_coef_probs) { - av1_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size]; - const aom_prob upd = DIFF_UPDATE_PROB; -#if CONFIG_EC_ADAPT - const int entropy_nodes_update = UNCONSTRAINED_NODES - 1; -#else - const int entropy_nodes_update = UNCONSTRAINED_NODES; -#endif - int i, j, k, l, t; - int stepsize = cpi->sf.coeff_prob_appx_step; -#if CONFIG_TILE_GROUPS - const int probwt = cpi->common.num_tg; -#else - const int probwt = 1; -#endif -#if CONFIG_RECT_TX - assert(!is_rect_tx(tx_size)); -#endif // CONFIG_RECT_TX - - switch (cpi->sf.use_fast_coef_updates) { - case TWO_LOOP: { - /* dry run to see if there is any update at all needed */ - int savings = 0; - int update[2] = { 0, 0 }; - for (i = 0; i < PLANE_TYPES; ++i) { - for (j = 0; j < REF_TYPES; ++j) { - for (k = 0; k < COEF_BANDS; ++k) { - for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { - for (t = 0; t < entropy_nodes_update; ++t) { - aom_prob newp = new_coef_probs[i][j][k][l][t]; - const aom_prob oldp = old_coef_probs[i][j][k][l][t]; - int s; - int u = 0; - if (t == PIVOT_NODE) - s = av1_prob_diff_update_savings_search_model( - frame_branch_ct[i][j][k][l][0], oldp, &newp, upd, - stepsize, probwt); - else - s = av1_prob_diff_update_savings_search( - frame_branch_ct[i][j][k][l][t], oldp, &newp, upd, probwt); - - if (s > 0 && newp != oldp) u = 1; - if (u) - savings += s - (int)(av1_cost_zero(upd)); - else - savings -= (int)(av1_cost_zero(upd)); - update[u]++; - } - } - } - } - } - - /* Is coef updated at all */ 
- if (update[1] == 0 || savings < 0) { - aom_write_bit(bc, 0); - return; - } - aom_write_bit(bc, 1); - for (i = 0; i < PLANE_TYPES; ++i) { - for (j = 0; j < REF_TYPES; ++j) { - for (k = 0; k < COEF_BANDS; ++k) { - for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { - // calc probs and branch cts for this frame only - for (t = 0; t < entropy_nodes_update; ++t) { - aom_prob newp = new_coef_probs[i][j][k][l][t]; - aom_prob *oldp = old_coef_probs[i][j][k][l] + t; - int s; - int u = 0; - if (t == PIVOT_NODE) - s = av1_prob_diff_update_savings_search_model( - frame_branch_ct[i][j][k][l][0], *oldp, &newp, upd, - stepsize, probwt); - else - s = av1_prob_diff_update_savings_search( - frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd, - probwt); - if (s > 0 && newp != *oldp) u = 1; - aom_write(bc, u, upd); - if (u) { - /* send/use new probability */ - av1_write_prob_diff_update(bc, newp, *oldp); - *oldp = newp; - } - } - } - } - } - } - return; - } - - case ONE_LOOP_REDUCED: { - int updates = 0; - int noupdates_before_first = 0; - for (i = 0; i < PLANE_TYPES; ++i) { - for (j = 0; j < REF_TYPES; ++j) { - for (k = 0; k < COEF_BANDS; ++k) { - for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { - // calc probs and branch cts for this frame only - for (t = 0; t < entropy_nodes_update; ++t) { - aom_prob newp = new_coef_probs[i][j][k][l][t]; - aom_prob *oldp = old_coef_probs[i][j][k][l] + t; - int s; - int u = 0; - if (t == PIVOT_NODE) { - s = av1_prob_diff_update_savings_search_model( - frame_branch_ct[i][j][k][l][0], *oldp, &newp, upd, - stepsize, probwt); - } else { - s = av1_prob_diff_update_savings_search( - frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd, - probwt); - } - - if (s > 0 && newp != *oldp) u = 1; - updates += u; - if (u == 0 && updates == 0) { - noupdates_before_first++; - continue; - } - if (u == 1 && updates == 1) { - int v; - // first update - aom_write_bit(bc, 1); - for (v = 0; v < noupdates_before_first; ++v) - aom_write(bc, 0, upd); - } - aom_write(bc, u, upd); - if (u) { - /* send/use new probability */ - av1_write_prob_diff_update(bc, newp, *oldp); - *oldp = newp; - } - } - } - } - } - } - if (updates == 0) { - aom_write_bit(bc, 0); // no updates - } - return; - } - default: assert(0); - } -} -#endif - -#if !CONFIG_EC_ADAPT -static void update_coef_probs(AV1_COMP *cpi, aom_writer *w) { - const TX_MODE tx_mode = cpi->common.tx_mode; - const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; - TX_SIZE tx_size; - - for (tx_size = 0; tx_size <= max_tx_size; ++tx_size) { - av1_coeff_stats frame_branch_ct[PLANE_TYPES]; - av1_coeff_probs_model frame_coef_probs[PLANE_TYPES]; - if (cpi->td.counts->tx_size_totals[tx_size] <= 20 || CONFIG_RD_DEBUG || - (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) { - aom_write_bit(w, 0); - } else { - build_tree_distribution(cpi, tx_size, frame_branch_ct, frame_coef_probs); - update_coef_probs_common(w, cpi, tx_size, frame_branch_ct, - frame_coef_probs); - } - } -} -#endif // !CONFIG_EC_ADAPT -#endif // !CONFIG_EC_ADAPT -#endif // !CONFIG_LV_MAP - #if CONFIG_LOOP_RESTORATION static void encode_restoration_mode(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { @@ -3257,6 +3221,23 @@ static void encode_restoration_mode(AV1_COMMON *cm, wb, rsi->restoration_tilesize != (RESTORATION_TILESIZE_MAX >> 1)); } } + int s = AOMMIN(cm->subsampling_x, cm->subsampling_y); + if (s && (cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE)) { + aom_wb_write_bit(wb, 
cm->rst_info[1].restoration_tilesize != + cm->rst_info[0].restoration_tilesize); + assert(cm->rst_info[1].restoration_tilesize == + cm->rst_info[0].restoration_tilesize || + cm->rst_info[1].restoration_tilesize == + (cm->rst_info[0].restoration_tilesize >> s)); + assert(cm->rst_info[2].restoration_tilesize == + cm->rst_info[1].restoration_tilesize); + } else if (!s) { + assert(cm->rst_info[1].restoration_tilesize == + cm->rst_info[0].restoration_tilesize); + assert(cm->rst_info[2].restoration_tilesize == + cm->rst_info[1].restoration_tilesize); + } } static void write_wiener_filter(WienerInfo *wiener_info, @@ -3311,16 +3292,23 @@ static void write_sgrproj_filter(SgrprojInfo *sgrproj_info, static void encode_restoration(AV1_COMMON *cm, aom_writer *wb) { int i, p; - const int ntiles = av1_get_rest_ntiles(cm->width, cm->height, - cm->rst_info[0].restoration_tilesize, - NULL, NULL, NULL, NULL); +#if CONFIG_FRAME_SUPERRES + const int width = cm->superres_upscaled_width; + const int height = cm->superres_upscaled_height; +#else + const int width = cm->width; + const int height = cm->height; +#endif // CONFIG_FRAME_SUPERRES + const int ntiles = + av1_get_rest_ntiles(width, height, cm->rst_info[0].restoration_tilesize, + NULL, NULL, NULL, NULL); WienerInfo ref_wiener_info; SgrprojInfo ref_sgrproj_info; set_default_wiener(&ref_wiener_info); set_default_sgrproj(&ref_sgrproj_info); const int ntiles_uv = av1_get_rest_ntiles( - ROUND_POWER_OF_TWO(cm->width, cm->subsampling_x), - ROUND_POWER_OF_TWO(cm->height, cm->subsampling_y), + ROUND_POWER_OF_TWO(width, cm->subsampling_x), + ROUND_POWER_OF_TWO(height, cm->subsampling_y), cm->rst_info[1].restoration_tilesize, NULL, NULL, NULL, NULL); RestorationInfo *rsi = &cm->rst_info[0]; if (rsi->frame_restoration_type != RESTORE_NONE) { @@ -3389,6 +3377,12 @@ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { // Encode the loop filter level and type aom_wb_write_literal(wb, lf->filter_level, 6); +#if CONFIG_UV_LVL + if (lf->filter_level > 0) { + aom_wb_write_literal(wb, lf->filter_level_u, 6); + aom_wb_write_literal(wb, lf->filter_level_v, 6); + } +#endif aom_wb_write_literal(wb, lf->sharpness_level, 3); // Write out loop filter deltas applied at the MB level based on mode or @@ -3509,51 +3503,17 @@ static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd, } } -#if !CONFIG_EC_ADAPT -static void update_seg_probs(AV1_COMP *cpi, aom_writer *w) { - AV1_COMMON *cm = &cpi->common; -#if CONFIG_TILE_GROUPS - const int probwt = cm->num_tg; -#else - const int probwt = 1; -#endif - - if (!cm->seg.enabled || !cm->seg.update_map) return; - - if (cm->seg.temporal_update) { - int i; - - for (i = 0; i < PREDICTION_PROBS; i++) - av1_cond_prob_diff_update(w, &cm->fc->seg.pred_probs[i], - cm->counts.seg.pred[i], probwt); - - prob_diff_update(av1_segment_tree, cm->fc->seg.tree_probs, - cm->counts.seg.tree_mispred, MAX_SEGMENTS, probwt, w); - } else { - prob_diff_update(av1_segment_tree, cm->fc->seg.tree_probs, - cm->counts.seg.tree_total, MAX_SEGMENTS, probwt, w); - } -} -#endif - -static void write_tx_mode(AV1_COMMON *cm, MACROBLOCKD *xd, TX_MODE *mode, +static void write_tx_mode(AV1_COMMON *cm, TX_MODE *mode, struct aom_write_bit_buffer *wb) { - int i, all_lossless = 1; - - if (cm->seg.enabled) { - for (i = 0; i < MAX_SEGMENTS; ++i) { - if (!xd->lossless[i]) { - all_lossless = 0; - break; - } - } - } else { - all_lossless = xd->lossless[0]; - } - if (all_lossless) { + if (cm->all_lossless) { *mode = ONLY_4X4; return; } +#if 
CONFIG_VAR_TX_NO_TX_MODE + (void)wb; + *mode = TX_MODE_SELECT; + return; +#else #if CONFIG_TX64X64 aom_wb_write_bit(wb, *mode == TX_MODE_SELECT); if (*mode != TX_MODE_SELECT) { @@ -3564,26 +3524,9 @@ static void write_tx_mode(AV1_COMMON *cm, MACROBLOCKD *xd, TX_MODE *mode, aom_wb_write_bit(wb, *mode == TX_MODE_SELECT); if (*mode != TX_MODE_SELECT) aom_wb_write_literal(wb, *mode, 2); #endif // CONFIG_TX64X64 +#endif // CONFIG_VAR_TX_NO_TX_MODE } -#if !CONFIG_EC_ADAPT -static void update_txfm_probs(AV1_COMMON *cm, aom_writer *w, - FRAME_COUNTS *counts) { -#if CONFIG_TILE_GROUPS - const int probwt = cm->num_tg; -#else - const int probwt = 1; -#endif - if (cm->tx_mode == TX_MODE_SELECT) { - int i, j; - for (i = 0; i < MAX_TX_DEPTH; ++i) - for (j = 0; j < TX_SIZE_CONTEXTS; ++j) - prob_diff_update(av1_tx_size_tree[i], cm->fc->tx_size_probs[i][j], - counts->tx_size[i][j], i + 2, probwt, w); - } -} -#endif - static void write_frame_interp_filter(InterpFilter filter, struct aom_write_bit_buffer *wb) { aom_wb_write_bit(wb, filter == SWITCHABLE); @@ -3624,52 +3567,52 @@ static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) { static void write_tile_info(const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { #if CONFIG_EXT_TILE - const int tile_width = - ALIGN_POWER_OF_TWO(cm->tile_width, cm->mib_size_log2) >> - cm->mib_size_log2; - const int tile_height = - ALIGN_POWER_OF_TWO(cm->tile_height, cm->mib_size_log2) >> - cm->mib_size_log2; + if (cm->large_scale_tile) { + const int tile_width = + ALIGN_POWER_OF_TWO(cm->tile_width, cm->mib_size_log2) >> + cm->mib_size_log2; + const int tile_height = + ALIGN_POWER_OF_TWO(cm->tile_height, cm->mib_size_log2) >> + cm->mib_size_log2; - assert(tile_width > 0); - assert(tile_height > 0); - - aom_wb_write_literal(wb, cm->tile_encoding_mode, 1); + assert(tile_width > 0); + assert(tile_height > 0); // Write the tile sizes #if CONFIG_EXT_PARTITION - if (cm->sb_size == BLOCK_128X128) { - assert(tile_width <= 32); - assert(tile_height <= 32); - aom_wb_write_literal(wb, tile_width - 1, 5); - aom_wb_write_literal(wb, tile_height - 1, 5); - } else + if (cm->sb_size == BLOCK_128X128) { + assert(tile_width <= 32); + assert(tile_height <= 32); + aom_wb_write_literal(wb, tile_width - 1, 5); + aom_wb_write_literal(wb, tile_height - 1, 5); + } else { #endif // CONFIG_EXT_PARTITION - { - assert(tile_width <= 64); - assert(tile_height <= 64); - aom_wb_write_literal(wb, tile_width - 1, 6); - aom_wb_write_literal(wb, tile_height - 1, 6); - } -#if CONFIG_DEPENDENT_HORZTILES - if (tile_height > 1) aom_wb_write_bit(wb, cm->dependent_horz_tiles); -#endif -#else - int min_log2_tile_cols, max_log2_tile_cols, ones; - av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + assert(tile_width <= 64); + assert(tile_height <= 64); + aom_wb_write_literal(wb, tile_width - 1, 6); + aom_wb_write_literal(wb, tile_height - 1, 6); +#if CONFIG_EXT_PARTITION + } +#endif // CONFIG_EXT_PARTITION + } else { +#endif // CONFIG_EXT_TILE + int min_log2_tile_cols, max_log2_tile_cols, ones; + av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - // columns - ones = cm->log2_tile_cols - min_log2_tile_cols; - while (ones--) aom_wb_write_bit(wb, 1); + // columns + ones = cm->log2_tile_cols - min_log2_tile_cols; + while (ones--) aom_wb_write_bit(wb, 1); - if (cm->log2_tile_cols < max_log2_tile_cols) aom_wb_write_bit(wb, 0); + if (cm->log2_tile_cols < max_log2_tile_cols) aom_wb_write_bit(wb, 0); - // rows - aom_wb_write_bit(wb, cm->log2_tile_rows != 
0); - if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->log2_tile_rows != 1); + // rows + aom_wb_write_bit(wb, cm->log2_tile_rows != 0); + if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->log2_tile_rows != 1); #if CONFIG_DEPENDENT_HORZTILES - if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->dependent_horz_tiles); + if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->dependent_horz_tiles); #endif +#if CONFIG_EXT_TILE + } #endif // CONFIG_EXT_TILE #if CONFIG_LOOPFILTERING_ACROSS_TILES @@ -3782,16 +3725,9 @@ static INLINE int find_identical_tile( } #endif // CONFIG_EXT_TILE -#if CONFIG_TILE_GROUPS -static uint32_t write_tiles(AV1_COMP *const cpi, - struct aom_write_bit_buffer *wb, - unsigned int *max_tile_size, - unsigned int *max_tile_col_size) { -#else static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst, unsigned int *max_tile_size, unsigned int *max_tile_col_size) { -#endif const AV1_COMMON *const cm = &cpi->common; #if CONFIG_ANS struct BufAnsCoder *buf_ans = &cpi->buf_ans; @@ -3805,19 +3741,24 @@ static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst, const int tile_cols = cm->tile_cols; const int tile_rows = cm->tile_rows; unsigned int tile_size = 0; -#if CONFIG_TILE_GROUPS + const int have_tiles = tile_cols * tile_rows > 1; + struct aom_write_bit_buffer wb = { dst, 0 }; const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols; - const int have_tiles = n_log2_tiles > 0; uint32_t comp_hdr_size; // Fixed size tile groups for the moment const int num_tg_hdrs = cm->num_tg; - const int tg_size = (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; + const int tg_size = +#if CONFIG_EXT_TILE + (cm->large_scale_tile) + ? 1 + : +#endif // CONFIG_EXT_TILE + (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; int tile_count = 0; int tg_count = 1; int tile_size_bytes = 4; int tile_col_size_bytes; uint32_t uncompressed_hdr_size = 0; - uint8_t *dst = NULL; struct aom_write_bit_buffer comp_hdr_len_wb; struct aom_write_bit_buffer tg_params_wb; struct aom_write_bit_buffer tile_size_bytes_wb; @@ -3825,10 +3766,6 @@ static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst, int mtu_size = cpi->oxcf.mtu; int curr_tg_data_size = 0; int hdr_size; -#endif -#if CONFIG_EXT_TILE - const int have_tiles = tile_cols * tile_rows > 1; -#endif // CONFIG_EXT_TILE *max_tile_size = 0; *max_tile_col_size = 0; @@ -3837,282 +3774,274 @@ static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst, // later compact the data if smaller headers are adequate. #if CONFIG_EXT_TILE - for (tile_col = 0; tile_col < tile_cols; tile_col++) { - TileInfo tile_info; - const int is_last_col = (tile_col == tile_cols - 1); - const uint32_t col_offset = total_size; - - av1_tile_set_col(&tile_info, cm, tile_col); - - // The last column does not have a column header - if (!is_last_col) total_size += 4; - - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; - const int data_offset = have_tiles ? 
4 : 0; -#if CONFIG_EC_ADAPT - const int tile_idx = tile_row * tile_cols + tile_col; - TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; -#endif - av1_tile_set_row(&tile_info, cm, tile_row); + if (cm->large_scale_tile) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileInfo tile_info; + const int is_last_col = (tile_col == tile_cols - 1); + const uint32_t col_offset = total_size; - buf->data = dst + total_size; + av1_tile_set_col(&tile_info, cm, tile_col); - // Is CONFIG_EXT_TILE = 1, every tile in the row has a header, - // even for the last one, unless no tiling is used at all. - total_size += data_offset; -#if CONFIG_EC_ADAPT - // Initialise tile context from the frame context - this_tile->tctx = *cm->fc; - cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; -#endif + // The last column does not have a column header + if (!is_last_col) total_size += 4; + + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; + const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; + const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; + const int data_offset = have_tiles ? 4 : 0; + const int tile_idx = tile_row * tile_cols + tile_col; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; + av1_tile_set_row(&tile_info, cm, tile_row); + + buf->data = dst + total_size; + + // Is CONFIG_EXT_TILE = 1, every tile in the row has a header, + // even for the last one, unless no tiling is used at all. + total_size += data_offset; + // Initialise tile context from the frame context + this_tile->tctx = *cm->fc; + cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; #if CONFIG_PVQ - cpi->td.mb.pvq_q = &this_tile->pvq_q; - cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; + cpi->td.mb.pvq_q = &this_tile->pvq_q; + cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; #endif // CONFIG_PVQ #if !CONFIG_ANS - aom_start_encode(&mode_bc, buf->data + data_offset); - write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); - assert(tok == tok_end); - aom_stop_encode(&mode_bc); - tile_size = mode_bc.pos; + aom_start_encode(&mode_bc, buf->data + data_offset); + write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); + assert(tok == tok_end); + aom_stop_encode(&mode_bc); + tile_size = mode_bc.pos; #else - buf_ans_write_init(buf_ans, buf->data + data_offset); - write_modes(cpi, &tile_info, buf_ans, &tok, tok_end); - assert(tok == tok_end); - aom_buf_ans_flush(buf_ans); - tile_size = buf_ans_write_end(buf_ans); + buf_ans_write_init(buf_ans, buf->data + data_offset); + write_modes(cpi, &tile_info, buf_ans, &tok, tok_end); + assert(tok == tok_end); + aom_buf_ans_flush(buf_ans); + tile_size = buf_ans_write_end(buf_ans); #endif // !CONFIG_ANS #if CONFIG_PVQ - cpi->td.mb.pvq_q = NULL; + cpi->td.mb.pvq_q = NULL; #endif - buf->size = tile_size; - - // Record the maximum tile size we see, so we can compact headers later. - *max_tile_size = AOMMAX(*max_tile_size, tile_size); - - if (have_tiles) { - // tile header: size of this tile, or copy offset - uint32_t tile_header = tile_size; + buf->size = tile_size; - // If the tile_encoding_mode is 1 (i.e. TILE_VR), check if this tile is - // a copy tile. - // Very low chances to have copy tiles on the key frames, so don't - // search on key frames to reduce unnecessary search. 
- if (cm->frame_type != KEY_FRAME && cm->tile_encoding_mode) { - const int idendical_tile_offset = - find_identical_tile(tile_row, tile_col, tile_buffers); + // Record the maximum tile size we see, so we can compact headers later. + *max_tile_size = AOMMAX(*max_tile_size, tile_size); - if (idendical_tile_offset > 0) { - tile_size = 0; - tile_header = idendical_tile_offset | 0x80; - tile_header <<= 24; + if (have_tiles) { + // tile header: size of this tile, or copy offset + uint32_t tile_header = tile_size; + const int tile_copy_mode = + ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256) + ? 1 + : 0; + + // If tile_copy_mode = 1, check if this tile is a copy tile. + // Very low chances to have copy tiles on the key frames, so don't + // search on key frames to reduce unnecessary search. + if (cm->frame_type != KEY_FRAME && tile_copy_mode) { + const int idendical_tile_offset = + find_identical_tile(tile_row, tile_col, tile_buffers); + + if (idendical_tile_offset > 0) { + tile_size = 0; + tile_header = idendical_tile_offset | 0x80; + tile_header <<= 24; + } } + + mem_put_le32(buf->data, tile_header); } - mem_put_le32(buf->data, tile_header); + total_size += tile_size; } - total_size += tile_size; - } - - if (!is_last_col) { - uint32_t col_size = total_size - col_offset - 4; - mem_put_le32(dst + col_offset, col_size); + if (!is_last_col) { + uint32_t col_size = total_size - col_offset - 4; + mem_put_le32(dst + col_offset, col_size); - // If it is not final packing, record the maximum tile column size we see, - // otherwise, check if the tile size is out of the range. - *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size); + // If it is not final packing, record the maximum tile column size we + // see, otherwise, check if the tile size is out of the range. 
+ *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size); + } } - } -#else -#if CONFIG_TILE_GROUPS - write_uncompressed_header(cpi, wb); + } else { +#endif // CONFIG_EXT_TILE + write_uncompressed_header(cpi, &wb); #if CONFIG_EXT_REFS - if (cm->show_existing_frame) { - total_size = aom_wb_bytes_written(wb); - return (uint32_t)total_size; - } + if (cm->show_existing_frame) { + total_size = aom_wb_bytes_written(&wb); + return (uint32_t)total_size; + } #endif // CONFIG_EXT_REFS - // Write the tile length code - tile_size_bytes_wb = *wb; - aom_wb_write_literal(wb, 3, 2); + // Write the tile length code + tile_size_bytes_wb = wb; + aom_wb_write_literal(&wb, 3, 2); - /* Write a placeholder for the number of tiles in each tile group */ - tg_params_wb = *wb; - saved_offset = wb->bit_offset; - if (have_tiles) { - aom_wb_overwrite_literal(wb, 3, n_log2_tiles); - aom_wb_overwrite_literal(wb, (1 << n_log2_tiles) - 1, n_log2_tiles); - } - - /* Write a placeholder for the compressed header length */ - comp_hdr_len_wb = *wb; - aom_wb_write_literal(wb, 0, 16); - - uncompressed_hdr_size = aom_wb_bytes_written(wb); - dst = wb->bit_buffer; - comp_hdr_size = write_compressed_header(cpi, dst + uncompressed_hdr_size); - aom_wb_overwrite_literal(&comp_hdr_len_wb, (int)(comp_hdr_size), 16); - hdr_size = uncompressed_hdr_size + comp_hdr_size; - total_size += hdr_size; -#endif + /* Write a placeholder for the number of tiles in each tile group */ + tg_params_wb = wb; + saved_offset = wb.bit_offset; + if (have_tiles) { + aom_wb_overwrite_literal(&wb, 3, n_log2_tiles); + aom_wb_overwrite_literal(&wb, (1 << n_log2_tiles) - 1, n_log2_tiles); + } - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - TileInfo tile_info; - const int is_last_row = (tile_row == tile_rows - 1); - av1_tile_set_row(&tile_info, cm, tile_row); + /* Write a placeholder for the compressed header length */ + comp_hdr_len_wb = wb; + aom_wb_write_literal(&wb, 0, 16); - for (tile_col = 0; tile_col < tile_cols; tile_col++) { - const int tile_idx = tile_row * tile_cols + tile_col; - TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; -#if CONFIG_PVQ || CONFIG_EC_ADAPT - TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; -#endif - const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; - const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; - const int is_last_col = (tile_col == tile_cols - 1); - const int is_last_tile = is_last_col && is_last_row; -#if !CONFIG_TILE_GROUPS - (void)tile_idx; -#else + uncompressed_hdr_size = aom_wb_bytes_written(&wb); + comp_hdr_size = write_compressed_header(cpi, dst + uncompressed_hdr_size); + aom_wb_overwrite_literal(&comp_hdr_len_wb, (int)(comp_hdr_size), 16); + hdr_size = uncompressed_hdr_size + comp_hdr_size; + total_size += hdr_size; - if ((!mtu_size && tile_count > tg_size) || - (mtu_size && tile_count && curr_tg_data_size >= mtu_size)) { - // New tile group - tg_count++; - // We've exceeded the packet size - if (tile_count > 1) { - /* The last tile exceeded the packet size. The tile group size - should therefore be tile_count-1. 
- Move the last tile and insert headers before it - */ - uint32_t old_total_size = total_size - tile_size - 4; - memmove(dst + old_total_size + hdr_size, dst + old_total_size, - (tile_size + 4) * sizeof(uint8_t)); - // Copy uncompressed header - memmove(dst + old_total_size, dst, - uncompressed_hdr_size * sizeof(uint8_t)); - // Write the number of tiles in the group into the last uncompressed - // header before the one we've just inserted - aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count, - n_log2_tiles); - aom_wb_overwrite_literal(&tg_params_wb, tile_count - 2, n_log2_tiles); - // Update the pointer to the last TG params - tg_params_wb.bit_offset = saved_offset + 8 * old_total_size; - // Copy compressed header - memmove(dst + old_total_size + uncompressed_hdr_size, - dst + uncompressed_hdr_size, comp_hdr_size * sizeof(uint8_t)); - total_size += hdr_size; - tile_count = 1; - curr_tg_data_size = hdr_size + tile_size + 4; + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + TileInfo tile_info; + const int is_last_row = (tile_row == tile_rows - 1); + av1_tile_set_row(&tile_info, cm, tile_row); - } else { - // We exceeded the packet size in just one tile - // Copy uncompressed header - memmove(dst + total_size, dst, - uncompressed_hdr_size * sizeof(uint8_t)); - // Write the number of tiles in the group into the last uncompressed - // header - aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count, - n_log2_tiles); - aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, n_log2_tiles); - tg_params_wb.bit_offset = saved_offset + 8 * total_size; - // Copy compressed header - memmove(dst + total_size + uncompressed_hdr_size, - dst + uncompressed_hdr_size, comp_hdr_size * sizeof(uint8_t)); - total_size += hdr_size; - tile_count = 0; - curr_tg_data_size = hdr_size; + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + const int tile_idx = tile_row * tile_cols + tile_col; + TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; + const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col]; + const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col]; + const int is_last_col = (tile_col == tile_cols - 1); + const int is_last_tile = is_last_col && is_last_row; + + if ((!mtu_size && tile_count > tg_size) || + (mtu_size && tile_count && curr_tg_data_size >= mtu_size)) { + // New tile group + tg_count++; + // We've exceeded the packet size + if (tile_count > 1) { + /* The last tile exceeded the packet size. The tile group size + should therefore be tile_count-1. 
+ Move the last tile and insert headers before it + */ + uint32_t old_total_size = total_size - tile_size - 4; + memmove(dst + old_total_size + hdr_size, dst + old_total_size, + (tile_size + 4) * sizeof(uint8_t)); + // Copy uncompressed header + memmove(dst + old_total_size, dst, + uncompressed_hdr_size * sizeof(uint8_t)); + // Write the number of tiles in the group into the last uncompressed + // header before the one we've just inserted + aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count, + n_log2_tiles); + aom_wb_overwrite_literal(&tg_params_wb, tile_count - 2, + n_log2_tiles); + // Update the pointer to the last TG params + tg_params_wb.bit_offset = saved_offset + 8 * old_total_size; + // Copy compressed header + memmove(dst + old_total_size + uncompressed_hdr_size, + dst + uncompressed_hdr_size, + comp_hdr_size * sizeof(uint8_t)); + total_size += hdr_size; + tile_count = 1; + curr_tg_data_size = hdr_size + tile_size + 4; + } else { + // We exceeded the packet size in just one tile + // Copy uncompressed header + memmove(dst + total_size, dst, + uncompressed_hdr_size * sizeof(uint8_t)); + // Write the number of tiles in the group into the last uncompressed + // header + aom_wb_overwrite_literal(&tg_params_wb, tile_idx - tile_count, + n_log2_tiles); + aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, + n_log2_tiles); + tg_params_wb.bit_offset = saved_offset + 8 * total_size; + // Copy compressed header + memmove(dst + total_size + uncompressed_hdr_size, + dst + uncompressed_hdr_size, + comp_hdr_size * sizeof(uint8_t)); + total_size += hdr_size; + tile_count = 0; + curr_tg_data_size = hdr_size; + } } - } - tile_count++; -#endif - av1_tile_set_col(&tile_info, cm, tile_col); + tile_count++; + av1_tile_set_col(&tile_info, cm, tile_col); -#if CONFIG_DEPENDENT_HORZTILES && CONFIG_TILE_GROUPS - av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col); +#if CONFIG_DEPENDENT_HORZTILES + av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col); #endif - buf->data = dst + total_size; + buf->data = dst + total_size; - // The last tile does not have a header. - if (!is_last_tile) total_size += 4; + // The last tile does not have a header. 
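// Editor's note (not part of the upstream patch): the 4 bytes reserved on the
// next line hold this tile's size as a little-endian word; it is filled in
// further down with mem_put_le32() once the tile has been coded.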
+ if (!is_last_tile) total_size += 4; -#if CONFIG_EC_ADAPT - // Initialise tile context from the frame context - this_tile->tctx = *cm->fc; - cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; -#endif + // Initialise tile context from the frame context + this_tile->tctx = *cm->fc; + cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; #if CONFIG_PVQ - cpi->td.mb.pvq_q = &this_tile->pvq_q; - cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; + cpi->td.mb.pvq_q = &this_tile->pvq_q; + cpi->td.mb.daala_enc.state.adapt = &this_tile->tctx.pvq_context; #endif // CONFIG_PVQ #if CONFIG_ANS - buf_ans_write_init(buf_ans, dst + total_size); - write_modes(cpi, &tile_info, buf_ans, &tok, tok_end); - assert(tok == tok_end); - aom_buf_ans_flush(buf_ans); - tile_size = buf_ans_write_end(buf_ans); + buf_ans_write_init(buf_ans, dst + total_size); + write_modes(cpi, &tile_info, buf_ans, &tok, tok_end); + assert(tok == tok_end); + aom_buf_ans_flush(buf_ans); + tile_size = buf_ans_write_end(buf_ans); #else aom_start_encode(&mode_bc, dst + total_size); write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end); #if !CONFIG_LV_MAP +#if !CONFIG_PVQ assert(tok == tok_end); +#endif // !CONFIG_PVQ #endif // !CONFIG_LV_MAP aom_stop_encode(&mode_bc); tile_size = mode_bc.pos; #endif // CONFIG_ANS #if CONFIG_PVQ - cpi->td.mb.pvq_q = NULL; + cpi->td.mb.pvq_q = NULL; #endif - assert(tile_size > 0); + assert(tile_size > 0); -#if CONFIG_TILE_GROUPS - curr_tg_data_size += tile_size + 4; -#endif - buf->size = tile_size; + curr_tg_data_size += tile_size + 4; + buf->size = tile_size; - if (!is_last_tile) { - *max_tile_size = AOMMAX(*max_tile_size, tile_size); - // size of this tile - mem_put_le32(buf->data, tile_size); - } + if (!is_last_tile) { + *max_tile_size = AOMMAX(*max_tile_size, tile_size); + // size of this tile + mem_put_le32(buf->data, tile_size); + } - total_size += tile_size; + total_size += tile_size; + } + } + // Write the final tile group size + if (n_log2_tiles) { + aom_wb_overwrite_literal(&tg_params_wb, (1 << n_log2_tiles) - tile_count, + n_log2_tiles); + aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, n_log2_tiles); + } + // Remux if possible. TODO (Thomas Davies): do this for more than one tile + // group + if (have_tiles && tg_count == 1) { + int data_size = total_size - (uncompressed_hdr_size + comp_hdr_size); + data_size = remux_tiles(cm, dst + uncompressed_hdr_size + comp_hdr_size, + data_size, *max_tile_size, *max_tile_col_size, + &tile_size_bytes, &tile_col_size_bytes); + total_size = data_size + uncompressed_hdr_size + comp_hdr_size; + aom_wb_overwrite_literal(&tile_size_bytes_wb, tile_size_bytes - 1, 2); } - } -#if CONFIG_TILE_GROUPS - // Write the final tile group size - if (n_log2_tiles) { - aom_wb_overwrite_literal(&tg_params_wb, (1 << n_log2_tiles) - tile_count, - n_log2_tiles); - aom_wb_overwrite_literal(&tg_params_wb, tile_count - 1, n_log2_tiles); - } - // Remux if possible. 
TODO (Thomas Davies): do this for more than one tile - // group - if (have_tiles && tg_count == 1) { - int data_size = total_size - (uncompressed_hdr_size + comp_hdr_size); - data_size = remux_tiles(cm, dst + uncompressed_hdr_size + comp_hdr_size, - data_size, *max_tile_size, *max_tile_col_size, - &tile_size_bytes, &tile_col_size_bytes); - total_size = data_size + uncompressed_hdr_size + comp_hdr_size; - aom_wb_overwrite_literal(&tile_size_bytes_wb, tile_size_bytes - 1, 2); - } -#endif +#if CONFIG_EXT_TILE + } #endif // CONFIG_EXT_TILE return (uint32_t)total_size; } static void write_render_size(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { - const int scaling_active = - cm->width != cm->render_width || cm->height != cm->render_height; + const int scaling_active = !av1_resize_unscaled(cm); aom_wb_write_bit(wb, scaling_active); if (scaling_active) { aom_wb_write_literal(wb, cm->render_width - 1, 16); @@ -4124,11 +4053,10 @@ static void write_render_size(const AV1_COMMON *cm, static void write_superres_scale(const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { // First bit is whether to to scale or not - if (cm->superres_scale_numerator == SUPERRES_SCALE_DENOMINATOR) { + if (cm->superres_scale_numerator == SCALE_DENOMINATOR) { aom_wb_write_bit(wb, 0); // no scaling } else { aom_wb_write_bit(wb, 1); // scaling, write scale factor - // TODO(afergs): write factor to the compressed header instead aom_wb_write_literal( wb, cm->superres_scale_numerator - SUPERRES_SCALE_NUMERATOR_MIN, SUPERRES_SCALE_BITS); @@ -4138,13 +4066,15 @@ static void write_superres_scale(const AV1_COMMON *const cm, static void write_frame_size(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { - aom_wb_write_literal(wb, cm->width - 1, 16); - aom_wb_write_literal(wb, cm->height - 1, 16); - - write_render_size(cm, wb); #if CONFIG_FRAME_SUPERRES + aom_wb_write_literal(wb, cm->superres_upscaled_width - 1, 16); + aom_wb_write_literal(wb, cm->superres_upscaled_height - 1, 16); write_superres_scale(cm, wb); +#else + aom_wb_write_literal(wb, cm->width - 1, 16); + aom_wb_write_literal(wb, cm->height - 1, 16); #endif // CONFIG_FRAME_SUPERRES + write_render_size(cm, wb); } static void write_frame_size_with_refs(AV1_COMP *cpi, @@ -4157,20 +4087,26 @@ static void write_frame_size_with_refs(AV1_COMP *cpi, YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame); if (cfg != NULL) { +#if CONFIG_FRAME_SUPERRES + found = cm->superres_upscaled_width == cfg->y_crop_width && + cm->superres_upscaled_height == cfg->y_crop_height; +#else found = cm->width == cfg->y_crop_width && cm->height == cfg->y_crop_height; +#endif found &= cm->render_width == cfg->render_width && cm->render_height == cfg->render_height; } aom_wb_write_bit(wb, found); if (found) { +#if CONFIG_FRAME_SUPERRES + write_superres_scale(cm, wb); +#endif // CONFIG_FRAME_SUPERRES break; } } - if (!found) { - write_frame_size(cm, wb); - } + if (!found) write_frame_size(cm, wb); } static void write_sync_code(struct aom_write_bit_buffer *wb) { @@ -4196,7 +4132,12 @@ static void write_bitdepth_colorspace_sampling( assert(cm->bit_depth > AOM_BITS_8); aom_wb_write_bit(wb, cm->bit_depth == AOM_BITS_10 ? 0 : 1); } +#if CONFIG_COLORSPACE_HEADERS + aom_wb_write_literal(wb, cm->color_space, 5); + aom_wb_write_literal(wb, cm->transfer_function, 5); +#else aom_wb_write_literal(wb, cm->color_space, 3); +#endif if (cm->color_space != AOM_CS_SRGB) { // 0: [16, 235] (i.e. 
xvYCC), 1: [0, 255] aom_wb_write_bit(wb, cm->color_range); @@ -4208,6 +4149,11 @@ static void write_bitdepth_colorspace_sampling( } else { assert(cm->subsampling_x == 1 && cm->subsampling_y == 1); } +#if CONFIG_COLORSPACE_HEADERS + if (cm->subsampling_x == 1 && cm->subsampling_y == 1) { + aom_wb_write_literal(wb, cm->chroma_sample_position, 2); + } +#endif } else { assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3); aom_wb_write_bit(wb, 0); // unused @@ -4215,9 +4161,17 @@ static void write_bitdepth_colorspace_sampling( } #if CONFIG_REFERENCE_BUFFER -void write_sequence_header(SequenceHeader *seq_params) { +void write_sequence_header( +#if CONFIG_EXT_TILE + AV1_COMMON *const cm, +#endif // CONFIG_EXT_TILE + SequenceHeader *seq_params) { /* Placeholder for actually writing to the bitstream */ - seq_params->frame_id_numbers_present_flag = FRAME_ID_NUMBERS_PRESENT_FLAG; + seq_params->frame_id_numbers_present_flag = +#if CONFIG_EXT_TILE + cm->large_scale_tile ? 0 : +#endif // CONFIG_EXT_TILE + FRAME_ID_NUMBERS_PRESENT_FLAG; seq_params->frame_id_length_minus7 = FRAME_ID_LENGTH_MINUS7; seq_params->delta_frame_id_length_minus2 = DELTA_FRAME_ID_LENGTH_MINUS2; } @@ -4236,7 +4190,11 @@ static void write_compound_tools(const AV1_COMMON *cm, } #endif // CONFIG_INTERINTRA #if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT +#if CONFIG_COMPOUND_SINGLEREF + if (!frame_is_intra_only(cm)) { +#else // !CONFIG_COMPOUND_SINGLEREF if (!frame_is_intra_only(cm) && cm->reference_mode != SINGLE_REFERENCE) { +#endif // CONFIG_COMPOUND_SINGLEREF aom_wb_write_bit(wb, cm->allow_masked_compound); } else { assert(cm->allow_masked_compound == 0); @@ -4252,13 +4210,21 @@ static void write_uncompressed_header(AV1_COMP *cpi, #if CONFIG_REFERENCE_BUFFER /* TODO: Move outside frame loop or inside key-frame branch */ - write_sequence_header(&cpi->seq_params); + write_sequence_header( +#if CONFIG_EXT_TILE + cm, +#endif // CONFIG_EXT_TILE + &cpi->seq_params); #endif aom_wb_write_literal(wb, AOM_FRAME_MARKER, 2); write_profile(cm->profile, wb); +#if CONFIG_EXT_TILE + aom_wb_write_literal(wb, cm->large_scale_tile, 1); +#endif // CONFIG_EXT_TILE + #if CONFIG_EXT_REFS // NOTE: By default all coded frames to be used as a reference cm->is_reference_frame = 1; @@ -4309,11 +4275,6 @@ static void write_uncompressed_header(AV1_COMP *cpi, } #endif -#if CONFIG_FRAME_SUPERRES - // TODO(afergs): Remove - this is just to stop superres from breaking - cm->superres_scale_numerator = SUPERRES_SCALE_DENOMINATOR; -#endif // CONFIG_FRAME_SUPERRES - if (cm->frame_type == KEY_FRAME) { write_sync_code(wb); write_bitdepth_colorspace_sampling(cm, wb); @@ -4447,12 +4408,6 @@ static void write_uncompressed_header(AV1_COMP *cpi, #endif // CONFIG_EXT_PARTITION encode_loopfilter(cm, wb); -#if CONFIG_CDEF - encode_cdef(cm, wb); -#endif -#if CONFIG_LOOP_RESTORATION - encode_restoration_mode(cm, wb); -#endif // CONFIG_LOOP_RESTORATION encode_quantization(cm, wb); encode_segmentation(cm, xd, wb); #if CONFIG_DELTA_Q @@ -4485,8 +4440,15 @@ static void write_uncompressed_header(AV1_COMP *cpi, } } #endif - - write_tx_mode(cm, xd, &cm->tx_mode, wb); +#if CONFIG_CDEF + if (!cm->all_lossless) { + encode_cdef(cm, wb); + } +#endif +#if CONFIG_LOOP_RESTORATION + encode_restoration_mode(cm, wb); +#endif // CONFIG_LOOP_RESTORATION + write_tx_mode(cm, &cm->tx_mode, wb); if (cpi->allow_comp_inter_inter) { const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT; @@ -4513,13 +4475,13 @@ static void write_uncompressed_header(AV1_COMP *cpi, #if 
CONFIG_GLOBAL_MOTION static void write_global_motion_params(WarpedMotionParams *params, WarpedMotionParams *ref_params, - aom_prob *probs, aom_writer *w, - int allow_hp) { + aom_writer *w, int allow_hp) { TransformationType type = params->wmtype; int trans_bits; int trans_prec_diff; - av1_write_token(w, av1_global_motion_types_tree, probs, - &global_motion_types_encodings[type]); + aom_write_bit(w, type != IDENTITY); + if (type != IDENTITY) aom_write_literal(w, type - 1, GLOBAL_TYPE_BITS); + switch (type) { case HOMOGRAPHY: case HORTRAPEZOID: @@ -4584,10 +4546,18 @@ static void write_global_motion_params(WarpedMotionParams *params, static void write_global_motion(AV1_COMP *cpi, aom_writer *w) { AV1_COMMON *const cm = &cpi->common; int frame; + YV12_BUFFER_CONFIG *ref_buf; for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { - write_global_motion_params( - &cm->global_motion[frame], &cm->prev_frame->global_motion[frame], - cm->fc->global_motion_types_prob, w, cm->allow_high_precision_mv); + ref_buf = get_ref_frame_buffer(cpi, frame); + if (cpi->source->y_crop_width == ref_buf->y_crop_width && + cpi->source->y_crop_height == ref_buf->y_crop_height) { + write_global_motion_params(&cm->global_motion[frame], + &cm->prev_frame->global_motion[frame], w, + cm->allow_high_precision_mv); + } else { + assert(cm->global_motion[frame].wmtype == IDENTITY && + "Invalid warp type for frames of different resolutions"); + } /* printf("Frame %d/%d: Enc Ref %d (used %d): %d %d %d %d\n", cm->current_video_frame, cm->show_frame, frame, @@ -4605,15 +4575,17 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) { MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; #endif // CONFIG_SUPERTX FRAME_CONTEXT *const fc = cm->fc; - FRAME_COUNTS *counts = cpi->td.counts; aom_writer *header_bc; - int i, j; + int i; +#if !CONFIG_NEW_MULTISYMBOL + FRAME_COUNTS *counts = cpi->td.counts; + int j; +#endif -#if CONFIG_TILE_GROUPS const int probwt = cm->num_tg; -#else - const int probwt = 1; -#endif + (void)probwt; + (void)i; + (void)fc; #if CONFIG_ANS int header_size; @@ -4628,96 +4600,26 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) { #if CONFIG_LOOP_RESTORATION encode_restoration(cm, header_bc); #endif // CONFIG_LOOP_RESTORATION -#if !CONFIG_EC_ADAPT - update_txfm_probs(cm, header_bc, counts); -#endif -#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) if (cm->tx_mode == TX_MODE_SELECT) av1_cond_prob_diff_update(header_bc, &cm->fc->quarter_tx_size_prob, cm->counts.quarter_tx_size, probwt); -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#endif #if CONFIG_LV_MAP av1_write_txb_probs(cpi, header_bc); -#else -#if !CONFIG_PVQ -#if !CONFIG_EC_ADAPT - update_coef_probs(cpi, header_bc); -#endif // !CONFIG_EC_ADAPT -#endif // CONFIG_PVQ #endif // CONFIG_LV_MAP -#if CONFIG_VAR_TX +#if CONFIG_VAR_TX && !CONFIG_NEW_MULTISYMBOL update_txfm_partition_probs(cm, header_bc, counts, probwt); #endif +#if !CONFIG_NEW_MULTISYMBOL update_skip_probs(cm, header_bc, counts); -#if !CONFIG_EC_ADAPT && CONFIG_DELTA_Q - update_delta_q_probs(cm, header_bc, counts); -#if CONFIG_EXT_DELTA_Q - update_delta_lf_probs(cm, header_bc, counts); -#endif -#endif -#if !CONFIG_EC_ADAPT - update_seg_probs(cpi, header_bc); - - for (i = 0; i < INTRA_MODES; ++i) { - prob_diff_update(av1_intra_mode_tree, fc->uv_mode_prob[i], - counts->uv_mode[i], INTRA_MODES, probwt, header_bc); - } - -#if CONFIG_EXT_PARTITION_TYPES - for (i = 0; i < 
PARTITION_PLOFFSET; ++i) - prob_diff_update(av1_partition_tree, fc->partition_prob[i], - counts->partition[i], PARTITION_TYPES, probwt, header_bc); - for (; i < PARTITION_CONTEXTS_PRIMARY; ++i) - prob_diff_update(av1_ext_partition_tree, fc->partition_prob[i], - counts->partition[i], EXT_PARTITION_TYPES, probwt, - header_bc); -#else - for (i = 0; i < PARTITION_CONTEXTS_PRIMARY; ++i) - prob_diff_update(av1_partition_tree, fc->partition_prob[i], - counts->partition[i], PARTITION_TYPES, probwt, header_bc); -#endif // CONFIG_EXT_PARTITION_TYPES -#if CONFIG_UNPOISON_PARTITION_CTX - for (; i < PARTITION_CONTEXTS_PRIMARY + PARTITION_BLOCK_SIZES; ++i) { - unsigned int ct[2] = { counts->partition[i][PARTITION_VERT], - counts->partition[i][PARTITION_SPLIT] }; - assert(counts->partition[i][PARTITION_NONE] == 0); - assert(counts->partition[i][PARTITION_HORZ] == 0); - assert(fc->partition_prob[i][PARTITION_NONE] == 0); - assert(fc->partition_prob[i][PARTITION_HORZ] == 0); - av1_cond_prob_diff_update(header_bc, &fc->partition_prob[i][PARTITION_VERT], - ct, probwt); - } - for (; i < PARTITION_CONTEXTS_PRIMARY + 2 * PARTITION_BLOCK_SIZES; ++i) { - unsigned int ct[2] = { counts->partition[i][PARTITION_HORZ], - counts->partition[i][PARTITION_SPLIT] }; - assert(counts->partition[i][PARTITION_NONE] == 0); - assert(counts->partition[i][PARTITION_VERT] == 0); - assert(fc->partition_prob[i][PARTITION_NONE] == 0); - assert(fc->partition_prob[i][PARTITION_VERT] == 0); - av1_cond_prob_diff_update(header_bc, &fc->partition_prob[i][PARTITION_HORZ], - ct, probwt); - } #endif -#if CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP - for (i = 0; i < INTRA_FILTERS + 1; ++i) - prob_diff_update(av1_intra_filter_tree, fc->intra_filter_probs[i], - counts->intra_filter[i], INTRA_FILTERS, probwt, header_bc); -#endif // CONFIG_EXT_INTRA && CONFIG_INTRA_INTERP -#endif // !CONFIG_EC_ADAPT if (frame_is_intra_only(cm)) { - av1_copy(cm->kf_y_prob, av1_kf_y_mode_prob); av1_copy(cm->fc->kf_y_cdf, av1_kf_y_mode_cdf); -#if !CONFIG_EC_ADAPT - for (i = 0; i < INTRA_MODES; ++i) - for (j = 0; j < INTRA_MODES; ++j) - prob_diff_update(av1_intra_mode_tree, cm->kf_y_prob[i][j], - counts->kf_y_mode[i][j], INTRA_MODES, probwt, - header_bc); -#endif // CONFIG_EC_ADAPT #if CONFIG_INTRABC if (cm->allow_screen_content_tools) { av1_cond_prob_diff_update(header_bc, &fc->intrabc_prob, @@ -4725,56 +4627,54 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) { } #endif } else { +#if !CONFIG_NEW_MULTISYMBOL update_inter_mode_probs(cm, header_bc, counts); +#endif #if CONFIG_EXT_INTER - update_inter_compound_mode_probs(cm, probwt, header_bc); #if CONFIG_INTERINTRA if (cm->reference_mode != COMPOUND_REFERENCE && cm->allow_interintra_compound) { +#if !CONFIG_NEW_MULTISYMBOL for (i = 0; i < BLOCK_SIZE_GROUPS; i++) { if (is_interintra_allowed_bsize_group(i)) { av1_cond_prob_diff_update(header_bc, &fc->interintra_prob[i], cm->counts.interintra[i], probwt); } } - for (i = 0; i < BLOCK_SIZE_GROUPS; i++) { - prob_diff_update( - av1_interintra_mode_tree, cm->fc->interintra_mode_prob[i], - counts->interintra_mode[i], INTERINTRA_MODES, probwt, header_bc); - } -#if CONFIG_WEDGE - for (i = 0; i < BLOCK_SIZES; i++) { +#endif +#if CONFIG_WEDGE && !CONFIG_NEW_MULTISYMBOL +#if CONFIG_EXT_PARTITION_TYPES + int block_sizes_to_update = BLOCK_SIZES_ALL; +#else + int block_sizes_to_update = BLOCK_SIZES; +#endif + for (i = 0; i < block_sizes_to_update; i++) { if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i)) av1_cond_prob_diff_update(header_bc, 
&fc->wedge_interintra_prob[i], cm->counts.wedge_interintra[i], probwt); } -#endif // CONFIG_WEDGE +#endif // CONFIG_WEDGE && CONFIG_NEW_MULTISYMBOL } #endif // CONFIG_INTERINTRA -#if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE - if (cm->reference_mode != SINGLE_REFERENCE && cm->allow_masked_compound) { - for (i = 0; i < BLOCK_SIZES; i++) - prob_diff_update(av1_compound_type_tree, fc->compound_type_prob[i], - cm->counts.compound_interinter[i], COMPOUND_TYPES, - probwt, header_bc); - } -#endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE #endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i) - prob_diff_update(av1_motion_mode_tree, fc->motion_mode_prob[i], - counts->motion_mode[i], MOTION_MODES, probwt, header_bc); -#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -#if !CONFIG_EC_ADAPT - if (cm->interp_filter == SWITCHABLE) - update_switchable_interp_probs(cm, header_bc, counts); +#if CONFIG_NCOBMC_ADAPT_WEIGHT + for (i = ADAPT_OVERLAP_BLOCK_8X8; i < ADAPT_OVERLAP_BLOCKS; ++i) { + prob_diff_update(av1_ncobmc_mode_tree, fc->ncobmc_mode_prob[i], + counts->ncobmc_mode[i], MAX_NCOBMC_MODES, probwt, + header_bc); + } #endif +#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#if !CONFIG_NEW_MULTISYMBOL for (i = 0; i < INTRA_INTER_CONTEXTS; i++) av1_cond_prob_diff_update(header_bc, &fc->intra_inter_prob[i], counts->intra_inter[i], probwt); +#endif +#if !CONFIG_NEW_MULTISYMBOL if (cpi->allow_comp_inter_inter) { const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT; if (use_hybrid_pred) @@ -4791,7 +4691,19 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) { } } } + if (cm->reference_mode != SINGLE_REFERENCE) { +#if CONFIG_EXT_COMP_REFS + for (i = 0; i < COMP_REF_TYPE_CONTEXTS; i++) + av1_cond_prob_diff_update(header_bc, &fc->comp_ref_type_prob[i], + counts->comp_ref_type[i], probwt); + + for (i = 0; i < UNI_COMP_REF_CONTEXTS; i++) + for (j = 0; j < (UNIDIR_COMP_REFS - 1); j++) + av1_cond_prob_diff_update(header_bc, &fc->uni_comp_ref_prob[i][j], + counts->uni_comp_ref[i][j], probwt); +#endif // CONFIG_EXT_COMP_REFS + for (i = 0; i < REF_CONTEXTS; i++) { #if CONFIG_EXT_REFS for (j = 0; j < (FWD_REFS - 1); j++) { @@ -4810,17 +4722,16 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) { #endif // CONFIG_EXT_REFS } } +#endif // CONFIG_NEW_MULTISYMBOL -#if !CONFIG_EC_ADAPT - for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { - prob_diff_update(av1_intra_mode_tree, cm->fc->y_mode_prob[i], - counts->y_mode[i], INTRA_MODES, probwt, header_bc); - } -#endif +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + for (i = 0; i < COMP_INTER_MODE_CONTEXTS; i++) + av1_cond_prob_diff_update(header_bc, &fc->comp_inter_mode_prob[i], + counts->comp_inter_mode[i], probwt); +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if !CONFIG_NEW_MULTISYMBOL av1_write_nmv_probs(cm, cm->allow_high_precision_mv, header_bc, counts->mv); -#if !CONFIG_EC_ADAPT - update_ext_tx_probs(cm, header_bc); #endif #if CONFIG_SUPERTX if (!xd->lossless[0]) update_supertx_probs(cm, probwt, header_bc); @@ -4829,12 +4740,6 @@ static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) { write_global_motion(cpi, header_bc); #endif // CONFIG_GLOBAL_MOTION } -#if !CONFIG_EC_ADAPT - av1_coef_head_cdfs(fc); - av1_coef_pareto_cdfs(fc); - for (i = 0; i < NMV_CONTEXTS; ++i) av1_set_mv_cdfs(&fc->nmvc[i]); - av1_set_mode_cdfs(cm); -#endif // !CONFIG_EC_ADAPT #if CONFIG_ANS aom_buf_ans_flush(header_bc); header_size = 
buf_ans_write_end(header_bc); @@ -4881,16 +4786,23 @@ static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst, const uint32_t max_tile_col_size, int *const tile_size_bytes, int *const tile_col_size_bytes) { -// Choose the tile size bytes (tsb) and tile column size bytes (tcsb) + // Choose the tile size bytes (tsb) and tile column size bytes (tcsb) + int tsb; + int tcsb; + #if CONFIG_EXT_TILE - // The top bit in the tile size field indicates tile copy mode, so we - // have 1 less bit to code the tile size - const int tsb = choose_size_bytes(max_tile_size, 1); - const int tcsb = choose_size_bytes(max_tile_col_size, 0); -#else - const int tsb = choose_size_bytes(max_tile_size, 0); - const int tcsb = 4; // This is ignored - (void)max_tile_col_size; + if (cm->large_scale_tile) { + // The top bit in the tile size field indicates tile copy mode, so we + // have 1 less bit to code the tile size + tsb = choose_size_bytes(max_tile_size, 1); + tcsb = choose_size_bytes(max_tile_col_size, 0); + } else { +#endif // CONFIG_EXT_TILE + tsb = choose_size_bytes(max_tile_size, 0); + tcsb = 4; // This is ignored + (void)max_tile_col_size; +#if CONFIG_EXT_TILE + } #endif // CONFIG_EXT_TILE assert(tsb > 0); @@ -4906,64 +4818,68 @@ static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst, uint32_t rpos = 0; #if CONFIG_EXT_TILE - int tile_row; - int tile_col; - - for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) { - // All but the last column has a column header - if (tile_col < cm->tile_cols - 1) { - uint32_t tile_col_size = mem_get_le32(dst + rpos); - rpos += 4; + if (cm->large_scale_tile) { + int tile_row; + int tile_col; + + for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) { + // All but the last column has a column header + if (tile_col < cm->tile_cols - 1) { + uint32_t tile_col_size = mem_get_le32(dst + rpos); + rpos += 4; + + // Adjust the tile column size by the number of bytes removed + // from the tile size fields. + tile_col_size -= (4 - tsb) * cm->tile_rows; + + mem_put_varsize(dst + wpos, tcsb, tile_col_size); + wpos += tcsb; + } - // Adjust the tile column size by the number of bytes removed - // from the tile size fields. - tile_col_size -= (4 - tsb) * cm->tile_rows; + for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) { + // All, including the last row has a header + uint32_t tile_header = mem_get_le32(dst + rpos); + rpos += 4; + + // If this is a copy tile, we need to shift the MSB to the + // top bit of the new width, and there is no data to copy. + if (tile_header >> 31 != 0) { + if (tsb < 4) tile_header >>= 32 - 8 * tsb; + mem_put_varsize(dst + wpos, tsb, tile_header); + wpos += tsb; + } else { + mem_put_varsize(dst + wpos, tsb, tile_header); + wpos += tsb; - mem_put_varsize(dst + wpos, tcsb, tile_col_size); - wpos += tcsb; + memmove(dst + wpos, dst + rpos, tile_header); + rpos += tile_header; + wpos += tile_header; + } + } } + } else { +#endif // CONFIG_EXT_TILE + const int n_tiles = cm->tile_cols * cm->tile_rows; + int n; - for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) { - // All, including the last row has a header - uint32_t tile_header = mem_get_le32(dst + rpos); - rpos += 4; + for (n = 0; n < n_tiles; n++) { + int tile_size; - // If this is a copy tile, we need to shift the MSB to the - // top bit of the new width, and there is no data to copy. 
- if (tile_header >> 31 != 0) { - if (tsb < 4) tile_header >>= 32 - 8 * tsb; - mem_put_varsize(dst + wpos, tsb, tile_header); - wpos += tsb; + if (n == n_tiles - 1) { + tile_size = data_size - rpos; } else { - mem_put_varsize(dst + wpos, tsb, tile_header); + tile_size = mem_get_le32(dst + rpos); + rpos += 4; + mem_put_varsize(dst + wpos, tsb, tile_size); wpos += tsb; - - memmove(dst + wpos, dst + rpos, tile_header); - rpos += tile_header; - wpos += tile_header; } - } - } -#else - const int n_tiles = cm->tile_cols * cm->tile_rows; - int n; - for (n = 0; n < n_tiles; n++) { - int tile_size; + memmove(dst + wpos, dst + rpos, tile_size); - if (n == n_tiles - 1) { - tile_size = data_size - rpos; - } else { - tile_size = mem_get_le32(dst + rpos); - rpos += 4; - mem_put_varsize(dst + wpos, tsb, tile_size); - wpos += tsb; + rpos += tile_size; + wpos += tile_size; } - - memmove(dst + wpos, dst + rpos, tile_size); - - rpos += tile_size; - wpos += tile_size; +#if CONFIG_EXT_TILE } #endif // CONFIG_EXT_TILE @@ -4976,14 +4892,17 @@ static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst, void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) { uint8_t *data = dst; -#if !CONFIG_TILE_GROUPS - uint32_t compressed_header_size; + uint32_t data_size; +#if CONFIG_EXT_TILE + AV1_COMMON *const cm = &cpi->common; + uint32_t compressed_header_size = 0; uint32_t uncompressed_header_size; struct aom_write_bit_buffer saved_wb; -#endif - uint32_t data_size; struct aom_write_bit_buffer wb = { data, 0 }; - + const int have_tiles = cm->tile_cols * cm->tile_rows > 1; + int tile_size_bytes; + int tile_col_size_bytes; +#endif // CONFIG_EXT_TILE unsigned int max_tile_size; unsigned int max_tile_col_size; @@ -4991,76 +4910,77 @@ void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) { bitstream_queue_reset_write(); #endif -#if !CONFIG_TILE_GROUPS - int tile_size_bytes; - int tile_col_size_bytes; - AV1_COMMON *const cm = &cpi->common; - const int have_tiles = cm->tile_cols * cm->tile_rows > 1; - - // Write the uncompressed header - write_uncompressed_header(cpi, &wb); +#if CONFIG_EXT_TILE + if (cm->large_scale_tile) { + // Write the uncompressed header + write_uncompressed_header(cpi, &wb); #if CONFIG_EXT_REFS - if (cm->show_existing_frame) { - *size = aom_wb_bytes_written(&wb); - return; - } + if (cm->show_existing_frame) { + *size = aom_wb_bytes_written(&wb); + return; + } #endif // CONFIG_EXT_REFS - // We do not know these in advance. Output placeholder bit. - saved_wb = wb; - // Write tile size magnitudes - if (have_tiles) { -// Note that the last item in the uncompressed header is the data -// describing tile configuration. -#if CONFIG_EXT_TILE - // Number of bytes in tile column size - 1 - aom_wb_write_literal(&wb, 0, 2); -#endif // CONFIG_EXT_TILE - // Number of bytes in tile size - 1 - aom_wb_write_literal(&wb, 0, 2); - } - // Size of compressed header - aom_wb_write_literal(&wb, 0, 16); + // We do not know these in advance. Output placeholder bit. + saved_wb = wb; + // Write tile size magnitudes + if (have_tiles) { + // Note that the last item in the uncompressed header is the data + // describing tile configuration. 
+ // Number of bytes in tile column size - 1 + aom_wb_write_literal(&wb, 0, 2); + + // Number of bytes in tile size - 1 + aom_wb_write_literal(&wb, 0, 2); + } + // Size of compressed header + aom_wb_write_literal(&wb, 0, 16); - uncompressed_header_size = (uint32_t)aom_wb_bytes_written(&wb); - data += uncompressed_header_size; + uncompressed_header_size = (uint32_t)aom_wb_bytes_written(&wb); + data += uncompressed_header_size; - aom_clear_system_state(); + aom_clear_system_state(); - // Write the compressed header - compressed_header_size = write_compressed_header(cpi, data); - data += compressed_header_size; + // Write the compressed header + compressed_header_size = write_compressed_header(cpi, data); + data += compressed_header_size; - // Write the encoded tile data - data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size); -#else - data_size = write_tiles(cpi, &wb, &max_tile_size, &max_tile_col_size); -#endif -#if !CONFIG_TILE_GROUPS - if (have_tiles) { - data_size = - remux_tiles(cm, data, data_size, max_tile_size, max_tile_col_size, - &tile_size_bytes, &tile_col_size_bytes); + // Write the encoded tile data + data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size); + } else { +#endif // CONFIG_EXT_TILE + data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size); +#if CONFIG_EXT_TILE } +#endif // CONFIG_EXT_TILE +#if CONFIG_EXT_TILE + if (cm->large_scale_tile) { + if (have_tiles) { + data_size = + remux_tiles(cm, data, data_size, max_tile_size, max_tile_col_size, + &tile_size_bytes, &tile_col_size_bytes); + } - data += data_size; + data += data_size; - // Now fill in the gaps in the uncompressed header. - if (have_tiles) { -#if CONFIG_EXT_TILE - assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4); - aom_wb_write_literal(&saved_wb, tile_col_size_bytes - 1, 2); + // Now fill in the gaps in the uncompressed header. + if (have_tiles) { + assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4); + aom_wb_write_literal(&saved_wb, tile_col_size_bytes - 1, 2); + + assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); + aom_wb_write_literal(&saved_wb, tile_size_bytes - 1, 2); + } + // TODO(jbb): Figure out what to do if compressed_header_size > 16 bits. + assert(compressed_header_size <= 0xffff); + aom_wb_write_literal(&saved_wb, compressed_header_size, 16); + } else { #endif // CONFIG_EXT_TILE - assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); - aom_wb_write_literal(&saved_wb, tile_size_bytes - 1, 2); + data += data_size; +#if CONFIG_EXT_TILE } - // TODO(jbb): Figure out what to do if compressed_header_size > 16 bits. 
- assert(compressed_header_size <= 0xffff); - aom_wb_write_literal(&saved_wb, compressed_header_size, 16); -#else - data += data_size; -#endif +#endif // CONFIG_EXT_TILE #if CONFIG_ANS && ANS_REVERSE // Avoid aliasing the superframe index *data++ = 0; diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h index c75d80891..29c930356 100644 --- a/third_party/aom/av1/encoder/bitstream.h +++ b/third_party/aom/av1/encoder/bitstream.h @@ -19,7 +19,11 @@ extern "C" { #include "av1/encoder/encoder.h" #if CONFIG_REFERENCE_BUFFER -void write_sequence_header(SequenceHeader *seq_params); +void write_sequence_header( +#if CONFIG_EXT_TILE + AV1_COMMON *const cm, +#endif // CONFIG_EXT_TILE + SequenceHeader *seq_params); #endif void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size); @@ -42,7 +46,8 @@ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, const int supertx_enabled, #endif #if CONFIG_TXK_SEL - int block, int plane, + int blk_row, int blk_col, int block, int plane, + TX_SIZE tx_size, #endif aom_writer *w); diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h index e16479e64..7b6eb0b0e 100644 --- a/third_party/aom/av1/encoder/block.h +++ b/third_party/aom/av1/encoder/block.h @@ -116,7 +116,6 @@ struct macroblock { // The equivalend SAD error of one (whole) bit at the current quantizer // for sub-8x8 blocks. int sadperbit4; - int rddiv; int rdmult; int mb_energy; int *m_search_count_ptr; @@ -206,16 +205,15 @@ struct macroblock { int pvq_speed; int pvq_coded; // Indicates whether pvq_info needs be stored to tokenize #endif -#if CONFIG_DAALA_DIST - // Keep rate of each 4x4 block in the current macroblock during RDO - // This is needed when using the 8x8 Daala distortion metric during RDO, - // because it evaluates distortion in a different order than the underlying - // 4x4 blocks are coded. - int rate_4x4[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; +#if CONFIG_DIST_8X8 #if CONFIG_CB4X4 +#if CONFIG_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, decoded_8x8[8 * 8]); +#else DECLARE_ALIGNED(16, uint8_t, decoded_8x8[8 * 8]); +#endif #endif // CONFIG_CB4X4 -#endif // CONFIG_DAALA_DIST +#endif // CONFIG_DIST_8X8 #if CONFIG_CFL // Whether luma needs to be stored during RDO. 
int cfl_store_y; diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c index 4c7d6ff00..b1c01b28e 100644 --- a/third_party/aom/av1/encoder/context_tree.c +++ b/third_party/aom/av1/encoder/context_tree.c @@ -65,12 +65,10 @@ static void alloc_mode_context(AV1_COMMON *cm, int num_4x4_blk, } #if CONFIG_PALETTE - if (cm->allow_screen_content_tools) { - for (i = 0; i < 2; ++i) { - CHECK_MEM_ERROR( - cm, ctx->color_index_map[i], - aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i]))); - } + for (i = 0; i < 2; ++i) { + CHECK_MEM_ERROR( + cm, ctx->color_index_map[i], + aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i]))); } #endif // CONFIG_PALETTE } @@ -141,7 +139,13 @@ static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree, &tree->verticalb[1]); alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_B, &tree->verticalb[2]); -#ifdef CONFIG_SUPERTX + for (int i = 0; i < 4; ++i) { + alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_4, + &tree->horizontal4[i]); + alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_4, + &tree->vertical4[i]); + } +#if CONFIG_SUPERTX alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ, &tree->horizontal_supertx); alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT, &tree->vertical_supertx); @@ -159,7 +163,7 @@ static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree, alloc_mode_context(cm, num_4x4_blk, &tree->none); alloc_mode_context(cm, num_4x4_blk / 2, &tree->horizontal[0]); alloc_mode_context(cm, num_4x4_blk / 2, &tree->vertical[0]); -#ifdef CONFIG_SUPERTX +#if CONFIG_SUPERTX alloc_mode_context(cm, num_4x4_blk, &tree->horizontal_supertx); alloc_mode_context(cm, num_4x4_blk, &tree->vertical_supertx); alloc_mode_context(cm, num_4x4_blk, &tree->split_supertx); @@ -184,13 +188,17 @@ static void free_tree_contexts(PC_TREE *tree) { free_mode_context(&tree->verticala[i]); free_mode_context(&tree->verticalb[i]); } + for (i = 0; i < 4; ++i) { + free_mode_context(&tree->horizontal4[i]); + free_mode_context(&tree->vertical4[i]); + } #endif // CONFIG_EXT_PARTITION_TYPES free_mode_context(&tree->none); free_mode_context(&tree->horizontal[0]); free_mode_context(&tree->horizontal[1]); free_mode_context(&tree->vertical[0]); free_mode_context(&tree->vertical[1]); -#ifdef CONFIG_SUPERTX +#if CONFIG_SUPERTX free_mode_context(&tree->horizontal_supertx); free_mode_context(&tree->vertical_supertx); free_mode_context(&tree->split_supertx); diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h index 4f9d5e374..bcfcc274a 100644 --- a/third_party/aom/av1/encoder/context_tree.h +++ b/third_party/aom/av1/encoder/context_tree.h @@ -81,12 +81,14 @@ typedef struct PC_TREE { PICK_MODE_CONTEXT horizontalb[3]; PICK_MODE_CONTEXT verticala[3]; PICK_MODE_CONTEXT verticalb[3]; + PICK_MODE_CONTEXT horizontal4[4]; + PICK_MODE_CONTEXT vertical4[4]; #endif union { struct PC_TREE *split[4]; PICK_MODE_CONTEXT *leaf_split[4]; }; -#ifdef CONFIG_SUPERTX +#if CONFIG_SUPERTX PICK_MODE_CONTEXT horizontal_supertx; PICK_MODE_CONTEXT vertical_supertx; PICK_MODE_CONTEXT split_supertx; diff --git a/third_party/aom/av1/encoder/cost.c b/third_party/aom/av1/encoder/cost.c index e3151a597..e33df53e4 100644 --- a/third_party/aom/av1/encoder/cost.c +++ b/third_party/aom/av1/encoder/cost.c @@ -65,3 +65,21 @@ void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree) { costs[-tree[0]] = av1_cost_bit(probs[0], 0); cost(costs, tree, probs, 2, 0); } + +void av1_cost_tokens_from_cdf(int 
*costs, const aom_cdf_prob *cdf, + const int *inv_map) { + int i; + aom_cdf_prob prev_cdf = 0; + for (i = 0;; ++i) { + const aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf; + prev_cdf = AOM_ICDF(cdf[i]); + + if (inv_map) + costs[inv_map[i]] = av1_cost_symbol(p15); + else + costs[i] = av1_cost_symbol(p15); + + // Stop once we reach the end of the CDF + if (cdf[i] == AOM_ICDF(CDF_PROB_TOP)) break; + } +} diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h index d8fb357e6..e60632005 100644 --- a/third_party/aom/av1/encoder/cost.h +++ b/third_party/aom/av1/encoder/cost.h @@ -34,6 +34,14 @@ extern const uint16_t av1_prob_cost[256]; // for each bit. #define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT)) +// Calculate the cost of a symbol with probability p15 / 2^15 +static INLINE int av1_cost_symbol(aom_cdf_prob p15) { + assert(0 < p15 && p15 < CDF_PROB_TOP); + const int shift = CDF_PROB_BITS - 1 - get_msb(p15); + return av1_cost_zero(get_prob(p15 << shift, CDF_PROB_TOP)) + + av1_cost_literal(shift); +} + static INLINE unsigned int cost_branch256(const unsigned int ct[2], aom_prob p) { return ct[0] * av1_cost_zero(p) + ct[1] * av1_cost_one(p); @@ -55,6 +63,8 @@ static INLINE int treed_cost(aom_tree tree, const aom_prob *probs, int bits, void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree); void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree); +void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf, + const int *inv_map); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/dct.c b/third_party/aom/av1/encoder/dct.c index f6b64f0f7..850b84ca9 100644 --- a/third_party/aom/av1/encoder/dct.c +++ b/third_party/aom/av1/encoder/dct.c @@ -21,6 +21,9 @@ #include "av1/common/av1_fwd_txfm1d.h" #include "av1/common/av1_fwd_txfm1d_cfg.h" #include "av1/common/idct.h" +#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 +#include "av1/common/daala_tx.h" +#endif static INLINE void range_check(const tran_low_t *input, const int size, const int bit) { @@ -39,6 +42,18 @@ static INLINE void range_check(const tran_low_t *input, const int size, #endif } +#if CONFIG_DAALA_DCT4 +static void fdct4(const tran_low_t *input, tran_low_t *output) { + int i; + od_coeff x[4]; + od_coeff y[4]; + for (i = 0; i < 4; i++) x[i] = (od_coeff)input[i]; + od_bin_fdct4(y, x, 1); + for (i = 0; i < 4; i++) output[i] = (tran_low_t)y[i]; +} + +#else + static void fdct4(const tran_low_t *input, tran_low_t *output) { tran_high_t temp; tran_low_t step[4]; @@ -74,6 +89,19 @@ static void fdct4(const tran_low_t *input, tran_low_t *output) { range_check(output, 4, 16); } +#endif + +#if CONFIG_DAALA_DCT8 +static void fdct8(const tran_low_t *input, tran_low_t *output) { + int i; + od_coeff x[8]; + od_coeff y[8]; + for (i = 0; i < 8; i++) x[i] = (od_coeff)input[i]; + od_bin_fdct8(y, x, 1); + for (i = 0; i < 8; i++) output[i] = (tran_low_t)y[i]; +} + +#else static void fdct8(const tran_low_t *input, tran_low_t *output) { tran_high_t temp; @@ -152,6 +180,7 @@ static void fdct8(const tran_low_t *input, tran_low_t *output) { range_check(output, 8, 16); } +#endif static void fdct16(const tran_low_t *input, tran_low_t *output) { tran_high_t temp; @@ -767,6 +796,18 @@ static void fadst4(const tran_low_t *input, tran_low_t *output) { output[3] = (tran_low_t)fdct_round_shift(s3); } +#if CONFIG_DAALA_DCT8 +static void fadst8(const tran_low_t *input, tran_low_t *output) { + int i; + od_coeff x[8]; + od_coeff y[8]; + for (i = 0; i < 8; i++) x[i] = 
(od_coeff)input[i]; + od_bin_fdst8(y, x, 1); + for (i = 0; i < 8; i++) output[i] = (tran_low_t)y[i]; +} + +#else + static void fadst8(const tran_low_t *input, tran_low_t *output) { tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; @@ -837,6 +878,7 @@ static void fadst8(const tran_low_t *input, tran_low_t *output) { output[6] = (tran_low_t)x5; output[7] = (tran_low_t)-x1; } +#endif static void fadst16(const tran_low_t *input, tran_low_t *output) { tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; @@ -1021,6 +1063,83 @@ static void fhalfright32(const tran_low_t *input, tran_low_t *output) { // Note overall scaling factor is 4 times orthogonal } +#if CONFIG_MRC_TX +static void get_masked_residual32(const int16_t **input, int *input_stride, + const uint8_t *pred, int pred_stride, + int16_t *masked_input) { + int mrc_mask[32 * 32]; + get_mrc_mask(pred, pred_stride, mrc_mask, 32, 32, 32); + int32_t sum = 0; + int16_t avg; + // Get the masked average of the prediction + for (int i = 0; i < 32; ++i) { + for (int j = 0; j < 32; ++j) { + sum += mrc_mask[i * 32 + j] * (*input)[i * (*input_stride) + j]; + } + } + avg = ROUND_POWER_OF_TWO_SIGNED(sum, 10); + // Replace all of the unmasked pixels in the prediction with the average + // of the masked pixels + for (int i = 0; i < 32; ++i) { + for (int j = 0; j < 32; ++j) + masked_input[i * 32 + j] = + (mrc_mask[i * 32 + j]) ? (*input)[i * (*input_stride) + j] : avg; + } + *input = masked_input; + *input_stride = 32; +} +#endif // CONFIG_MRC_TX + +#if CONFIG_LGT +static void flgt4(const tran_low_t *input, tran_low_t *output, + const tran_high_t *lgtmtx) { + if (!(input[0] | input[1] | input[2] | input[3])) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + // evaluate s[j] = sum of all lgtmtx[j][i]*input[i] over i=1,...,4 + tran_high_t s[4] = { 0 }; + for (int i = 0; i < 4; ++i) + for (int j = 0; j < 4; ++j) s[j] += lgtmtx[j * 4 + i] * input[i]; + + for (int i = 0; i < 4; ++i) output[i] = (tran_low_t)fdct_round_shift(s[i]); +} + +static void flgt8(const tran_low_t *input, tran_low_t *output, + const tran_high_t *lgtmtx) { + // evaluate s[j] = sum of all lgtmtx[j][i]*input[i] over i=1,...,8 + tran_high_t s[8] = { 0 }; + for (int i = 0; i < 8; ++i) + for (int j = 0; j < 8; ++j) s[j] += lgtmtx[j * 8 + i] * input[i]; + + for (int i = 0; i < 8; ++i) output[i] = (tran_low_t)fdct_round_shift(s[i]); +} + +// The get_fwd_lgt functions return 1 if LGT is chosen to apply, and 0 otherwise +int get_fwd_lgt4(transform_1d tx_orig, TxfmParam *txfm_param, + const tran_high_t *lgtmtx[], int ntx) { + // inter/intra split + if (tx_orig == &fadst4) { + for (int i = 0; i < ntx; ++i) + lgtmtx[i] = txfm_param->is_inter ? &lgt4_170[0][0] : &lgt4_140[0][0]; + return 1; + } + return 0; +} + +int get_fwd_lgt8(transform_1d tx_orig, TxfmParam *txfm_param, + const tran_high_t *lgtmtx[], int ntx) { + // inter/intra split + if (tx_orig == &fadst8) { + for (int i = 0; i < ntx; ++i) + lgtmtx[i] = txfm_param->is_inter ? &lgt8_170[0][0] : &lgt8_150[0][0]; + return 1; + } + return 0; +} +#endif // CONFIG_LGT + #if CONFIG_EXT_TX // TODO(sarahparker) these functions will be removed once the highbitdepth // codepath works properly for rectangular transforms. They have almost @@ -1028,13 +1147,24 @@ static void fhalfright32(const tran_low_t *input, tran_low_t *output) { // being used for square transforms. 
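Editor's note (not part of the upstream patch): the flgt4()/flgt8() helpers added above reduce to an N x N matrix-vector product followed by the usual forward-DCT rounding shift. A minimal, self-contained sketch of that 1-D step follows; the names and the DCT_CONST_BITS value of 14 are illustrative assumptions, not code from this commit.

#include <stdint.h>

#define SKETCH_DCT_CONST_BITS 14 /* assumed to match libaom's DCT_CONST_BITS */

/* Round an accumulated product and shift it back to coefficient range. */
static int32_t sketch_fdct_round_shift(int64_t v) {
  return (int32_t)((v + (1 << (SKETCH_DCT_CONST_BITS - 1))) >>
                   SKETCH_DCT_CONST_BITS);
}

/* out[j] = round_shift(sum_i mtx[j * n + i] * in[i]) for j = 0..n-1, i.e. the
 * same per-row/per-column operation performed by flgt4()/flgt8(). */
static void sketch_flgt_n(const int16_t *in, int32_t *out, const int32_t *mtx,
                          int n) {
  for (int j = 0; j < n; ++j) {
    int64_t s = 0;
    for (int i = 0; i < n; ++i) s += (int64_t)mtx[j * n + i] * in[i];
    out[j] = sketch_fdct_round_shift(s);
  }
}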
static void fidtx4(const tran_low_t *input, tran_low_t *output) { int i; - for (i = 0; i < 4; ++i) + for (i = 0; i < 4; ++i) { +#if CONFIG_DAALA_DCT4 + output[i] = input[i]; +#else output[i] = (tran_low_t)fdct_round_shift(input[i] * Sqrt2); +#endif + } } static void fidtx8(const tran_low_t *input, tran_low_t *output) { int i; - for (i = 0; i < 8; ++i) output[i] = input[i] * 2; + for (i = 0; i < 8; ++i) { +#if CONFIG_DAALA_DCT8 + output[i] = input[i]; +#else + output[i] = input[i] * 2; +#endif + } } static void fidtx16(const tran_low_t *input, tran_low_t *output) { @@ -1110,6 +1240,9 @@ static void copy_fliplrud(const int16_t *src, int src_stride, int l, int w, static void maybe_flip_input(const int16_t **src, int *src_stride, int l, int w, int16_t *buff, int tx_type) { switch (tx_type) { +#if CONFIG_MRC_TX + case MRC_DCT: +#endif // CONFIG_MRC_TX case DCT_DCT: case ADST_DCT: case DCT_ADST: @@ -1144,10 +1277,21 @@ static void maybe_flip_input(const int16_t **src, int *src_stride, int l, int w, #endif // CONFIG_EXT_TX void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif +#if CONFIG_DCT_ONLY + assert(tx_type == DCT_DCT); +#endif +#if !CONFIG_DAALA_DCT4 if (tx_type == DCT_DCT) { aom_fdct4x4_c(input, output, stride); - } else { + return; + } +#endif + { static const transform_2d FHT[] = { { fdct4, fdct4 }, // DCT_DCT { fadst4, fdct4 }, // ADST_DCT @@ -1166,7 +1310,7 @@ void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, { fidtx4, fadst4 }, // H_ADST { fadst4, fidtx4 }, // V_FLIPADST { fidtx4, fadst4 }, // H_FLIPADST -#endif // CONFIG_EXT_TX +#endif }; const transform_2d ht = FHT[tx_type]; tran_low_t out[4 * 4]; @@ -1178,25 +1322,60 @@ void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, maybe_flip_input(&input, &stride, 4, 4, flipped_input, tx_type); #endif +#if CONFIG_LGT + // Choose LGT adaptive to the prediction. We may apply different LGTs for + // different rows/columns, indicated by the pointers to 2D arrays + const tran_high_t *lgtmtx_col[4]; + const tran_high_t *lgtmtx_row[4]; + int use_lgt_col = get_fwd_lgt4(ht.cols, txfm_param, lgtmtx_col, 4); + int use_lgt_row = get_fwd_lgt4(ht.rows, txfm_param, lgtmtx_row, 4); +#endif + // Columns for (i = 0; i < 4; ++i) { + /* A C99-safe upshift by 4 for both Daala and VPx TX. */ for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16; +#if !CONFIG_DAALA_DCT4 if (i == 0 && temp_in[0]) temp_in[0] += 1; - ht.cols(temp_in, temp_out); +#endif +#if CONFIG_LGT + if (use_lgt_col) + flgt4(temp_in, temp_out, lgtmtx_col[i]); + else +#endif + ht.cols(temp_in, temp_out); for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j]; } // Rows for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4]; - ht.rows(temp_in, temp_out); +#if CONFIG_LGT + if (use_lgt_row) + flgt4(temp_in, temp_out, lgtmtx_row[i]); + else +#endif + ht.rows(temp_in, temp_out); +#if CONFIG_DAALA_DCT4 + /* Daala TX has orthonormal scaling; shift down by only 1 to achieve + the usual VPx coefficient left-shift of 3. 
*/ + for (j = 0; j < 4; ++j) output[j + i * 4] = temp_out[j] >> 1; +#else for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2; +#endif } } } void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif // CONFIG_MRC_TX +#if CONFIG_DCT_ONLY + assert(tx_type == DCT_DCT); +#endif static const transform_2d FHT[] = { { fdct8, fdct4 }, // DCT_DCT { fadst8, fdct4 }, // ADST_DCT @@ -1228,19 +1407,36 @@ void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride, maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type); #endif +#if CONFIG_LGT + const tran_high_t *lgtmtx_col[4]; + const tran_high_t *lgtmtx_row[8]; + int use_lgt_col = get_fwd_lgt8(ht.cols, txfm_param, lgtmtx_col, 4); + int use_lgt_row = get_fwd_lgt4(ht.rows, txfm_param, lgtmtx_row, 8); +#endif + // Rows for (i = 0; i < n2; ++i) { for (j = 0; j < n; ++j) temp_in[j] = (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2); - ht.rows(temp_in, temp_out); +#if CONFIG_LGT + if (use_lgt_row) + flgt4(temp_in, temp_out, lgtmtx_row[i]); + else +#endif + ht.rows(temp_in, temp_out); for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j]; } // Columns for (i = 0; i < n; ++i) { for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; - ht.cols(temp_in, temp_out); +#if CONFIG_LGT + if (use_lgt_col) + flgt8(temp_in, temp_out, lgtmtx_col[i]); + else +#endif + ht.cols(temp_in, temp_out); for (j = 0; j < n2; ++j) output[i + j * n] = (temp_out[j] + (temp_out[j] < 0)) >> 1; } @@ -1248,7 +1444,14 @@ void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride, } void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif // CONFIG_MRC_TX +#if CONFIG_DCT_ONLY + assert(tx_type == DCT_DCT); +#endif static const transform_2d FHT[] = { { fdct4, fdct8 }, // DCT_DCT { fadst4, fdct8 }, // ADST_DCT @@ -1280,19 +1483,36 @@ void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride, maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type); #endif +#if CONFIG_LGT + const tran_high_t *lgtmtx_col[8]; + const tran_high_t *lgtmtx_row[4]; + int use_lgt_col = get_fwd_lgt4(ht.cols, txfm_param, lgtmtx_col, 8); + int use_lgt_row = get_fwd_lgt8(ht.rows, txfm_param, lgtmtx_row, 4); +#endif + // Columns for (i = 0; i < n2; ++i) { for (j = 0; j < n; ++j) temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2); - ht.cols(temp_in, temp_out); +#if CONFIG_LGT + if (use_lgt_col) + flgt4(temp_in, temp_out, lgtmtx_col[i]); + else +#endif + ht.cols(temp_in, temp_out); for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j]; } // Rows for (i = 0; i < n; ++i) { for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; - ht.rows(temp_in, temp_out); +#if CONFIG_LGT + if (use_lgt_row) + flgt8(temp_in, temp_out, lgtmtx_row[i]); + else +#endif + ht.rows(temp_in, temp_out); for (j = 0; j < n2; ++j) output[j + i * n2] = (temp_out[j] + (temp_out[j] < 0)) >> 1; } @@ -1300,7 +1520,14 @@ void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride, } void av1_fht4x16_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT 
&& "Invalid tx type for tx size"); +#endif // CONFIG_MRC_TX +#if CONFIG_DCT_ONLY + assert(tx_type == DCT_DCT); +#endif static const transform_2d FHT[] = { { fdct16, fdct4 }, // DCT_DCT { fadst16, fdct4 }, // ADST_DCT @@ -1332,10 +1559,20 @@ void av1_fht4x16_c(const int16_t *input, tran_low_t *output, int stride, maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type); #endif +#if CONFIG_LGT + const tran_high_t *lgtmtx_row[16]; + int use_lgt_row = get_fwd_lgt4(ht.rows, txfm_param, lgtmtx_row, 16); +#endif + // Rows for (i = 0; i < n4; ++i) { for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4; - ht.rows(temp_in, temp_out); +#if CONFIG_LGT + if (use_lgt_row) + flgt4(temp_in, temp_out, lgtmtx_row[i]); + else +#endif + ht.rows(temp_in, temp_out); for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j]; } @@ -1350,7 +1587,14 @@ void av1_fht4x16_c(const int16_t *input, tran_low_t *output, int stride, } void av1_fht16x4_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif // CONFIG_MRC_TX +#if CONFIG_DCT_ONLY + assert(tx_type == DCT_DCT); +#endif static const transform_2d FHT[] = { { fdct4, fdct16 }, // DCT_DCT { fadst4, fdct16 }, // ADST_DCT @@ -1382,10 +1626,20 @@ void av1_fht16x4_c(const int16_t *input, tran_low_t *output, int stride, maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type); #endif +#if CONFIG_LGT + const tran_high_t *lgtmtx_col[16]; + int use_lgt_col = get_fwd_lgt4(ht.cols, txfm_param, lgtmtx_col, 16); +#endif + // Columns for (i = 0; i < n4; ++i) { for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4; - ht.cols(temp_in, temp_out); +#if CONFIG_LGT + if (use_lgt_col) + flgt4(temp_in, temp_out, lgtmtx_col[i]); + else +#endif + ht.cols(temp_in, temp_out); for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j]; } @@ -1400,7 +1654,14 @@ void av1_fht16x4_c(const int16_t *input, tran_low_t *output, int stride, } void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif // CONFIG_MRC_TX +#if CONFIG_DCT_ONLY + assert(tx_type == DCT_DCT); +#endif static const transform_2d FHT[] = { { fdct16, fdct8 }, // DCT_DCT { fadst16, fdct8 }, // ADST_DCT @@ -1432,12 +1693,22 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride, maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type); #endif +#if CONFIG_LGT + const tran_high_t *lgtmtx_row[16]; + int use_lgt_row = get_fwd_lgt8(ht.rows, txfm_param, lgtmtx_row, 16); +#endif + // Rows for (i = 0; i < n2; ++i) { for (j = 0; j < n; ++j) temp_in[j] = (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2); - ht.rows(temp_in, temp_out); +#if CONFIG_LGT + if (use_lgt_row) + flgt8(temp_in, temp_out, lgtmtx_row[i]); + else +#endif + ht.rows(temp_in, temp_out); for (j = 0; j < n; ++j) out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); } @@ -1452,7 +1723,14 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride, } void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif // CONFIG_MRC_TX +#if CONFIG_DCT_ONLY + 
assert(tx_type == DCT_DCT); +#endif static const transform_2d FHT[] = { { fdct8, fdct16 }, // DCT_DCT { fadst8, fdct16 }, // ADST_DCT @@ -1484,12 +1762,22 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride, maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type); #endif +#if CONFIG_LGT + const tran_high_t *lgtmtx_col[16]; + int use_lgt_col = get_fwd_lgt8(ht.cols, txfm_param, lgtmtx_col, 16); +#endif + // Columns for (i = 0; i < n2; ++i) { for (j = 0; j < n; ++j) temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2); - ht.cols(temp_in, temp_out); +#if CONFIG_LGT + if (use_lgt_col) + flgt8(temp_in, temp_out, lgtmtx_col[i]); + else +#endif + ht.cols(temp_in, temp_out); for (j = 0; j < n; ++j) out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); } @@ -1504,7 +1792,14 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride, } void av1_fht8x32_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif // CONFIG_MRC_TX +#if CONFIG_DCT_ONLY + assert(tx_type == DCT_DCT); +#endif static const transform_2d FHT[] = { { fdct32, fdct8 }, // DCT_DCT { fhalfright32, fdct8 }, // ADST_DCT @@ -1536,10 +1831,20 @@ void av1_fht8x32_c(const int16_t *input, tran_low_t *output, int stride, maybe_flip_input(&input, &stride, n4, n, flipped_input, tx_type); #endif +#if CONFIG_LGT + const tran_high_t *lgtmtx_row[32]; + int use_lgt_row = get_fwd_lgt8(ht.rows, txfm_param, lgtmtx_row, 32); +#endif + // Rows for (i = 0; i < n4; ++i) { for (j = 0; j < n; ++j) temp_in[j] = input[i * stride + j] * 4; - ht.rows(temp_in, temp_out); +#if CONFIG_LGT + if (use_lgt_row) + flgt8(temp_in, temp_out, lgtmtx_row[i]); + else +#endif + ht.rows(temp_in, temp_out); for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j]; } @@ -1554,7 +1859,14 @@ void av1_fht8x32_c(const int16_t *input, tran_low_t *output, int stride, } void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif // CONFIG_MRC_TX +#if CONFIG_DCT_ONLY + assert(tx_type == DCT_DCT); +#endif static const transform_2d FHT[] = { { fdct8, fdct32 }, // DCT_DCT { fadst8, fdct32 }, // ADST_DCT @@ -1586,10 +1898,20 @@ void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride, maybe_flip_input(&input, &stride, n, n4, flipped_input, tx_type); #endif +#if CONFIG_LGT + const tran_high_t *lgtmtx_col[32]; + int use_lgt_col = get_fwd_lgt8(ht.cols, txfm_param, lgtmtx_col, 32); +#endif + // Columns for (i = 0; i < n4; ++i) { for (j = 0; j < n; ++j) temp_in[j] = input[j * stride + i] * 4; - ht.cols(temp_in, temp_out); +#if CONFIG_LGT + if (use_lgt_col) + flgt8(temp_in, temp_out, lgtmtx_col[i]); + else +#endif + ht.cols(temp_in, temp_out); for (j = 0; j < n; ++j) out[j * n4 + i] = temp_out[j]; } @@ -1604,7 +1926,14 @@ void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride, } void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif // CONFIG_MRC_TX +#if CONFIG_DCT_ONLY + assert(tx_type == DCT_DCT); +#endif static const transform_2d FHT[] = { { fdct32, 
fdct16 }, // DCT_DCT { fhalfright32, fdct16 }, // ADST_DCT @@ -1656,7 +1985,14 @@ void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride, } void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif // CONFIG_MRC_TX +#if CONFIG_DCT_ONLY + assert(tx_type == DCT_DCT); +#endif static const transform_2d FHT[] = { { fdct16, fdct32 }, // DCT_DCT { fadst16, fdct32 }, // ADST_DCT @@ -1833,10 +2169,21 @@ void av1_fdct8x8_quant_c(const int16_t *input, int stride, } void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif // CONFIG_MRC_TX +#if CONFIG_DCT_ONLY + assert(tx_type == DCT_DCT); +#endif +#if !CONFIG_DAALA_DCT8 if (tx_type == DCT_DCT) { aom_fdct8x8_c(input, output, stride); - } else { + return; + } +#endif + { static const transform_2d FHT[] = { { fdct8, fdct8 }, // DCT_DCT { fadst8, fdct8 }, // ADST_DCT @@ -1855,7 +2202,7 @@ void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, { fidtx8, fadst8 }, // H_ADST { fadst8, fidtx8 }, // V_FLIPADST { fidtx8, fadst8 }, // H_FLIPADST -#endif // CONFIG_EXT_TX +#endif }; const transform_2d ht = FHT[tx_type]; tran_low_t out[64]; @@ -1867,19 +2214,45 @@ void av1_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, maybe_flip_input(&input, &stride, 8, 8, flipped_input, tx_type); #endif +#if CONFIG_LGT + const tran_high_t *lgtmtx_col[8]; + const tran_high_t *lgtmtx_row[8]; + int use_lgt_col = get_fwd_lgt8(ht.cols, txfm_param, lgtmtx_col, 8); + int use_lgt_row = get_fwd_lgt8(ht.rows, txfm_param, lgtmtx_row, 8); +#endif + // Columns for (i = 0; i < 8; ++i) { +#if CONFIG_DAALA_DCT8 + for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 16; +#else for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4; - ht.cols(temp_in, temp_out); +#endif +#if CONFIG_LGT + if (use_lgt_col) + flgt8(temp_in, temp_out, lgtmtx_col[i]); + else +#endif + ht.cols(temp_in, temp_out); for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j]; } // Rows for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8]; - ht.rows(temp_in, temp_out); +#if CONFIG_LGT + if (use_lgt_row) + flgt8(temp_in, temp_out, lgtmtx_row[i]); + else +#endif + ht.rows(temp_in, temp_out); +#if CONFIG_DAALA_DCT8 + for (j = 0; j < 8; ++j) + output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; +#else for (j = 0; j < 8; ++j) output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; +#endif } } } @@ -1941,7 +2314,14 @@ void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { } void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif // CONFIG_MRC_TX +#if CONFIG_DCT_ONLY + assert(tx_type == DCT_DCT); +#endif static const transform_2d FHT[] = { { fdct16, fdct16 }, // DCT_DCT { fadst16, fdct16 }, // ADST_DCT @@ -1960,9 +2340,8 @@ void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, { fidtx16, fadst16 }, // H_ADST { fadst16, fidtx16 }, // V_FLIPADST { fidtx16, fadst16 }, // H_FLIPADST -#endif // CONFIG_EXT_TX +#endif }; - const transform_2d ht = 
FHT[tx_type]; tran_low_t out[256]; int i, j; @@ -1989,80 +2368,17 @@ void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, } } -#if CONFIG_HIGHBITDEPTH -void av1_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { - av1_fht4x4_c(input, output, stride, tx_type); -} - -void av1_highbd_fht4x8_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { - av1_fht4x8_c(input, output, stride, tx_type); -} - -void av1_highbd_fht8x4_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { - av1_fht8x4_c(input, output, stride, tx_type); -} - -void av1_highbd_fht8x16_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { - av1_fht8x16_c(input, output, stride, tx_type); -} - -void av1_highbd_fht16x8_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { - av1_fht16x8_c(input, output, stride, tx_type); -} - -void av1_highbd_fht16x32_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { - av1_fht16x32_c(input, output, stride, tx_type); -} - -void av1_highbd_fht32x16_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { - av1_fht32x16_c(input, output, stride, tx_type); -} - -void av1_highbd_fht4x16_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { - av1_fht4x16_c(input, output, stride, tx_type); -} - -void av1_highbd_fht16x4_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { - av1_fht16x4_c(input, output, stride, tx_type); -} - -void av1_highbd_fht8x32_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { - av1_fht8x32_c(input, output, stride, tx_type); -} - -void av1_highbd_fht32x8_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { - av1_fht32x8_c(input, output, stride, tx_type); -} - -void av1_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { - av1_fht8x8_c(input, output, stride, tx_type); -} - void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { av1_fwht4x4_c(input, output, stride); } -void av1_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { - av1_fht16x16_c(input, output, stride, tx_type); -} -#endif // CONFIG_HIGHBITDEPTH - void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { + int tx_type = txfm_param->tx_type; +#if CONFIG_DCT_ONLY + assert(tx_type == DCT_DCT); +#endif static const transform_2d FHT[] = { { fdct32, fdct32 }, // DCT_DCT #if CONFIG_EXT_TX @@ -2082,6 +2398,9 @@ void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride, { fhalfright32, fidtx32 }, // V_FLIPADST { fidtx32, fhalfright32 }, // H_FLIPADST #endif +#if CONFIG_MRC_TX + { fdct32, fdct32 }, // MRC_TX +#endif // CONFIG_MRC_TX }; const transform_2d ht = FHT[tx_type]; tran_low_t out[1024]; @@ -2093,6 +2412,14 @@ void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride, maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type); #endif +#if CONFIG_MRC_TX + if (tx_type == MRC_DCT) { + int16_t masked_input[32 * 32]; + get_masked_residual32(&input, &stride, txfm_param->dst, txfm_param->stride, + masked_input); + } +#endif // CONFIG_MRC_TX + // Columns for (i = 0; i < 32; ++i) { for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; @@ -2150,7 +2477,14 @@ static void fdct64_row(const tran_low_t *input, tran_low_t *output) { } void av1_fht64x64_c(const int16_t *input, tran_low_t 
*output, int stride, - int tx_type) { + TxfmParam *txfm_param) { + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif // CONFIG_MRC_TX +#if CONFIG_DCT_ONLY + assert(tx_type == DCT_DCT); +#endif static const transform_2d FHT[] = { { fdct64_col, fdct64_row }, // DCT_DCT #if CONFIG_EXT_TX @@ -2179,6 +2513,7 @@ void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride, int16_t flipped_input[64 * 64]; maybe_flip_input(&input, &stride, 64, 64, flipped_input, tx_type); #endif + // Columns for (i = 0; i < 64; ++i) { for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i]; @@ -2214,20 +2549,6 @@ void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride, } #endif // CONFIG_EXT_TX -#if CONFIG_HIGHBITDEPTH -void av1_highbd_fht32x32_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { - av1_fht32x32_c(input, output, stride, tx_type); -} - -#if CONFIG_TX64X64 -void av1_highbd_fht64x64_c(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { - av1_fht64x64_c(input, output, stride, tx_type); -} -#endif // CONFIG_TX64X64 -#endif // CONFIG_HIGHBITDEPTH - #if CONFIG_DPCM_INTRA void av1_dpcm_ft4_c(const int16_t *input, int stride, TX_TYPE_1D tx_type, tran_low_t *output) { @@ -2271,5 +2592,54 @@ void av1_dpcm_ft32_c(const int16_t *input, int stride, TX_TYPE_1D tx_type, for (int i = 0; i < 32; ++i) temp_in[i] = input[i * stride]; ft(temp_in, output); } + +#if CONFIG_HIGHBITDEPTH +void av1_hbd_dpcm_ft4_c(const int16_t *input, int stride, TX_TYPE_1D tx_type, + tran_low_t *output, int dir) { + (void)dir; + assert(tx_type < TX_TYPES_1D); + static const transform_1d FHT[] = { fdct4, fadst4, fadst4, fidtx4 }; + const transform_1d ft = FHT[tx_type]; + tran_low_t temp_in[4]; + for (int i = 0; i < 4; ++i) + temp_in[i] = (tran_low_t)fdct_round_shift(input[i * stride] * 4 * Sqrt2); + ft(temp_in, output); +} + +void av1_hbd_dpcm_ft8_c(const int16_t *input, int stride, TX_TYPE_1D tx_type, + tran_low_t *output, int dir) { + (void)dir; + assert(tx_type < TX_TYPES_1D); + static const transform_1d FHT[] = { fdct8, fadst8, fadst8, fidtx8 }; + const transform_1d ft = FHT[tx_type]; + tran_low_t temp_in[8]; + for (int i = 0; i < 8; ++i) temp_in[i] = input[i * stride] * 4; + ft(temp_in, output); +} + +void av1_hbd_dpcm_ft16_c(const int16_t *input, int stride, TX_TYPE_1D tx_type, + tran_low_t *output, int dir) { + (void)dir; + assert(tx_type < TX_TYPES_1D); + static const transform_1d FHT[] = { fdct16, fadst16, fadst16, fidtx16 }; + const transform_1d ft = FHT[tx_type]; + tran_low_t temp_in[16]; + for (int i = 0; i < 16; ++i) + temp_in[i] = (tran_low_t)fdct_round_shift(input[i * stride] * 2 * Sqrt2); + ft(temp_in, output); +} + +void av1_hbd_dpcm_ft32_c(const int16_t *input, int stride, TX_TYPE_1D tx_type, + tran_low_t *output, int dir) { + (void)dir; + assert(tx_type < TX_TYPES_1D); + static const transform_1d FHT[] = { fdct32, fhalfright32, fhalfright32, + fidtx32 }; + const transform_1d ft = FHT[tx_type]; + tran_low_t temp_in[32]; + for (int i = 0; i < 32; ++i) temp_in[i] = input[i * stride]; + ft(temp_in, output); +} +#endif // CONFIG_HIGHBITDEPTH #endif // CONFIG_DPCM_INTRA #endif // !AV1_DCT_GTEST diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c index 36d09c02a..d13eb42fb 100644 --- a/third_party/aom/av1/encoder/encodeframe.c +++ b/third_party/aom/av1/encoder/encodeframe.c @@ -84,7 +84,7 @@ static void predict_superblock(const AV1_COMP 
*const cpi, ThreadData *td, #if CONFIG_EXT_INTER int mi_row_ori, int mi_col_ori, #endif // CONFIG_EXT_INTER - int mi_row_pred, int mi_col_pred, + int mi_row_pred, int mi_col_pred, int plane, BLOCK_SIZE bsize_pred, int b_sub8x8, int block); static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size, PC_TREE *pc_tree); @@ -308,7 +308,6 @@ static void set_offsets_without_segment_id(const AV1_COMP *const cpi, av1_setup_src_planes(x, cpi->source, mi_row, mi_col); // R/D setup. - x->rddiv = cpi->rd.RDDIV; x->rdmult = cpi->rd.RDMULT; // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs() @@ -326,6 +325,10 @@ static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile, set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); mbmi = &xd->mi[0]->mbmi; +#if CONFIG_CFL + xd->cfl->mi_row = mi_row; + xd->cfl->mi_col = mi_col; +#endif // Setup segment ID. if (seg->enabled) { @@ -413,7 +416,6 @@ static void set_offsets_extend(const AV1_COMP *const cpi, ThreadData *td, xd->left_available = (mi_col_ori > tile->mi_col_start); // R/D setup. - x->rddiv = cpi->rd.RDDIV; x->rdmult = cpi->rd.RDMULT; } @@ -539,6 +541,21 @@ static void set_ref_and_pred_mvs(MACROBLOCK *const x, int_mv *const mi_pred_mv, mbmi->pred_mv[1] = this_mv; mi_pred_mv[1] = this_mv; } +#if CONFIG_COMPOUND_SINGLEREF + } else if (is_inter_singleref_comp_mode(mbmi->mode)) { + // Special case: SR_NEAR_NEWMV uses 1 + mbmi->ref_mv_idx + // (like NEARMV) instead + if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx += 1; + + if (compound_ref0_mode(mbmi->mode) == NEWMV || + compound_ref1_mode(mbmi->mode) == NEWMV) { + int_mv this_mv = curr_ref_mv_stack[ref_mv_idx].this_mv; + clamp_mv_ref(&this_mv.as_mv, bw, bh, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; + mbmi->pred_mv[0] = this_mv; + mi_pred_mv[0] = this_mv; + } +#endif // CONFIG_COMPOUND_SINGLEREF } else { #endif // CONFIG_EXT_INTER if (mbmi->mode == NEWMV) { @@ -635,7 +652,6 @@ static void update_state(const AV1_COMP *const cpi, ThreadData *td, #if CONFIG_PALETTE for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; #endif // CONFIG_PALETTE - // Restore the coding context of the MB to that that was in place // when the mode was picked for it for (y = 0; y < mi_height; y++) @@ -814,7 +830,6 @@ static void update_state_supertx(const AV1_COMP *const cpi, ThreadData *td, } mi_addr->mbmi.segment_id_supertx = MAX_SEGMENTS; } - // Restore the coding context of the MB to that that was in place // when the mode was picked for it for (y = 0; y < mi_height; y++) @@ -1147,7 +1162,7 @@ static void update_supertx_param_sb(const AV1_COMP *const cpi, ThreadData *td, } #endif // CONFIG_SUPERTX -#if CONFIG_MOTION_VAR && CONFIG_NCOBMC +#if CONFIG_MOTION_VAR && (CONFIG_NCOBMC || CONFIG_NCOBMC_ADAPT_WEIGHT) static void set_mode_info_b(const AV1_COMP *const cpi, const TileInfo *const tile, ThreadData *td, int mi_row, int mi_col, BLOCK_SIZE bsize, @@ -1167,6 +1182,7 @@ static void set_mode_info_sb(const AV1_COMP *const cpi, ThreadData *td, BLOCK_SIZE subsize = get_subsize(bsize, partition); #if CONFIG_EXT_PARTITION_TYPES const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); + const int quarter_step = mi_size_wide[bsize] / 4; #endif #if CONFIG_CB4X4 const int unify_bsize = 1; @@ -1245,6 +1261,24 @@ static void set_mode_info_sb(const AV1_COMP *const cpi, ThreadData *td, set_mode_info_b(cpi, tile, td, mi_row + hbs, mi_col + hbs, bsize2, &pc_tree->verticalb[2]); break; + case PARTITION_HORZ_4: + for (int i = 0; i < 4; ++i) { + int 
this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= cm->mi_rows) break; + + set_mode_info_b(cpi, tile, td, this_mi_row, mi_col, subsize, + &pc_tree->horizontal4[i]); + } + break; + case PARTITION_VERT_4: + for (int i = 0; i < 4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= cm->mi_cols) break; + + set_mode_info_b(cpi, tile, td, mi_row, this_mi_col, subsize, + &pc_tree->vertical4[i]); + } + break; #endif // CONFIG_EXT_PARTITION_TYPES default: assert(0 && "Invalid partition type."); break; } @@ -1281,10 +1315,10 @@ static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, return av1_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q); } -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 -static void daala_dist_set_sub8x8_dst(MACROBLOCK *const x, uint8_t *dst8x8, - BLOCK_SIZE bsize, int bw, int bh, - int mi_row, int mi_col) { +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 +static void dist_8x8_set_sub8x8_dst(MACROBLOCK *const x, uint8_t *dst8x8, + BLOCK_SIZE bsize, int bw, int bh, + int mi_row, int mi_col) { MACROBLOCKD *const xd = &x->e_mbd; struct macroblockd_plane *const pd = &xd->plane[0]; const int dst_stride = pd->dst.stride; @@ -1294,12 +1328,24 @@ static void daala_dist_set_sub8x8_dst(MACROBLOCK *const x, uint8_t *dst8x8, if (bsize < BLOCK_8X8) { int i, j; - uint8_t *dst_sub8x8 = &dst8x8[((mi_row & 1) * 8 + (mi_col & 1)) << 2]; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *dst8x8_16 = (uint16_t *)dst8x8; + uint16_t *dst_sub8x8 = &dst8x8_16[((mi_row & 1) * 8 + (mi_col & 1)) << 2]; - for (j = 0; j < bh; ++j) - for (i = 0; i < bw; ++i) { - dst_sub8x8[j * 8 + i] = dst[j * dst_stride + i]; - } + for (j = 0; j < bh; ++j) + for (i = 0; i < bw; ++i) + dst_sub8x8[j * 8 + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i]; + } else { +#endif + uint8_t *dst_sub8x8 = &dst8x8[((mi_row & 1) * 8 + (mi_col & 1)) << 2]; + + for (j = 0; j < bh; ++j) + for (i = 0; i < bw; ++i) + dst_sub8x8[j * 8 + i] = dst[j * dst_stride + i]; +#if CONFIG_HIGHBITDEPTH + } +#endif } } #endif @@ -1330,10 +1376,6 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, x->pvq_speed = 1; x->pvq_coded = 0; #endif -#if CONFIG_CFL - // Don't store luma during RDO (we will store the best mode later). - x->cfl_store_y = 0; -#endif set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); mbmi = &xd->mi[0]->mbmi; @@ -1342,6 +1384,10 @@ static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data, mbmi->mi_row = mi_row; mbmi->mi_col = mi_col; #endif +#if CONFIG_CFL + // Don't store luma during RDO. Only store luma when best luma is known + x->cfl_store_y = 0; +#endif #if CONFIG_SUPERTX // We set tx_size here as skip blocks would otherwise not set it. 
// tx_size needs to be set at this point as supertx_enable in @@ -1542,6 +1588,7 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, #endif if (!frame_is_intra_only(cm)) { FRAME_COUNTS *const counts = td->counts; + RD_COUNTS *rdc = &td->rd_counts; const int inter_block = is_inter_block(mbmi); const int seg_ref_active = segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); @@ -1560,6 +1607,12 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, #endif // CONFIG_EXT_REFS if (cm->reference_mode == REFERENCE_MODE_SELECT) { + if (has_second_ref(mbmi)) + // This flag is also updated for 4x4 blocks + rdc->compound_ref_used_flag = 1; + else + // This flag is also updated for 4x4 blocks + rdc->single_ref_used_flag = 1; #if !SUB8X8_COMP_REF if (mbmi->sb_type != BLOCK_4X4) counts->comp_inter[av1_get_reference_mode_context(cm, xd)] @@ -1571,24 +1624,53 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, } if (has_second_ref(mbmi)) { +#if CONFIG_EXT_COMP_REFS + const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) + ? UNIDIR_COMP_REFERENCE + : BIDIR_COMP_REFERENCE; +#if !USE_UNI_COMP_REFS + // TODO(zoeliu): Temporarily turn off uni-directional comp refs + assert(comp_ref_type == BIDIR_COMP_REFERENCE); +#endif // !USE_UNI_COMP_REFS + counts->comp_ref_type[av1_get_comp_reference_type_context(xd)] + [comp_ref_type]++; + + if (comp_ref_type == UNIDIR_COMP_REFERENCE) { + const int bit = (ref0 == BWDREF_FRAME); + counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0] + [bit]++; + if (!bit) { + const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME); + counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1] + [bit1]++; + if (bit1) { + counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)] + [2][ref1 == GOLDEN_FRAME]++; + } + } + } else { +#endif // CONFIG_EXT_COMP_REFS #if CONFIG_EXT_REFS - const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME); + const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME); - counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0][bit]++; - if (!bit) { - counts->comp_ref[av1_get_pred_context_comp_ref_p1(cm, xd)][1] - [ref0 == LAST_FRAME]++; - } else { - counts->comp_ref[av1_get_pred_context_comp_ref_p2(cm, xd)][2] - [ref0 == GOLDEN_FRAME]++; - } + counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0][bit]++; + if (!bit) { + counts->comp_ref[av1_get_pred_context_comp_ref_p1(cm, xd)][1] + [ref0 == LAST_FRAME]++; + } else { + counts->comp_ref[av1_get_pred_context_comp_ref_p2(cm, xd)][2] + [ref0 == GOLDEN_FRAME]++; + } - counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(cm, xd)][0] - [ref1 == ALTREF_FRAME]++; -#else + counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(cm, xd)][0] + [ref1 == ALTREF_FRAME]++; +#else // !CONFIG_EXT_REFS counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0] [ref0 == GOLDEN_FRAME]++; #endif // CONFIG_EXT_REFS +#if CONFIG_EXT_COMP_REFS + } +#endif // CONFIG_EXT_COMP_REFS } else { #if CONFIG_EXT_REFS const int bit = (ref0 == ALTREF_FRAME || ref0 == BWDREF_FRAME); @@ -1609,7 +1691,7 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, [ref0 != LAST3_FRAME]++; } } -#else +#else // !CONFIG_EXT_REFS counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0] [ref0 != LAST_FRAME]++; if (ref0 != LAST_FRAME) { @@ -1619,7 +1701,14 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, #endif // CONFIG_EXT_REFS } 
-#if CONFIG_EXT_INTER && CONFIG_INTERINTRA +#if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) + counts->comp_inter_mode[av1_get_inter_mode_context(xd)] + [is_inter_singleref_comp_mode(mbmi->mode)]++; +#endif // CONFIG_COMPOUND_SINGLEREF + +#if CONFIG_INTERINTRA if (cm->reference_mode != COMPOUND_REFERENCE && #if CONFIG_SUPERTX !supertx_enabled && @@ -1635,14 +1724,33 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, counts->interintra[bsize_group][0]++; } } -#endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA +#endif // CONFIG_INTERINTRA +#endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#if CONFIG_WARPED_MOTION + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); +#endif +#if CONFIG_NCOBMC_ADAPT_WEIGHT + const MOTION_MODE motion_allowed = + motion_mode_allowed_wrapper(0, +#if CONFIG_GLOBAL_MOTION + 0, xd->global_motion, +#endif // CONFIG_GLOBAL_MOTION +#if CONFIG_WARPED_MOTION + xd, +#endif + mi); +#else const MOTION_MODE motion_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION +#if CONFIG_GLOBAL_MOTION 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION +#endif // CONFIG_GLOBAL_MOTION +#if CONFIG_WARPED_MOTION + xd, +#endif mi); +#endif // CONFIG_NCOBMC_ADAPT_WEIGHT #if CONFIG_SUPERTX if (!supertx_enabled) #endif // CONFIG_SUPERTX @@ -1660,11 +1768,28 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, if (motion_allowed > SIMPLE_TRANSLATION) counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++; #endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION + +#if CONFIG_NCOBMC_ADAPT_WEIGHT + if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT) { + ADAPT_OVERLAP_BLOCK ao_block = + adapt_overlap_block_lookup[mbmi->sb_type]; + ++counts->ncobmc_mode[ao_block][mbmi->ncobmc_mode[0]]; + if (mi_size_wide[mbmi->sb_type] != mi_size_high[mbmi->sb_type]) { + ++counts->ncobmc_mode[ao_block][mbmi->ncobmc_mode[1]]; + } + } +#endif + #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION #if CONFIG_EXT_INTER - if (cm->reference_mode != SINGLE_REFERENCE && + if ( +#if CONFIG_COMPOUND_SINGLEREF + is_inter_anyref_comp_mode(mbmi->mode) +#else // !CONFIG_COMPOUND_SINGLEREF + cm->reference_mode != SINGLE_REFERENCE && is_inter_compound_mode(mbmi->mode) +#endif // CONFIG_COMPOUND_SINGLEREF #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION && mbmi->motion_mode == SIMPLE_TRANSLATION #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION @@ -1683,6 +1808,12 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, if (has_second_ref(mbmi)) { mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)]; +#if CONFIG_COMPOUND_SINGLEREF + } else if (is_inter_singleref_comp_mode(mode)) { + mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]; + ++counts->inter_singleref_comp_mode[mode_ctx] + [INTER_SINGLEREF_COMP_OFFSET(mode)]; +#endif // CONFIG_COMPOUND_SINGLEREF } else { #endif // CONFIG_EXT_INTER mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, @@ -1693,10 +1824,15 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row, #endif // CONFIG_EXT_INTER #if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SINGLEREF + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV || + mbmi->mode == SR_NEW_NEWMV) { +#else // !CONFIG_COMPOUND_SINGLEREF if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { -#else +#endif // 
CONFIG_COMPOUND_SINGLEREF +#else // !CONFIG_EXT_INTER if (mbmi->mode == NEWMV) { -#endif +#endif // CONFIG_EXT_INTER uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); int idx; @@ -1871,10 +2007,16 @@ static void encode_b(const AV1_COMP *const cpi, const TileInfo *const tile, update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run); #if CONFIG_MOTION_VAR && CONFIG_NCOBMC mbmi = &xd->mi[0]->mbmi; +#if CONFIG_WARPED_MOTION + set_ref_ptrs(&cpi->common, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); +#endif const MOTION_MODE motion_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION +#if CONFIG_GLOBAL_MOTION 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION +#endif // CONFIG_GLOBAL_MOTION +#if CONFIG_WARPED_MOTION + xd, +#endif xd->mi[0]); check_ncobmc = is_inter_block(mbmi) && motion_allowed >= OBMC_CAUSAL; if (!dry_run && check_ncobmc) { @@ -1922,6 +2064,8 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td, const BLOCK_SIZE subsize = get_subsize(bsize, partition); #if CONFIG_EXT_PARTITION_TYPES const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT); + int quarter_step = mi_size_wide[bsize] / 4; + int i; #endif #if CONFIG_CB4X4 @@ -1933,6 +2077,11 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; +#if CONFIG_SPEED_REFS + // First scanning pass of an SB is dry run only. + if (cpi->sb_scanning_pass_idx == 0) assert(dry_run == DRY_RUN_NORMAL); +#endif // CONFIG_SPEED_REFS + if (!dry_run && ctx >= 0) td->counts->partition[ctx][partition]++; #if CONFIG_SUPERTX @@ -2115,6 +2264,24 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td, encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2, partition, &pc_tree->verticalb[2], rate); break; + case PARTITION_HORZ_4: + for (i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= cm->mi_rows) break; + + encode_b(cpi, tile, td, tp, this_mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->horizontal4[i], rate); + } + break; + case PARTITION_VERT_4: + for (i = 0; i < 4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= cm->mi_cols) break; + + encode_b(cpi, tile, td, tp, mi_row, this_mi_col, dry_run, subsize, + partition, &pc_tree->vertical4[i], rate); + } + break; #endif // CONFIG_EXT_PARTITION_TYPES default: assert(0 && "Invalid partition type."); break; } @@ -2302,8 +2469,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, if (none_rdc.rate < INT_MAX) { none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; - none_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, none_rdc.rate, none_rdc.dist); + none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); #if CONFIG_SUPERTX none_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE]; #endif @@ -2473,7 +2639,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, case PARTITION_VERT_A: case PARTITION_VERT_B: case PARTITION_HORZ_A: - case PARTITION_HORZ_B: assert(0 && "Cannot handle extended partiton types"); + case PARTITION_HORZ_B: + case PARTITION_HORZ_4: + case PARTITION_VERT_4: assert(0 && "Cannot handle extended partiton types"); #endif // CONFIG_EXT_PARTITION_TYPES default: assert(0); break; } @@ -2481,7 +2649,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, if (last_part_rdc.rate < INT_MAX) { last_part_rdc.rate += cpi->partition_cost[pl][partition]; last_part_rdc.rdcost = - 
RDCOST(x->rdmult, x->rddiv, last_part_rdc.rate, last_part_rdc.dist); + RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist); #if CONFIG_SUPERTX last_part_rate_nocoef += cpi->partition_cost[pl][partition]; #endif @@ -2565,8 +2733,7 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, } if (chosen_rdc.rate < INT_MAX) { chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT]; - chosen_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, chosen_rdc.rate, chosen_rdc.dist); + chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist); #if CONFIG_SUPERTX chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE]; #endif @@ -2624,8 +2791,8 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, } /* clang-format off */ -static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = { -#if CONFIG_CB4X4 +static const BLOCK_SIZE min_partition_size[BLOCK_SIZES_ALL] = { +#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 BLOCK_2X2, BLOCK_2X2, BLOCK_2X2, // 2x2, 2x4, 4x2 #endif BLOCK_4X4, // 4x4 @@ -2634,12 +2801,14 @@ static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = { BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 16x32, 32x16, 32x32 BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 32x64, 64x32, 64x64 #if CONFIG_EXT_PARTITION - BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 // 64x128, 128x64, 128x128 + BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 64x128, 128x64, 128x128 #endif // CONFIG_EXT_PARTITION + BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x16, 16x4, 8x32 + BLOCK_8X8 // 32x8 }; -static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = { -#if CONFIG_CB4X4 +static const BLOCK_SIZE max_partition_size[BLOCK_SIZES_ALL] = { +#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 2x2, 2x4, 4x2 #endif BLOCK_8X8, // 4x4 @@ -2648,13 +2817,15 @@ static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = { BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, // 16x32, 32x16, 32x32 BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 32x64, 64x32, 64x64 #if CONFIG_EXT_PARTITION - BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST // 64x128, 128x64, 128x128 + BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 64x128, 128x64, 128x128 #endif // CONFIG_EXT_PARTITION + BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 4x16, 16x4, 8x32 + BLOCK_32X32 // 32x8 }; // Next square block size less or equal than current block size. 
-static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = { -#if CONFIG_CB4X4 +static const BLOCK_SIZE next_square_size[BLOCK_SIZES_ALL] = { +#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 BLOCK_2X2, BLOCK_2X2, BLOCK_2X2, // 2x2, 2x4, 4x2 #endif BLOCK_4X4, // 4x4 @@ -2663,8 +2834,10 @@ static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = { BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 16x32, 32x16, 32x32 BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, // 32x64, 64x32, 64x64 #if CONFIG_EXT_PARTITION - BLOCK_64X64, BLOCK_64X64, BLOCK_128X128 // 64x128, 128x64, 128x128 + BLOCK_64X64, BLOCK_64X64, BLOCK_128X128, // 64x128, 128x64, 128x128 #endif // CONFIG_EXT_PARTITION + BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x16, 16x4, 8x32 + BLOCK_8X8 // 32x8 }; /* clang-format on */ @@ -3055,8 +3228,7 @@ static void rd_test_partition3( cm->fc->supertx_prob[partition_supertx_context_lookup[partition]] [supertx_size], 0); - sum_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { TX_TYPE best_tx = DCT_DCT; @@ -3071,8 +3243,7 @@ static void rd_test_partition3( cm->fc->supertx_prob[partition_supertx_context_lookup[partition]] [supertx_size], 1); - tmp_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist); + tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); if (tmp_rdc.rdcost < sum_rdc.rdcost) { sum_rdc = tmp_rdc; update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, @@ -3091,8 +3262,7 @@ static void rd_test_partition3( #endif bsize); sum_rdc.rate += cpi->partition_cost[pl][partition]; - sum_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); #if CONFIG_SUPERTX sum_rate_nocoef += cpi->partition_cost[pl][partition]; #endif @@ -3161,7 +3331,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, #if CONFIG_SUPERTX int this_rate_nocoef, sum_rate_nocoef = 0, best_rate_nocoef = INT_MAX; int abort_flag; - const int supertx_allowed = !frame_is_intra_only(cm) && + const int supertx_allowed = !frame_is_intra_only(cm) && bsize >= BLOCK_8X8 && bsize <= MAX_SUPERTX_BLOCK_SIZE && !xd->lossless[0]; #endif // CONFIG_SUPERTX @@ -3341,6 +3511,17 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, } #endif +#if CONFIG_SPEED_REFS + if (cpi->sb_scanning_pass_idx == 0) { + // NOTE: For the 1st pass of scanning, check all the subblocks of equal size + // only. 
+ partition_none_allowed = (bsize == MIN_SPEED_REFS_BLKSIZE); + partition_horz_allowed = 0; + partition_vert_allowed = 0; + do_square_split = (bsize > MIN_SPEED_REFS_BLKSIZE); + } +#endif // CONFIG_SPEED_REFS + // PARTITION_NONE if (partition_none_allowed) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, @@ -3354,8 +3535,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, if (this_rdc.rate != INT_MAX) { if (bsize_at_least_8x8) { this_rdc.rate += partition_cost[PARTITION_NONE]; - this_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); + this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); #if CONFIG_SUPERTX this_rate_nocoef += partition_cost[PARTITION_NONE]; #endif @@ -3494,8 +3674,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, cm->fc->supertx_prob[partition_supertx_context_lookup [PARTITION_SPLIT]][supertx_size], 0); - sum_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (is_inter_mode(pc_tree->leaf_split[0]->mic.mbmi.mode)) { TX_TYPE best_tx = DCT_DCT; @@ -3512,8 +3691,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, cm->fc->supertx_prob[partition_supertx_context_lookup [PARTITION_SPLIT]][supertx_size], 1); - tmp_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist); + tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); if (tmp_rdc.rdcost < sum_rdc.rdcost) { sum_rdc = tmp_rdc; update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, @@ -3551,6 +3729,11 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, &this_rdc, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[idx]); #endif // CONFIG_SUPERTX +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if (bsize == BLOCK_8X8 && this_rdc.rate != INT_MAX) { + assert(this_rdc.dist_y < INT64_MAX); + } +#endif if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; #if CONFIG_SUPERTX @@ -3564,28 +3747,40 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, #if CONFIG_SUPERTX sum_rate_nocoef += this_rate_nocoef; #endif // CONFIG_SUPERTX -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 - sum_rdc.dist_y += this_rdc.dist_y; +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if (bsize == BLOCK_8X8) { + assert(this_rdc.dist_y < INT64_MAX); + sum_rdc.dist_y += this_rdc.dist_y; + } #endif } } reached_last_index = (idx == 4); -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 if (reached_last_index && sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { - int use_activity_masking = 0; - int64_t daala_dist; + int64_t dist_8x8; const int src_stride = x->plane[0].src.stride; - daala_dist = av1_daala_dist(x->plane[0].src.buf - 4 * src_stride - 4, - src_stride, x->decoded_8x8, 8, 8, 8, 1, - use_activity_masking, x->qindex) - << 4; - sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + daala_dist; - sum_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + uint8_t *decoded_8x8; + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8); + else +#endif + decoded_8x8 = (uint8_t *)x->decoded_8x8; + + dist_8x8 = + av1_dist_8x8(cpi, xd, x->plane[0].src.buf - 4 * src_stride - 4, + src_stride, decoded_8x8, 8, BLOCK_8X8, 8, 8, 8, 8, + x->qindex) + << 4; + assert(sum_rdc.dist_y < INT64_MAX); + sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + dist_8x8; + sum_rdc.rdcost = RDCOST(x->rdmult, 
sum_rdc.rate, sum_rdc.dist); } -#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4 +#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 #if CONFIG_SUPERTX if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && reached_last_index) { @@ -3598,8 +3793,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, cm->fc->supertx_prob[partition_supertx_context_lookup [PARTITION_SPLIT]][supertx_size], 0); - sum_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { TX_TYPE best_tx = DCT_DCT; @@ -3616,8 +3810,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, cm->fc->supertx_prob[partition_supertx_context_lookup [PARTITION_SPLIT]][supertx_size], 1); - tmp_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist); + tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); if (tmp_rdc.rdcost < sum_rdc.rdcost) { sum_rdc = tmp_rdc; update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, @@ -3632,7 +3825,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) { sum_rdc.rate += partition_cost[PARTITION_SPLIT]; - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); #if CONFIG_SUPERTX sum_rate_nocoef += partition_cost[PARTITION_SPLIT]; #endif // CONFIG_SUPERTX @@ -3725,14 +3918,14 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, best_rdc.rdcost - sum_rdc.rdcost); #endif // CONFIG_SUPERTX -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 if (this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) { update_state(cpi, td, &pc_tree->horizontal[1], mi_row + mi_step, mi_col, subsize, DRY_RUN_NORMAL); encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row + mi_step, mi_col, subsize, NULL); } -#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4 +#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; @@ -3746,24 +3939,31 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, #if CONFIG_SUPERTX sum_rate_nocoef += this_rate_nocoef; #endif // CONFIG_SUPERTX -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 sum_rdc.dist_y += this_rdc.dist_y; #endif } -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 if (sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { - int use_activity_masking = 0; - int64_t daala_dist; + int64_t dist_8x8; const int src_stride = x->plane[0].src.stride; - daala_dist = av1_daala_dist(x->plane[0].src.buf - 4 * src_stride, - src_stride, x->decoded_8x8, 8, 8, 8, 1, - use_activity_masking, x->qindex) - << 4; - sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + daala_dist; - sum_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + uint8_t *decoded_8x8; + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8); + else +#endif + decoded_8x8 = (uint8_t *)x->decoded_8x8; + + dist_8x8 = av1_dist_8x8(cpi, xd, x->plane[0].src.buf - 4 * src_stride, + src_stride, decoded_8x8, 8, BLOCK_8X8, 8, 8, 8, + 8, x->qindex) + << 4; + sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + dist_8x8; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); } -#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4 +#endif // 
CONFIG_DIST_8X8 && CONFIG_CB4X4 } #if CONFIG_SUPERTX @@ -3777,7 +3977,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]] [supertx_size], 0); - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { TX_TYPE best_tx = DCT_DCT; @@ -3795,8 +3995,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, ->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]] [supertx_size], 1); - tmp_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist); + tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); if (tmp_rdc.rdcost < sum_rdc.rdcost) { sum_rdc = tmp_rdc; update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, @@ -3810,7 +4009,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, if (sum_rdc.rdcost < best_rdc.rdcost) { sum_rdc.rate += partition_cost[PARTITION_HORZ]; - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); #if CONFIG_SUPERTX sum_rate_nocoef += partition_cost[PARTITION_HORZ]; #endif // CONFIG_SUPERTX @@ -3899,14 +4098,14 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, best_rdc.rdcost - sum_rdc.rdcost); #endif // CONFIG_SUPERTX -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 if (this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) { update_state(cpi, td, &pc_tree->vertical[1], mi_row, mi_col + mi_step, subsize, DRY_RUN_NORMAL); encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col + mi_step, subsize, NULL); } -#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4 +#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; @@ -3920,24 +4119,31 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, #if CONFIG_SUPERTX sum_rate_nocoef += this_rate_nocoef; #endif // CONFIG_SUPERTX -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 sum_rdc.dist_y += this_rdc.dist_y; #endif } -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 if (sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) { - int use_activity_masking = 0; - int64_t daala_dist; + int64_t dist_8x8; const int src_stride = x->plane[0].src.stride; - daala_dist = - av1_daala_dist(x->plane[0].src.buf - 4, src_stride, x->decoded_8x8, - 8, 8, 8, 1, use_activity_masking, x->qindex) + uint8_t *decoded_8x8; + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + decoded_8x8 = CONVERT_TO_BYTEPTR(x->decoded_8x8); + else +#endif + decoded_8x8 = (uint8_t *)x->decoded_8x8; + + dist_8x8 = + av1_dist_8x8(cpi, xd, x->plane[0].src.buf - 4, src_stride, + decoded_8x8, 8, BLOCK_8X8, 8, 8, 8, 8, x->qindex) << 4; - sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + daala_dist; - sum_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + sum_rdc.dist = sum_rdc.dist - sum_rdc.dist_y + dist_8x8; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); } -#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4 +#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 } #if CONFIG_SUPERTX if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) { @@ -3950,7 +4156,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, 
cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]] [supertx_size], 0); - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) { TX_TYPE best_tx = DCT_DCT; @@ -3968,8 +4174,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, ->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]] [supertx_size], 1); - tmp_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist); + tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); if (tmp_rdc.rdcost < sum_rdc.rdcost) { sum_rdc = tmp_rdc; update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx, @@ -3983,7 +4188,7 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, if (sum_rdc.rdcost < best_rdc.rdcost) { sum_rdc.rate += partition_cost[PARTITION_VERT]; - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); #if CONFIG_SUPERTX sum_rate_nocoef += partition_cost[PARTITION_VERT]; #endif // CONFIG_SUPERTX @@ -4060,14 +4265,139 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, bsize2, mi_row + mi_step, mi_col + mi_step, bsize2); restore_context(x, &x_ctx, mi_row, mi_col, bsize); } + + // PARTITION_HORZ_4 + // TODO(david.barker): For this and PARTITION_VERT_4, + // * Add support for BLOCK_16X16 once we support 2x8 and 8x2 blocks for the + // chroma plane + // * Add support for supertx + if (bsize == BLOCK_32X32 && partition_horz_allowed && !force_horz_split && + (do_rectangular_split || av1_active_h_edge(cpi, mi_row, mi_step))) { + int i; + const int quarter_step = mi_size_high[bsize] / 4; + PICK_MODE_CONTEXT *ctx_prev = ctx_none; + + subsize = get_subsize(bsize, PARTITION_HORZ_4); + av1_zero(sum_rdc); + + for (i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + + if (i > 0 && this_mi_row >= cm->mi_rows) break; + + if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_prev); + + ctx_prev = &pc_tree->horizontal4[i]; + + rd_pick_sb_modes(cpi, tile_data, x, this_mi_row, mi_col, &this_rdc, + PARTITION_HORZ_4, subsize, ctx_prev, + best_rdc.rdcost - sum_rdc.rdcost); + + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + break; + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; + } + + if (sum_rdc.rdcost >= best_rdc.rdcost) break; + + if (i < 3) { + update_state(cpi, td, ctx_prev, this_mi_row, mi_col, subsize, 1); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, this_mi_row, mi_col, + subsize, NULL); + } + } + + if (sum_rdc.rdcost < best_rdc.rdcost) { + sum_rdc.rate += partition_cost[PARTITION_HORZ_4]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; + pc_tree->partitioning = PARTITION_HORZ_4; + } + } +#if !CONFIG_PVQ + restore_context(x, &x_ctx, mi_row, mi_col, bsize); +#else + restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); +#endif + } + // PARTITION_VERT_4 + if (bsize == BLOCK_32X32 && partition_vert_allowed && !force_vert_split && + (do_rectangular_split || av1_active_v_edge(cpi, mi_row, mi_step))) { + int i; + const int quarter_step = mi_size_wide[bsize] / 4; + PICK_MODE_CONTEXT *ctx_prev = ctx_none; + + subsize = get_subsize(bsize, PARTITION_VERT_4); + av1_zero(sum_rdc); + + for (i = 0; i < 4; ++i) { + 
int this_mi_col = mi_col + i * quarter_step; + + if (i > 0 && this_mi_col >= cm->mi_cols) break; + + if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_prev); + + ctx_prev = &pc_tree->vertical4[i]; + + rd_pick_sb_modes(cpi, tile_data, x, mi_row, this_mi_col, &this_rdc, + PARTITION_VERT_4, subsize, ctx_prev, + best_rdc.rdcost - sum_rdc.rdcost); + + if (this_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; + } + + if (sum_rdc.rdcost >= best_rdc.rdcost) break; + + if (i < 3) { + update_state(cpi, td, ctx_prev, mi_row, this_mi_col, subsize, 1); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, this_mi_col, + subsize, NULL); + } + } + + if (sum_rdc.rdcost < best_rdc.rdcost) { + sum_rdc.rate += partition_cost[PARTITION_VERT_4]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; + pc_tree->partitioning = PARTITION_VERT_4; + } + } +#if !CONFIG_PVQ + restore_context(x, &x_ctx, mi_row, mi_col, bsize); +#else + restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize); +#endif + } #endif // CONFIG_EXT_PARTITION_TYPES +#if CONFIG_SPEED_REFS + // First scanning is done. + if (cpi->sb_scanning_pass_idx == 0 && bsize == cm->sb_size) return; +#endif // CONFIG_SPEED_REFS + // TODO(jbb): This code added so that we avoid static analysis // warning related to the fact that best_rd isn't used after this // point. This code should be refactored so that the duplicate // checks occur in some sub function and thus are used... (void)best_rd; *rd_cost = best_rdc; + +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if (bsize <= BLOCK_8X8 && rd_cost->rate != INT_MAX) { + assert(rd_cost->dist_y < INT64_MAX); + } +#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 #if CONFIG_SUPERTX *rate_nocoef = best_rate_nocoef; #endif // CONFIG_SUPERTX @@ -4093,13 +4423,13 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, x->cfl_store_y = 0; #endif -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) { encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, pc_tree, NULL); } -#endif // CONFIG_DAALA_DIST && CONFIG_CB4X4 +#endif // CONFIG_DIST_8X8 && CONFIG_CB4X4 if (bsize == cm->sb_size) { #if !CONFIG_PVQ && !CONFIG_LV_MAP @@ -4112,6 +4442,22 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td, } } +#if CONFIG_SPEED_REFS +static void restore_mi(const AV1_COMP *const cpi, MACROBLOCK *const x, + int mi_row, int mi_col) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); + int x_idx, y; + for (y = 0; y < mi_size_high[cm->sb_size]; y++) + for (x_idx = 0; x_idx < mi_size_wide[cm->sb_size]; x_idx++) + if (mi_col + x_idx < cm->mi_cols && mi_row + y < cm->mi_rows) { + memset(xd->mi + y * cm->mi_stride + x_idx, 0, sizeof(*xd->mi)); + memset(x->mbmi_ext + y * cm->mi_cols + x_idx, 0, sizeof(*x->mbmi_ext)); + } +} +#endif // CONFIG_SPEED_REFS + static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row, TOKENEXTRA **tp) { @@ -4157,8 +4503,6 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, MODE_INFO **mi = cm->mi_grid_visible + idx_str; PC_TREE *const pc_root = td->pc_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2]; - 
av1_update_boundary_info(cm, tile_info, mi_row, mi_col); - if (sf->adaptive_pred_interp_filter) { for (i = 0; i < leaf_nodes; ++i) td->leaf_tree[i].pred_interp_filter = SWITCHABLE; @@ -4258,12 +4602,35 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col, &x->min_partition_size, &x->max_partition_size); } +#if CONFIG_SPEED_REFS + // NOTE: Two scanning passes for the current superblock - the first pass + // is only targeted to collect stats. + int m_search_count_backup = *(x->m_search_count_ptr); + for (int sb_pass_idx = 0; sb_pass_idx < 2; ++sb_pass_idx) { + cpi->sb_scanning_pass_idx = sb_pass_idx; + if (frame_is_intra_only(cm) && sb_pass_idx == 0) continue; + + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size, + &dummy_rdc, +#if CONFIG_SUPERTX + &dummy_rate_nocoef, +#endif // CONFIG_SUPERTX + INT64_MAX, pc_root); + if (sb_pass_idx == 0) { + av1_zero(x->pred_mv); + pc_root->index = 0; + restore_mi(cpi, x, mi_row, mi_col); + *(x->m_search_count_ptr) = m_search_count_backup; + } + } +#else // !CONFIG_SPEED_REFS rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size, &dummy_rdc, #if CONFIG_SUPERTX &dummy_rate_nocoef, #endif // CONFIG_SUPERTX INT64_MAX, pc_root); +#endif // CONFIG_SPEED_REFS } } } @@ -4329,20 +4696,11 @@ static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) { return LAST_FRAME; } -static TX_MODE select_tx_mode(const AV1_COMP *cpi, MACROBLOCKD *const xd) { - int i, all_lossless = 1; - - if (cpi->common.seg.enabled) { - for (i = 0; i < MAX_SEGMENTS; ++i) { - if (!xd->lossless[i]) { - all_lossless = 0; - break; - } - } - } else { - all_lossless = xd->lossless[0]; - } - if (all_lossless) return ONLY_4X4; +static TX_MODE select_tx_mode(const AV1_COMP *cpi) { + if (cpi->common.all_lossless) return ONLY_4X4; +#if CONFIG_VAR_TX_NO_TX_MODE + return TX_MODE_SELECT; +#else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) return ALLOW_32X32 + CONFIG_TX64X64; else if (cpi->sf.tx_size_search_method == USE_FULL_RD || @@ -4350,6 +4708,7 @@ static TX_MODE select_tx_mode(const AV1_COMP *cpi, MACROBLOCKD *const xd) { return TX_MODE_SELECT; else return cpi->common.tx_mode; +#endif // CONFIG_VAR_TX_NO_TX_MODE } void av1_init_tile_data(AV1_COMP *cpi) { @@ -4372,7 +4731,7 @@ void av1_init_tile_data(AV1_COMP *cpi) { TileDataEnc *const tile_data = &cpi->tile_data[tile_row * tile_cols + tile_col]; int i, j; - for (i = 0; i < BLOCK_SIZES; ++i) { + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = 32; tile_data->mode_map[i][j] = j; @@ -4415,12 +4774,8 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, int mi_row; #if CONFIG_DEPENDENT_HORZTILES -#if CONFIG_TILE_GROUPS if ((!cm->dependent_horz_tiles) || (tile_row == 0) || tile_info->tg_horz_boundary) { -#else - if ((!cm->dependent_horz_tiles) || (tile_row == 0)) { -#endif av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end); } #else @@ -4504,22 +4859,21 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, #endif #endif // #if CONFIG_PVQ -#if CONFIG_EC_ADAPT this_tile->tctx = *cm->fc; td->mb.e_mbd.tile_ctx = &this_tile->tctx; -#endif // #if CONFIG_EC_ADAPT #if CONFIG_CFL MACROBLOCKD *const xd = &td->mb.e_mbd; xd->cfl = &this_tile->cfl; - cfl_init(xd->cfl, cm, xd->plane[AOM_PLANE_U].subsampling_x, - xd->plane[AOM_PLANE_U].subsampling_y); + cfl_init(xd->cfl, cm); #endif #if CONFIG_PVQ td->mb.daala_enc.state.adapt = 
&this_tile->tctx.pvq_context; #endif // CONFIG_PVQ + av1_setup_across_tile_boundary_info(cm, tile_info); + for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; mi_row += cm->mib_size) { encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); @@ -4656,6 +5010,36 @@ static int do_gm_search_logic(SPEED_FEATURES *const sf, int num_refs_using_gm, } #endif // CONFIG_GLOBAL_MOTION +#if CONFIG_PALETTE +// Estimate if the source frame is screen content, based on the portion of +// blocks that have no more than 4 (experimentally selected) luma colors. +static int is_screen_content(const uint8_t *src, +#if CONFIG_HIGHBITDEPTH + int use_hbd, int bd, +#endif // CONFIG_HIGHBITDEPTH + int stride, int width, int height) { + assert(src != NULL); + int counts = 0; + const int blk_w = 16; + const int blk_h = 16; + const int limit = 4; + for (int r = 0; r + blk_h <= height; r += blk_h) { + for (int c = 0; c + blk_w <= width; c += blk_w) { + const int n_colors = +#if CONFIG_HIGHBITDEPTH + use_hbd ? av1_count_colors_highbd(src + r * stride + c, stride, blk_w, + blk_h, bd) + : +#endif // CONFIG_HIGHBITDEPTH + av1_count_colors(src + r * stride + c, stride, blk_w, blk_h); + if (n_colors > 1 && n_colors <= limit) counts++; + } + } + // The threshold is 10%. + return counts * blk_h * blk_w * 10 > width * height; +} +#endif // CONFIG_PALETTE + static void encode_frame_internal(AV1_COMP *cpi) { ThreadData *const td = &cpi->td; MACROBLOCK *const x = &td->mb; @@ -4682,6 +5066,23 @@ static void encode_frame_internal(AV1_COMP *cpi) { av1_zero(rdc->coef_counts); av1_zero(rdc->comp_pred_diff); +#if CONFIG_PALETTE || CONFIG_INTRABC + if (frame_is_intra_only(cm)) { +#if CONFIG_PALETTE + cm->allow_screen_content_tools = + cpi->oxcf.content == AOM_CONTENT_SCREEN || + is_screen_content(cpi->source->y_buffer, +#if CONFIG_HIGHBITDEPTH + cpi->source->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, +#endif // CONFIG_HIGHBITDEPTH + cpi->source->y_stride, cpi->source->y_width, + cpi->source->y_height); +#else + cm->allow_screen_content_tools = cpi->oxcf.content == AOM_CONTENT_SCREEN; +#endif // CONFIG_PALETTE + } +#endif // CONFIG_PALETTE || CONFIG_INTRABC + #if CONFIG_GLOBAL_MOTION av1_zero(rdc->global_motion_used); av1_zero(cpi->gmparams_cost); @@ -4709,6 +5110,8 @@ static void encode_frame_internal(AV1_COMP *cpi) { memcpy(&cm->global_motion[frame], &cm->global_motion[pframe], sizeof(WarpedMotionParams)); } else if (ref_buf[frame] && + ref_buf[frame]->y_crop_width == cpi->source->y_crop_width && + ref_buf[frame]->y_crop_height == cpi->source->y_crop_height && do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame)) { TransformationType model; const int64_t ref_frame_error = av1_frame_error( @@ -4716,8 +5119,8 @@ static void encode_frame_internal(AV1_COMP *cpi) { xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, #endif // CONFIG_HIGHBITDEPTH ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride, - cpi->source->y_buffer, 0, 0, cpi->source->y_width, - cpi->source->y_height, cpi->source->y_stride); + cpi->source->y_buffer, cpi->source->y_width, cpi->source->y_height, + cpi->source->y_stride); if (ref_frame_error == 0) continue; @@ -4752,7 +5155,8 @@ static void encode_frame_internal(AV1_COMP *cpi) { ref_buf[frame]->y_buffer, ref_buf[frame]->y_width, ref_buf[frame]->y_height, ref_buf[frame]->y_stride, cpi->source->y_buffer, cpi->source->y_width, - cpi->source->y_height, cpi->source->y_stride, 3); + cpi->source->y_height, cpi->source->y_stride, 5, + best_warp_error); if (warp_error < best_warp_error) { best_warp_error = warp_error; 
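As an illustrative aside to the new is_screen_content() heuristic added above (a sketch, not part of the patch): it counts distinct luma values per 16x16 block and flags the frame once such low-color blocks cover more than 10% of the pixels. The helper names and the naive histogram below are hypothetical stand-ins for av1_count_colors(); for a 1280x720 source the rule amounts to flagging the frame once more than 360 of its 3600 16x16 blocks carry between 2 and 4 distinct luma values.

static int count_block_colors_16x16(const unsigned char *blk, int stride) {
  int seen[256] = { 0 }, n = 0;
  for (int r = 0; r < 16; ++r)
    for (int c = 0; c < 16; ++c)
      if (!seen[blk[r * stride + c]]++) ++n;  // count first occurrence only
  return n;
}

static int looks_like_screen_content(const unsigned char *src, int stride,
                                     int width, int height) {
  int hits = 0;
  for (int r = 0; r + 16 <= height; r += 16)
    for (int c = 0; c + 16 <= width; c += 16) {
      const int n = count_block_colors_16x16(src + r * stride + c, stride);
      if (n > 1 && n <= 4) ++hits;  // "paletted" block: 2..4 luma values
    }
  // Same 10% rule as the patch: hits * 16 * 16 * 10 > width * height.
  return hits * 256 * 10 > width * height;
}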
// Save the wm_params modified by refine_integerized_param() @@ -4812,10 +5216,10 @@ static void encode_frame_internal(AV1_COMP *cpi) { cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; xd->qindex[i] = qindex; } - + cm->all_lossless = all_lossless(cm, xd); if (!cm->seg.enabled && xd->lossless[0]) x->optimize = 0; - cm->tx_mode = select_tx_mode(cpi, xd); + cm->tx_mode = select_tx_mode(cpi); #if CONFIG_DELTA_Q // Fix delta q resolution for the moment @@ -4859,18 +5263,32 @@ static void encode_frame_internal(AV1_COMP *cpi) { #if CONFIG_TEMPMV_SIGNALING if (cm->prev_frame) { - cm->use_prev_frame_mvs &= !cm->error_resilient_mode && - cm->width == cm->prev_frame->buf.y_width && - cm->height == cm->prev_frame->buf.y_height && - !cm->intra_only && !cm->prev_frame->intra_only; + cm->use_prev_frame_mvs &= + !cm->error_resilient_mode && +#if CONFIG_FRAME_SUPERRES + cm->width == cm->last_width && cm->height == cm->last_height && +#else + cm->width == cm->prev_frame->buf.y_crop_width && + cm->height == cm->prev_frame->buf.y_crop_height && +#endif // CONFIG_FRAME_SUPERRES + !cm->intra_only && !cm->prev_frame->intra_only && cm->last_show_frame; } else { cm->use_prev_frame_mvs = 0; } #else - cm->use_prev_frame_mvs = !cm->error_resilient_mode && cm->prev_frame && - cm->width == cm->prev_frame->buf.y_crop_width && - cm->height == cm->prev_frame->buf.y_crop_height && - !cm->intra_only && cm->last_show_frame; + if (cm->prev_frame) { + cm->use_prev_frame_mvs = !cm->error_resilient_mode && +#if CONFIG_FRAME_SUPERRES + cm->width == cm->last_width && + cm->height == cm->last_height && +#else + cm->width == cm->prev_frame->buf.y_crop_width && + cm->height == cm->prev_frame->buf.y_crop_height && +#endif // CONFIG_FRAME_SUPERRES + !cm->intra_only && cm->last_show_frame; + } else { + cm->use_prev_frame_mvs = 0; + } #endif // CONFIG_TEMPMV_SIGNALING // Special case: set prev_mi to NULL when the previous mode info @@ -4894,6 +5312,8 @@ static void encode_frame_internal(AV1_COMP *cpi) { } #endif + av1_setup_frame_boundary_info(cm); + // If allowed, encoding tiles in parallel with one thread handling one tile. // TODO(geza.lore): The multi-threaded encoder is not safe with more than // 1 tile rows, as it uses the single above_context et al arrays from @@ -4921,7 +5341,11 @@ static void make_consistent_compound_tools(AV1_COMMON *cm) { cm->allow_interintra_compound = 0; #endif // CONFIG_INTERINTRA #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE +#if CONFIG_COMPOUND_SINGLEREF + if (frame_is_intra_only(cm)) +#else // !CONFIG_COMPOUND_SINGLEREF if (frame_is_intra_only(cm) || cm->reference_mode == SINGLE_REFERENCE) +#endif // CONFIG_COMPOUND_SINGLEREF cm->allow_masked_compound = 0; #endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE } @@ -4942,14 +5366,14 @@ void av1_encode_frame(AV1_COMP *cpi) { // side behavior is where the ALT ref buffer has opposite sign bias to // the other two. 
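As a condensed reading of the comment above (illustrative only, not part of the patch): under the legacy behaviour that remains when neither CONFIG_ONE_SIDED_COMPOUND nor CONFIG_EXT_COMP_REFS is set, compound inter-inter prediction is permitted only when ALTREF lies on the opposite temporal side of both other references, i.e. its sign bias differs from LAST's and GOLDEN's. A minimal, self-contained restatement of that check:

// Hypothetical condensation of the sign-bias test that follows in
// av1_encode_frame(); the three arguments correspond to
// cm->ref_frame_sign_bias[ALTREF_FRAME / GOLDEN_FRAME / LAST_FRAME].
static int legacy_allow_comp_inter_inter(int altref_sign_bias,
                                         int golden_sign_bias,
                                         int last_sign_bias) {
  return altref_sign_bias != golden_sign_bias &&
         altref_sign_bias != last_sign_bias;
}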
if (!frame_is_intra_only(cm)) { -#if !CONFIG_ONE_SIDED_COMPOUND +#if !(CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS) if ((cm->ref_frame_sign_bias[ALTREF_FRAME] == cm->ref_frame_sign_bias[GOLDEN_FRAME]) || (cm->ref_frame_sign_bias[ALTREF_FRAME] == cm->ref_frame_sign_bias[LAST_FRAME])) { cpi->allow_comp_inter_inter = 0; } else { -#endif +#endif // !(CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS) cpi->allow_comp_inter_inter = 1; #if CONFIG_EXT_REFS cm->comp_fwd_ref[0] = LAST_FRAME; @@ -4962,10 +5386,11 @@ void av1_encode_frame(AV1_COMP *cpi) { cm->comp_fixed_ref = ALTREF_FRAME; cm->comp_var_ref[0] = LAST_FRAME; cm->comp_var_ref[1] = GOLDEN_FRAME; -#endif // CONFIG_EXT_REFS -#if !CONFIG_ONE_SIDED_COMPOUND // Normative in encoder +#endif // CONFIG_EXT_REFS +#if !(CONFIG_ONE_SIDED_COMPOUND || \ + CONFIG_EXT_COMP_REFS) // Normative in encoder } -#endif +#endif // !(CONFIG_ONE_SIDED_COMPOUND || CONFIG_EXT_COMP_REFS) } else { cpi->allow_comp_inter_inter = 0; } @@ -4997,8 +5422,13 @@ void av1_encode_frame(AV1_COMP *cpi) { cm->reference_mode = SINGLE_REFERENCE; else cm->reference_mode = REFERENCE_MODE_SELECT; +#else +#if CONFIG_BGSPRITE + (void)is_alt_ref; + if (!cpi->allow_comp_inter_inter) #else if (is_alt_ref || !cpi->allow_comp_inter_inter) +#endif // CONFIG_BGSPRITE cm->reference_mode = SINGLE_REFERENCE; else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] && mode_thrs[COMPOUND_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT] && @@ -5017,25 +5447,23 @@ void av1_encode_frame(AV1_COMP *cpi) { #if CONFIG_EXT_INTER make_consistent_compound_tools(cm); #endif // CONFIG_EXT_INTER + + rdc->single_ref_used_flag = 0; + rdc->compound_ref_used_flag = 0; + encode_frame_internal(cpi); for (i = 0; i < REFERENCE_MODES; ++i) mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2; if (cm->reference_mode == REFERENCE_MODE_SELECT) { - int single_count_zero = 0; - int comp_count_zero = 0; - - for (i = 0; i < COMP_INTER_CONTEXTS; i++) { - single_count_zero += counts->comp_inter[i][0]; - comp_count_zero += counts->comp_inter[i][1]; - } - - if (comp_count_zero == 0) { + // Use a flag that includes 4x4 blocks + if (rdc->compound_ref_used_flag == 0) { cm->reference_mode = SINGLE_REFERENCE; av1_zero(counts->comp_inter); #if !CONFIG_REF_ADAPT - } else if (single_count_zero == 0) { + // Use a flag that includes 4x4 blocks + } else if (rdc->single_ref_used_flag == 0) { cm->reference_mode = COMPOUND_REFERENCE; av1_zero(counts->comp_inter); #endif // !CONFIG_REF_ADAPT @@ -5046,10 +5474,15 @@ void av1_encode_frame(AV1_COMP *cpi) { #endif // CONFIG_EXT_INTER #if CONFIG_VAR_TX +#if CONFIG_RECT_TX_EXT + if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0 && + counts->quarter_tx_size[1] == 0) +#else if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0) +#endif cm->tx_mode = ALLOW_32X32 + CONFIG_TX64X64; #else -#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#if CONFIG_RECT_TX_EXT && CONFIG_EXT_TX if (cm->tx_mode == TX_MODE_SELECT && counts->quarter_tx_size[1] == 0) { #else if (cm->tx_mode == TX_MODE_SELECT) { @@ -5232,12 +5665,20 @@ static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd, const MODE_INFO *left_mi, const int intraonly, const int mi_row, const int mi_col) { const MB_MODE_INFO *const mbmi = &mi->mbmi; +#if CONFIG_ENTROPY_STATS const PREDICTION_MODE y_mode = mbmi->mode; - const PREDICTION_MODE uv_mode = mbmi->uv_mode; + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; +#else // CONFIG_ENTROPY_STATS + (void)counts; + 
(void)above_mi; + (void)left_mi; + (void)intraonly; +#endif // CONFIG_ENTROPY_STATS const BLOCK_SIZE bsize = mbmi->sb_type; const int unify_bsize = CONFIG_CB4X4; if (bsize < BLOCK_8X8 && !unify_bsize) { +#if CONFIG_ENTROPY_STATS int idx, idy; const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; @@ -5253,7 +5694,9 @@ static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd, ++counts->y_mode[0][bmode]; } } +#endif // CONFIG_ENTROPY_STATS } else { +#if CONFIG_ENTROPY_STATS if (intraonly) { const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, 0); const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, 0); @@ -5261,6 +5704,7 @@ static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd, } else { ++counts->y_mode[size_group_lookup[bsize]][y_mode]; } +#endif // CONFIG_ENTROPY_STATS #if CONFIG_FILTER_INTRA if (mbmi->mode == DC_PRED #if CONFIG_PALETTE @@ -5271,7 +5715,7 @@ static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd, mbmi->filter_intra_mode_info.use_filter_intra_mode[0]; ++counts->filter_intra[0][use_filter_intra_mode]; } - if (mbmi->uv_mode == DC_PRED + if (mbmi->uv_mode == UV_DC_PRED #if CONFIG_CB4X4 && is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, @@ -5306,7 +5750,9 @@ static void sum_intra_stats(FRAME_COUNTS *counts, MACROBLOCKD *xd, (void)mi_col; (void)xd; #endif +#if CONFIG_ENTROPY_STATS ++counts->uv_mode[y_mode][uv_mode]; +#endif // CONFIG_ENTROPY_STATS } #if CONFIG_VAR_TX @@ -5325,9 +5771,17 @@ static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd, if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; +#if CONFIG_RECT_TX_EXT + if (tx_size == plane_tx_size || + mbmi->tx_size == quarter_txsize_lookup[mbmi->sb_type]) { +#else if (tx_size == plane_tx_size) { +#endif ++counts->txfm_partition[ctx][0]; - mbmi->tx_size = tx_size; +#if CONFIG_RECT_TX_EXT + if (tx_size == plane_tx_size) +#endif + mbmi->tx_size = tx_size; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); } else { @@ -5438,18 +5892,22 @@ static void tx_partition_set_contexts(const AV1_COMMON *const cm, void av1_update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd, #if CONFIG_TXK_SEL - int block, int plane, + int blk_row, int blk_col, int block, int plane, #endif BLOCK_SIZE bsize, TX_SIZE tx_size, FRAME_COUNTS *counts) { MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; int is_inter = is_inter_block(mbmi); + #if !CONFIG_TXK_SEL TX_TYPE tx_type = mbmi->tx_type; #else + (void)blk_row; + (void)blk_col; // Only y plane's tx_type is updated if (plane > 0) return; - TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size); + TX_TYPE tx_type = + av1_get_tx_type(PLANE_TYPE_Y, xd, blk_row, blk_col, block, tx_size); #endif #if CONFIG_EXT_TX if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 && @@ -5509,7 +5967,7 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, x->pvq_coded = (dry_run == OUTPUT_ENABLED) ? 1 : 0; #endif #if CONFIG_CFL - x->cfl_store_y = (dry_run == OUTPUT_ENABLED) ? 
1 : 0; + x->cfl_store_y = 1; #endif if (!is_inter) { @@ -5526,13 +5984,8 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, #if CONFIG_PALETTE if (bsize >= BLOCK_8X8 && !dry_run) { for (plane = 0; plane <= 1; ++plane) { - if (mbmi->palette_mode_info.palette_size[plane] > 0) { - mbmi->palette_mode_info.palette_first_color_idx[plane] = - xd->plane[plane].color_index_map[0]; - // TODO(huisu): this increases the use of token buffer. Needs stretch - // test to verify. + if (mbmi->palette_mode_info.palette_size[plane] > 0) av1_tokenize_palette_sb(cpi, td, plane, t, dry_run, bsize, rate); - } } } #endif // CONFIG_PALETTE @@ -5559,9 +6012,21 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, &xd->block_refs[ref]->sf); } - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, block_size); +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // Single ref compound mode + if (!is_compound && is_inter_singleref_comp_mode(mbmi->mode)) { + xd->block_refs[1] = xd->block_refs[0]; + YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[0]); +#if CONFIG_INTRABC + assert(IMPLIES(!is_intrabc_block(mbmi), cfg)); +#else + assert(cfg != NULL); +#endif // !CONFIG_INTRABC + av1_setup_pre_planes(xd, 1, cfg, mi_row, mi_col, &xd->block_refs[1]->sf); + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF - av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, NULL, block_size); + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, block_size); #if CONFIG_MOTION_VAR if (mbmi->motion_mode == OBMC_CAUSAL) { #if CONFIG_NCOBMC @@ -5587,10 +6052,11 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, #endif } -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 if (bsize < BLOCK_8X8) { - daala_dist_set_sub8x8_dst(x, x->decoded_8x8, bsize, block_size_wide[bsize], - block_size_high[bsize], mi_row, mi_col); + dist_8x8_set_sub8x8_dst(x, (uint8_t *)x->decoded_8x8, bsize, + block_size_wide[bsize], block_size_high[bsize], + mi_row, mi_col); } #endif @@ -5629,13 +6095,16 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth]; #endif -#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT + +#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) if (is_quarter_tx_allowed(xd, mbmi, is_inter) && - mbmi->tx_size != txsize_sqr_up_map[mbmi->tx_size]) { - ++td->counts->quarter_tx_size[mbmi->tx_size == - quarter_txsize_lookup[mbmi->sb_type]]; + quarter_txsize_lookup[bsize] != max_txsize_rect_lookup[bsize] && + (mbmi->tx_size == quarter_txsize_lookup[bsize] || + mbmi->tx_size == max_txsize_rect_lookup[bsize])) { + ++td->counts + ->quarter_tx_size[mbmi->tx_size == quarter_txsize_lookup[bsize]]; } -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#endif #if CONFIG_EXT_TX && CONFIG_RECT_TX assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi))); #endif // CONFIG_EXT_TX && CONFIG_RECT_TX @@ -5673,8 +6142,8 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td, } ++td->counts->tx_size_totals[txsize_sqr_map[tx_size]]; - ++td->counts - ->tx_size_totals[txsize_sqr_map[get_uv_tx_size(mbmi, &xd->plane[1])]]; + ++td->counts->tx_size_totals[txsize_sqr_map[av1_get_uv_tx_size( + mbmi, &xd->plane[1])]]; #if !CONFIG_TXK_SEL av1_update_tx_type_count(cm, xd, bsize, tx_size, td->counts); #endif @@ -5837,7 +6306,7 @@ static void 
predict_superblock(const AV1_COMP *const cpi, ThreadData *td, #if CONFIG_EXT_INTER int mi_row_ori, int mi_col_ori, #endif // CONFIG_EXT_INTER - int mi_row_pred, int mi_col_pred, + int mi_row_pred, int mi_col_pred, int plane, BLOCK_SIZE bsize_pred, int b_sub8x8, int block) { // Used in supertx // (mi_row_ori, mi_col_ori): location for mv @@ -5859,28 +6328,39 @@ static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td, &xd->block_refs[ref]->sf); } +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // Single ref compound mode + if (!is_compound && is_inter_singleref_comp_mode(mbmi->mode)) { + xd->block_refs[1] = xd->block_refs[0]; + YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[0]); + av1_setup_pre_planes(xd, 1, cfg, mi_row_pred, mi_col_pred, + &xd->block_refs[1]->sf); + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!b_sub8x8) - av1_build_inter_predictors_sb_extend(cm, xd, + av1_build_inter_predictor_sb_extend(cm, xd, #if CONFIG_EXT_INTER - mi_row_ori, mi_col_ori, + mi_row_ori, mi_col_ori, #endif // CONFIG_EXT_INTER - mi_row_pred, mi_col_pred, bsize_pred); + mi_row_pred, mi_col_pred, plane, + bsize_pred); else - av1_build_inter_predictors_sb_sub8x8_extend(cm, xd, + av1_build_inter_predictor_sb_sub8x8_extend(cm, xd, #if CONFIG_EXT_INTER - mi_row_ori, mi_col_ori, + mi_row_ori, mi_col_ori, #endif // CONFIG_EXT_INTER - mi_row_pred, mi_col_pred, - bsize_pred, block); + mi_row_pred, mi_col_pred, plane, + bsize_pred, block); } static void predict_b_extend(const AV1_COMP *const cpi, ThreadData *td, const TileInfo *const tile, int block, int mi_row_ori, int mi_col_ori, int mi_row_pred, int mi_col_pred, int mi_row_top, int mi_col_top, - uint8_t *dst_buf[3], int dst_stride[3], + int plane, uint8_t *dst_buf, int dst_stride, BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred, - RUN_TYPE dry_run, int b_sub8x8, int bextend) { + RUN_TYPE dry_run, int b_sub8x8) { // Used in supertx // (mi_row_ori, mi_col_ori): location for mv // (mi_row_pred, mi_col_pred, bsize_pred): region to predict @@ -5905,34 +6385,27 @@ static void predict_b_extend(const AV1_COMP *const cpi, ThreadData *td, set_offsets_extend(cpi, td, tile, mi_row_pred, mi_col_pred, mi_row_ori, mi_col_ori, bsize_pred); - xd->plane[0].dst.stride = dst_stride[0]; - xd->plane[1].dst.stride = dst_stride[1]; - xd->plane[2].dst.stride = dst_stride[2]; - xd->plane[0].dst.buf = dst_buf[0] + - (r >> xd->plane[0].subsampling_y) * dst_stride[0] + - (c >> xd->plane[0].subsampling_x); - xd->plane[1].dst.buf = dst_buf[1] + - (r >> xd->plane[1].subsampling_y) * dst_stride[1] + - (c >> xd->plane[1].subsampling_x); - xd->plane[2].dst.buf = dst_buf[2] + - (r >> xd->plane[2].subsampling_y) * dst_stride[2] + - (c >> xd->plane[2].subsampling_x); + xd->plane[plane].dst.stride = dst_stride; + xd->plane[plane].dst.buf = + dst_buf + (r >> xd->plane[plane].subsampling_y) * dst_stride + + (c >> xd->plane[plane].subsampling_x); predict_superblock(cpi, td, #if CONFIG_EXT_INTER mi_row_ori, mi_col_ori, #endif // CONFIG_EXT_INTER - mi_row_pred, mi_col_pred, bsize_pred, b_sub8x8, block); + mi_row_pred, mi_col_pred, plane, bsize_pred, b_sub8x8, + block); - if (!dry_run && !bextend) + if (!dry_run && (plane == 0) && (block == 0 || !b_sub8x8)) update_stats(&cpi->common, td, mi_row_pred, mi_col_pred, 1); } static void extend_dir(const AV1_COMP *const cpi, ThreadData *td, const TileInfo *const tile, int block, BLOCK_SIZE bsize, - BLOCK_SIZE top_bsize, int mi_row, int mi_col, - int mi_row_top, int mi_col_top, RUN_TYPE dry_run, - uint8_t 
*dst_buf[3], int dst_stride[3], int dir) { + BLOCK_SIZE top_bsize, int mi_row_ori, int mi_col_ori, + int mi_row, int mi_col, int mi_row_top, int mi_col_top, + int plane, uint8_t *dst_buf, int dst_stride, int dir) { // dir: 0-lower, 1-upper, 2-left, 3-right // 4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright MACROBLOCKD *xd = &td->mb.e_mbd; @@ -5973,10 +6446,10 @@ static void extend_dir(const AV1_COMP *const cpi, ThreadData *td, for (j = 0; j < mi_height + ext_offset; j += high_unit) for (i = 0; i < mi_width + ext_offset; i += wide_unit) - predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred + j, - mi_col_pred + i, mi_row_top, mi_col_top, dst_buf, - dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8, - 1); + predict_b_extend(cpi, td, tile, block, mi_row_ori, mi_col_ori, + mi_row_pred + j, mi_col_pred + i, mi_row_top, + mi_col_top, plane, dst_buf, dst_stride, top_bsize, + extend_bsize, 1, b_sub8x8); } else if (dir == 2 || dir == 3) { // left and right extend_bsize = (mi_height == mi_size_high[BLOCK_8X8] || bsize < BLOCK_8X8 || yss < xss) @@ -5996,10 +6469,10 @@ static void extend_dir(const AV1_COMP *const cpi, ThreadData *td, for (j = 0; j < mi_height + ext_offset; j += high_unit) for (i = 0; i < mi_width + ext_offset; i += wide_unit) - predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred + j, - mi_col_pred + i, mi_row_top, mi_col_top, dst_buf, - dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8, - 1); + predict_b_extend(cpi, td, tile, block, mi_row_ori, mi_col_ori, + mi_row_pred + j, mi_col_pred + i, mi_row_top, + mi_col_top, plane, dst_buf, dst_stride, top_bsize, + extend_bsize, 1, b_sub8x8); } else { extend_bsize = BLOCK_8X8; #if CONFIG_CB4X4 @@ -6018,35 +6491,24 @@ static void extend_dir(const AV1_COMP *const cpi, ThreadData *td, for (j = 0; j < mi_height + ext_offset; j += high_unit) for (i = 0; i < mi_width + ext_offset; i += wide_unit) - predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred + j, - mi_col_pred + i, mi_row_top, mi_col_top, dst_buf, - dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8, - 1); + predict_b_extend(cpi, td, tile, block, mi_row_ori, mi_col_ori, + mi_row_pred + j, mi_col_pred + i, mi_row_top, + mi_col_top, plane, dst_buf, dst_stride, top_bsize, + extend_bsize, 1, b_sub8x8); } } static void extend_all(const AV1_COMP *const cpi, ThreadData *td, const TileInfo *const tile, int block, BLOCK_SIZE bsize, - BLOCK_SIZE top_bsize, int mi_row, int mi_col, - int mi_row_top, int mi_col_top, RUN_TYPE dry_run, - uint8_t *dst_buf[3], int dst_stride[3]) { + BLOCK_SIZE top_bsize, int mi_row_ori, int mi_col_ori, + int mi_row, int mi_col, int mi_row_top, int mi_col_top, + int plane, uint8_t *dst_buf, int dst_stride) { assert(block >= 0 && block < 4); - extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, dry_run, dst_buf, dst_stride, 0); - extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, dry_run, dst_buf, dst_stride, 1); - extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, dry_run, dst_buf, dst_stride, 2); - extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, dry_run, dst_buf, dst_stride, 3); - extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, dry_run, dst_buf, dst_stride, 4); - extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, dry_run, dst_buf, dst_stride, 5); - 
extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, dry_run, dst_buf, dst_stride, 6); - extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, dry_run, dst_buf, dst_stride, 7); + for (int i = 0; i < 8; ++i) { + extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row_ori, mi_col_ori, + mi_row, mi_col, mi_row_top, mi_col_top, plane, dst_buf, + dst_stride, i); + } } // This function generates prediction for multiple blocks, between which @@ -6140,29 +6602,36 @@ static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td, switch (partition) { case PARTITION_NONE: assert(bsize < top_bsize); - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - bsize, dry_run, 0, 0); - extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, dry_run, dst_buf, dst_stride); + for (i = 0; i < MAX_MB_PLANE; ++i) { + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i], + top_bsize, bsize, dry_run, 0); + extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col, mi_row, + mi_col, mi_row_top, mi_col_top, i, dst_buf[i], + dst_stride[i]); + } break; case PARTITION_HORZ: if (bsize == BLOCK_8X8 && !unify_bsize) { - // Fisrt half - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - BLOCK_8X8, dry_run, 1, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); - - // Second half - predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf1, dst_stride1, - top_bsize, BLOCK_8X8, dry_run, 1, 1); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); + for (i = 0; i < MAX_MB_PLANE; ++i) { + // First half + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i], + top_bsize, BLOCK_8X8, dry_run, 1); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], + dst_stride[i]); + + // Second half + predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, i, dst_buf1[i], + dst_stride1[i], top_bsize, BLOCK_8X8, dry_run, 1); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf1[i], + dst_stride1[i]); + } // Smooth xd->plane[0].dst.buf = dst_buf[0]; @@ -6172,60 +6641,89 @@ static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ, 0); } else { - // First half - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - subsize, dry_run, 0, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0); - - if (mi_row + hbs < cm->mi_rows) { - // Second half - predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, 
mi_row + hbs, - mi_col, mi_row_top, mi_col_top, dst_buf1, - dst_stride1, top_bsize, subsize, dry_run, 0, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, - mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1, - dst_stride1); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, - mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1, - dst_stride1, 1); - - // Smooth - for (i = 0; i < MAX_MB_PLANE; i++) { + for (i = 0; i < MAX_MB_PLANE; ++i) { +#if CONFIG_CB4X4 + const struct macroblockd_plane *pd = &xd->plane[i]; + int handle_chroma_sub8x8 = need_handle_chroma_sub8x8( + subsize, pd->subsampling_x, pd->subsampling_y); + + if (handle_chroma_sub8x8) { + int mode_offset_row = CONFIG_CHROMA_SUB8X8 ? hbs : 0; + + predict_b_extend(cpi, td, tile, 0, mi_row + mode_offset_row, mi_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, + dst_buf[i], dst_stride[i], top_bsize, bsize, + dry_run, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, bsize, top_bsize, + mi_row + mode_offset_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i]); + } else { +#endif + // First half + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, i, dst_buf[i], + dst_stride[i], top_bsize, subsize, dry_run, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], + dst_stride[i]); + else + extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], + dst_stride[i], 0); xd->plane[i].dst.buf = dst_buf[i]; xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_HORZ, i); + + if (mi_row + hbs < cm->mi_rows) { + // Second half + predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, + mi_row + hbs, mi_col, mi_row_top, mi_col_top, i, + dst_buf1[i], dst_stride1[i], top_bsize, subsize, + dry_run, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, + mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top, + i, dst_buf1[i], dst_stride1[i]); + else + extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, + mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top, + i, dst_buf1[i], dst_stride1[i], 1); + // Smooth + xd->plane[i].dst.buf = dst_buf[i]; + xd->plane[i].dst.stride = dst_stride[i]; + av1_build_masked_inter_predictor_complex( + xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], + mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, + PARTITION_HORZ, i); + } +#if CONFIG_CB4X4 } +#endif } } break; case PARTITION_VERT: if (bsize == BLOCK_8X8 && !unify_bsize) { - // First half - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - BLOCK_8X8, dry_run, 1, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); - - // Second half - predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf1, dst_stride1, - top_bsize, BLOCK_8X8, dry_run, 1, 1); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); + for (i = 0; i < 
MAX_MB_PLANE; ++i) { + // First half + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i], + top_bsize, BLOCK_8X8, dry_run, 1); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], + dst_stride[i]); + + // Second half + predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, i, dst_buf1[i], + dst_stride1[i], top_bsize, BLOCK_8X8, dry_run, 1); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf1[i], + dst_stride1[i]); + } // Smooth xd->plane[0].dst.buf = dst_buf[0]; @@ -6235,66 +6733,160 @@ static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT, 0); } else { - // bsize: not important, not useful - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - subsize, dry_run, 0, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3); - - if (mi_col + hbs < cm->mi_cols) { - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, dst_buf1, - dst_stride1, top_bsize, subsize, dry_run, 0, 0); - if (bsize < top_bsize) - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1, - dst_stride1); - else - extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1, - dst_stride1, 2); - - for (i = 0; i < MAX_MB_PLANE; i++) { + for (i = 0; i < MAX_MB_PLANE; ++i) { +#if CONFIG_CB4X4 + const struct macroblockd_plane *pd = &xd->plane[i]; + int handle_chroma_sub8x8 = need_handle_chroma_sub8x8( + subsize, pd->subsampling_x, pd->subsampling_y); + + if (handle_chroma_sub8x8) { + int mode_offset_col = CONFIG_CHROMA_SUB8X8 ? 
hbs : 0; + + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + mode_offset_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, + dst_buf[i], dst_stride[i], top_bsize, bsize, + dry_run, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, + mi_col + mode_offset_col, mi_row, mi_col, mi_row_top, + mi_col_top, i, dst_buf[i], dst_stride[i]); + } else { +#endif + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, i, dst_buf[i], + dst_stride[i], top_bsize, subsize, dry_run, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], + dst_stride[i]); + else + extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], + dst_stride[i], 3); xd->plane[i].dst.buf = dst_buf[i]; xd->plane[i].dst.stride = dst_stride[i]; - av1_build_masked_inter_predictor_complex( - xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], - mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, - PARTITION_VERT, i); + + if (mi_col + hbs < cm->mi_cols) { + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, + mi_col + hbs, mi_row_top, mi_col_top, i, + dst_buf1[i], dst_stride1[i], top_bsize, subsize, + dry_run, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, + mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, + mi_col_top, i, dst_buf1[i], dst_stride1[i]); + else + extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, + mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, + mi_col_top, i, dst_buf1[i], dst_stride1[i], 2); + + // smooth + xd->plane[i].dst.buf = dst_buf[i]; + xd->plane[i].dst.stride = dst_stride[i]; + av1_build_masked_inter_predictor_complex( + xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], + mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, + PARTITION_VERT, i); + } +#if CONFIG_CB4X4 } +#endif } } break; case PARTITION_SPLIT: if (bsize == BLOCK_8X8 && !unify_bsize) { - predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - BLOCK_8X8, dry_run, 1, 0); - predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf1, dst_stride1, - top_bsize, BLOCK_8X8, dry_run, 1, 1); - predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf2, dst_stride2, - top_bsize, BLOCK_8X8, dry_run, 1, 1); - predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col, - mi_row_top, mi_col_top, dst_buf3, dst_stride3, - top_bsize, BLOCK_8X8, dry_run, 1, 1); - - if (bsize < top_bsize) { - extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); - extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); - extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2); - extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, dry_run, dst_buf3, dst_stride3); + for (i = 0; i < MAX_MB_PLANE; i++) { + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, i, dst_buf[i], dst_stride[i], + top_bsize, BLOCK_8X8, dry_run, 1); + predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, i, 
dst_buf1[i], + dst_stride1[i], top_bsize, BLOCK_8X8, dry_run, 1); + predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, i, dst_buf2[i], + dst_stride2[i], top_bsize, BLOCK_8X8, dry_run, 1); + predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, i, dst_buf3[i], + dst_stride3[i], top_bsize, BLOCK_8X8, dry_run, 1); + + if (bsize < top_bsize) { + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], + dst_stride[i]); + extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf1[i], + dst_stride1[i]); + extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf2[i], + dst_stride2[i]); + extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf3[i], + dst_stride3[i]); + } + } +#if CONFIG_CB4X4 + } else if (bsize == BLOCK_8X8) { + for (i = 0; i < MAX_MB_PLANE; i++) { + const struct macroblockd_plane *pd = &xd->plane[i]; + int handle_chroma_sub8x8 = need_handle_chroma_sub8x8( + subsize, pd->subsampling_x, pd->subsampling_y); + + if (handle_chroma_sub8x8) { + int mode_offset_row = + CONFIG_CHROMA_SUB8X8 && mi_row + hbs < cm->mi_rows ? hbs : 0; + int mode_offset_col = + CONFIG_CHROMA_SUB8X8 && mi_col + hbs < cm->mi_cols ? hbs : 0; + + predict_b_extend(cpi, td, tile, 0, mi_row + mode_offset_row, + mi_col + mode_offset_col, mi_row, mi_col, + mi_row_top, mi_col_top, i, dst_buf[i], + dst_stride[i], top_bsize, BLOCK_8X8, dry_run, 0); + if (bsize < top_bsize) + extend_all(cpi, td, tile, 0, BLOCK_8X8, top_bsize, + mi_row + mode_offset_row, mi_col + mode_offset_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], + dst_stride[i]); + } else { + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, + mi_row_top, mi_col_top, i, dst_buf[i], + dst_stride[i], top_bsize, subsize, dry_run, 0); + if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) + predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, + mi_col + hbs, mi_row_top, mi_col_top, i, + dst_buf1[i], dst_stride1[i], top_bsize, subsize, + dry_run, 0); + if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) + predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, + mi_row + hbs, mi_col, mi_row_top, mi_col_top, i, + dst_buf2[i], dst_stride2[i], top_bsize, subsize, + dry_run, 0); + if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) + predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs, + mi_row + hbs, mi_col + hbs, mi_row_top, + mi_col_top, i, dst_buf3[i], dst_stride3[i], + top_bsize, subsize, dry_run, 0); + + if (bsize < top_bsize) { + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, + mi_row, mi_col, mi_row_top, mi_col_top, i, dst_buf[i], + dst_stride[i]); + if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, + mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, + mi_col_top, i, dst_buf1[i], dst_stride1[i]); + if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, + mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top, + i, dst_buf2[i], dst_stride2[i]); + if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) + extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, + mi_col + hbs, mi_row + hbs, mi_col + hbs, mi_row_top, + 
mi_col_top, i, dst_buf3[i], dst_stride3[i]); + } + } } +#endif } else { predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row_top, mi_col_top, dry_run, subsize, top_bsize, dst_buf, @@ -6314,10 +6906,16 @@ static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td, pc_tree->split[3]); } for (i = 0; i < MAX_MB_PLANE; i++) { -#if !CONFIG_CB4X4 +#if CONFIG_CB4X4 + const struct macroblockd_plane *pd = &xd->plane[i]; + int handle_chroma_sub8x8 = need_handle_chroma_sub8x8( + subsize, pd->subsampling_x, pd->subsampling_y); + if (handle_chroma_sub8x8) continue; // Skip <4x4 chroma smoothing +#else if (bsize == BLOCK_8X8 && i != 0) continue; // Skip <4x4 chroma smoothing #endif + if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) { av1_build_masked_inter_predictor_complex( xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], @@ -6334,9 +6932,6 @@ static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td, PARTITION_HORZ, i); } } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) { - if (bsize == BLOCK_8X8 && i != 0) - continue; // Skip <4x4 chroma smoothing - av1_build_masked_inter_predictor_complex( xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize, @@ -6660,8 +7255,7 @@ static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td, *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); x->skip = 1; } else { - if (RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist) < - RDCOST(x->rdmult, x->rddiv, 0, sse)) { + if (RDCOST(x->rdmult, *tmp_rate, *tmp_dist) < RDCOST(x->rdmult, 0, sse)) { *tmp_rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); x->skip = 0; } else { @@ -6671,7 +7265,7 @@ static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td, } } *tmp_rate += base_rate; - rd_tx = RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist); + rd_tx = RDCOST(x->rdmult, *tmp_rate, *tmp_dist); if (rd_tx < bestrd_tx * 0.99 || tx_type == DCT_DCT) { *best_tx = tx_type; bestrd_tx = rd_tx; diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h index 46a99e1cf..569ec9f72 100644 --- a/third_party/aom/av1/encoder/encodeframe.h +++ b/third_party/aom/av1/encoder/encodeframe.h @@ -37,7 +37,7 @@ void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row, void av1_update_tx_type_count(const struct AV1Common *cm, MACROBLOCKD *xd, #if CONFIG_TXK_SEL - int block, int plane, + int blk_row, int blk_col, int block, int plane, #endif BLOCK_SIZE bsize, TX_SIZE tx_size, FRAME_COUNTS *counts); diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c index 7c9781533..e7f4d313d 100644 --- a/third_party/aom/av1/encoder/encodemb.c +++ b/third_party/aom/av1/encoder/encodemb.c @@ -112,19 +112,9 @@ void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { // These numbers are empirically obtained. 
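Throughout these hunks the three-argument RDCOST(rdmult, rddiv, rate, dist) calls become two-argument RDCOST(rdmult, rate, dist) calls; the per-block rddiv divisor is folded into the macro. As a hedged sketch of the general shape only (the real macro and its shift constants live in av1/encoder/rd.h and are not reproduced here), the cost is still a fixed-point lambda * rate + dist:

// Illustrative only; the shift values below are assumptions, not values
// copied from rd.h. Callers now supply just (rdmult, rate, dist), e.g.
// sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
static int64_t sketch_rdcost(int rdmult, int rate, int64_t dist) {
  const int prob_cost_shift = 9;  // assumed rate scaling (cf. AV1_PROB_COST_SHIFT)
  const int dist_shift = 7;       // assumed distortion scaling (cf. RDDIV_BITS)
  return (((int64_t)rate * rdmult) >> prob_cost_shift) + (dist << dist_shift);
}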
static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { -#if CONFIG_EC_ADAPT { 10, 7 }, { 8, 5 }, -#else - { 10, 6 }, { 8, 6 }, -#endif }; -#define UPDATE_RD_COST() \ - { \ - rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \ - rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \ - } - static INLINE unsigned int get_token_bit_costs( unsigned int token_costs[2][COEFF_CONTEXTS][ENTROPY_TOKENS], int skip_eob, int ctx, int token) { @@ -133,23 +123,14 @@ static INLINE unsigned int get_token_bit_costs( } #if !CONFIG_LV_MAP -#define USE_GREEDY_OPTIMIZE_B 0 - -#if USE_GREEDY_OPTIMIZE_B - -typedef struct av1_token_state_greedy { - int16_t token; - tran_low_t qc; - tran_low_t dqc; -} av1_token_state_greedy; static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, - int block, TX_SIZE tx_size, int ctx) { + int blk_row, int blk_col, int block, + TX_SIZE tx_size, int ctx) { MACROBLOCKD *const xd = &mb->e_mbd; struct macroblock_plane *const p = &mb->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; const int ref = is_inter_block(&xd->mi[0]->mbmi); - av1_token_state_greedy tokens[MAX_TX_SQUARE + 1][2]; uint8_t token_cache[MAX_TX_SQUARE]; const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); @@ -158,23 +139,27 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, const PLANE_TYPE plane_type = pd->plane_type; const int16_t *const dequant_ptr = pd->dequant; const uint8_t *const band_translate = get_band_translate(tx_size); - TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + TX_TYPE tx_type = + av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); const SCAN_ORDER *const scan_order = - get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi)); + get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); const int16_t *const scan = scan_order->scan; const int16_t *const nb = scan_order->neighbors; int dqv; const int shift = av1_get_tx_scale(tx_size); #if CONFIG_AOM_QM int seg_id = xd->mi[0]->mbmi.segment_id; - const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!ref][tx_size]; + // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms + const qm_val_t *iqmatrix = + IS_2D_TRANSFORM(tx_type) + ? 
pd->seg_iqmatrix[seg_id][!ref][tx_size] + : cm->giqmatrix[NUM_QM_LEVELS - 1][0][0][tx_size]; #endif #if CONFIG_NEW_QUANT int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type); const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq]; #endif // CONFIG_NEW_QUANT int sz = 0; - const int64_t rddiv = mb->rddiv; int64_t rd_cost0, rd_cost1; int16_t t0, t1; int i, final_eob; @@ -193,19 +178,8 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int64_t rate0, rate1; for (i = 0; i < eob; i++) { const int rc = scan[i]; - int x = qcoeff[rc]; - t0 = av1_get_token(x); - - tokens[i][0].qc = x; - tokens[i][0].token = t0; - tokens[i][0].dqc = dqcoeff[rc]; - - token_cache[rc] = av1_pt_energy_class[t0]; + token_cache[rc] = av1_pt_energy_class[av1_get_token(qcoeff[rc])]; } - tokens[eob][0].token = EOB_TOKEN; - tokens[eob][0].qc = 0; - tokens[eob][0].dqc = 0; - tokens[eob][1] = tokens[eob][0]; unsigned int(*token_costs_ptr)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = token_costs; @@ -213,20 +187,22 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, final_eob = 0; int64_t eob_cost0, eob_cost1; + tran_low_t before_best_eob_qc = 0; + tran_low_t before_best_eob_dqc = 0; const int ctx0 = ctx; /* Record the r-d cost */ int64_t accu_rate = 0; - int64_t accu_error = 0; + // Initialized to the worst possible error for the largest transform size. + // This ensures that it never goes negative. + int64_t accu_error = ((int64_t)1) << 50; rate0 = get_token_bit_costs(*(token_costs_ptr + band_translate[0]), 0, ctx0, EOB_TOKEN); - int64_t best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error); + int64_t best_block_rd_cost = RDCOST(rdmult, rate0, accu_error); // int64_t best_block_rd_cost_all0 = best_block_rd_cost; - int x_prev = 1; - for (i = 0; i < eob; i++) { const int rc = scan[i]; int x = qcoeff[rc]; @@ -238,9 +214,9 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, if (x == 0) { // no need to search when x == 0 - rate0 = - get_token_bit_costs(*(token_costs_ptr + band_cur), token_tree_sel_cur, - ctx_cur, tokens[i][0].token); + int token = av1_get_token(x); + rate0 = get_token_bit_costs(*(token_costs_ptr + band_cur), + token_tree_sel_cur, ctx_cur, token); accu_rate += rate0; x_prev = 0; // accu_error does not change when x==0 @@ -249,7 +225,7 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, */ // compute the distortion for the first candidate // and the distortion for quantizing to 0. - int dx0 = (-coeff[rc]) * (1 << shift); + int dx0 = abs(coeff[rc]) * (1 << shift); #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { dx0 >>= xd->bd - 8; @@ -273,7 +249,9 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift); #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx >>= xd->bd - 8; + int dx_sign = dx < 0 ? 1 : 0; + dx = abs(dx) >> (xd->bd - 8); + if (dx_sign) dx = -dx; } #endif // CONFIG_HIGHBITDEPTH d2 = (int64_t)dx * dx; @@ -329,14 +307,16 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, if (i < default_eob - 1) { int ctx_next, token_tree_sel_next; int band_next = band_translate[i + 1]; + int token_next = + i + 1 != eob ? 
av1_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN; token_cache[rc] = av1_pt_energy_class[t0]; ctx_next = get_coef_context(nb, token_cache, i + 1); token_tree_sel_next = (x == 0); - next_bits0 = get_token_bit_costs(*(token_costs_ptr + band_next), - token_tree_sel_next, ctx_next, - tokens[i + 1][0].token); + next_bits0 = + get_token_bit_costs(*(token_costs_ptr + band_next), + token_tree_sel_next, ctx_next, token_next); next_eob_bits0 = get_token_bit_costs(*(token_costs_ptr + band_next), token_tree_sel_next, ctx_next, EOB_TOKEN); @@ -345,9 +325,9 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, ctx_next = get_coef_context(nb, token_cache, i + 1); token_tree_sel_next = (x_a == 0); - next_bits1 = get_token_bit_costs(*(token_costs_ptr + band_next), - token_tree_sel_next, ctx_next, - tokens[i + 1][0].token); + next_bits1 = + get_token_bit_costs(*(token_costs_ptr + band_next), + token_tree_sel_next, ctx_next, token_next); if (x_a != 0) { next_eob_bits1 = @@ -356,16 +336,16 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, } } - rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), d2); - rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), d2_a); + rd_cost0 = RDCOST(rdmult, (rate0 + next_bits0), d2); + rd_cost1 = RDCOST(rdmult, (rate1 + next_bits1), d2_a); best_x = (rd_cost1 < rd_cost0); - eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0), + eob_cost0 = RDCOST(rdmult, (accu_rate + rate0 + next_eob_bits0), (accu_error + d2 - d0)); eob_cost1 = eob_cost0; if (x_a != 0) { - eob_cost1 = RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1), + eob_cost1 = RDCOST(rdmult, (accu_rate + rate1 + next_eob_bits1), (accu_error + d2_a - d0)); best_eob_x = (eob_cost1 < eob_cost0); } else { @@ -410,38 +390,35 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, token_cache[rc] = av1_pt_energy_class[t0]; } + assert(accu_error >= 0); x_prev = qcoeff[rc]; // determine whether to move the eob position to i+1 - int64_t best_eob_cost_i = eob_cost0; - - tokens[i][1].token = t0; - tokens[i][1].qc = x; - tokens[i][1].dqc = dqc; - - if ((x_a != 0) && (best_eob_x)) { - best_eob_cost_i = eob_cost1; - - tokens[i][1].token = t1; - tokens[i][1].qc = x_a; - tokens[i][1].dqc = dqc_a; - } + int use_a = (x_a != 0) && (best_eob_x); + int64_t best_eob_cost_i = use_a ? 
eob_cost1 : eob_cost0; if (best_eob_cost_i < best_block_rd_cost) { best_block_rd_cost = best_eob_cost_i; final_eob = i + 1; + if (use_a) { + before_best_eob_qc = x_a; + before_best_eob_dqc = dqc_a; + } else { + before_best_eob_qc = x; + before_best_eob_dqc = dqc; + } } } // if (x==0) } // for (i) assert(final_eob <= eob); if (final_eob > 0) { - assert(tokens[final_eob - 1][1].qc != 0); + assert(before_best_eob_qc != 0); i = final_eob - 1; int rc = scan[i]; - qcoeff[rc] = tokens[i][1].qc; - dqcoeff[rc] = tokens[i][1].dqc; + qcoeff[rc] = before_best_eob_qc; + dqcoeff[rc] = before_best_eob_dqc; } for (i = final_eob; i < eob; i++) { @@ -453,366 +430,19 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, mb->plane[plane].eobs[block] = final_eob; return final_eob; } - -#else // USE_GREEDY_OPTIMIZE_B - -typedef struct av1_token_state_org { - int64_t error; - int rate; - int16_t next; - int16_t token; - tran_low_t qc; - tran_low_t dqc; - uint8_t best_index; -} av1_token_state_org; - -static int optimize_b_org(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, - int block, TX_SIZE tx_size, int ctx) { - MACROBLOCKD *const xd = &mb->e_mbd; - struct macroblock_plane *const p = &mb->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; - const int ref = is_inter_block(&xd->mi[0]->mbmi); - av1_token_state_org tokens[MAX_TX_SQUARE + 1][2]; - uint8_t token_cache[MAX_TX_SQUARE]; - const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); - tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); - tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const int eob = p->eobs[block]; - const PLANE_TYPE plane_type = pd->plane_type; - const int default_eob = tx_size_2d[tx_size]; - const int16_t *const dequant_ptr = pd->dequant; - const uint8_t *const band_translate = get_band_translate(tx_size); - TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); - const SCAN_ORDER *const scan_order = - get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi)); - const int16_t *const scan = scan_order->scan; - const int16_t *const nb = scan_order->neighbors; - int dqv; - const int shift = av1_get_tx_scale(tx_size); -#if CONFIG_AOM_QM - int seg_id = xd->mi[0]->mbmi.segment_id; - const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!ref][tx_size]; -#endif -#if CONFIG_NEW_QUANT - int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type); - const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq]; -#endif // CONFIG_NEW_QUANT - int next = eob, sz = 0; - const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1; - const int64_t rddiv = mb->rddiv; - int64_t rd_cost0, rd_cost1; - int rate0, rate1; - int64_t error0, error1; - int16_t t0, t1; - int best, band = (eob < default_eob) ? band_translate[eob] - : band_translate[eob - 1]; - int pt, i, final_eob; - const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd); - unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = - mb->token_costs[txsize_sqr_map[tx_size]][plane_type][ref]; - const uint16_t *band_counts = &band_count_table[tx_size][band]; - uint16_t band_left = eob - band_cum_count_table[tx_size][band] + 1; - int shortcut = 0; - int next_shortcut = 0; - -#if CONFIG_EXT_DELTA_Q - const int qindex = cm->seg.enabled - ? 
av1_get_qindex(&cm->seg, xd->mi[0]->mbmi.segment_id, - cm->base_qindex) - : cm->base_qindex; - assert(qindex > 0); - (void)qindex; -#else - assert(mb->qindex > 0); -#endif - - token_costs += band; - - assert((!plane_type && !plane) || (plane_type && plane)); - assert(eob <= default_eob); - - /* Now set up a Viterbi trellis to evaluate alternative roundings. */ - /* Initialize the sentinel node of the trellis. */ - tokens[eob][0].rate = 0; - tokens[eob][0].error = 0; - tokens[eob][0].next = default_eob; - tokens[eob][0].token = EOB_TOKEN; - tokens[eob][0].qc = 0; - tokens[eob][1] = tokens[eob][0]; - - for (i = 0; i < eob; i++) { - const int rc = scan[i]; - tokens[i][0].rate = av1_get_token_cost(qcoeff[rc], &t0, cat6_bits); - tokens[i][0].token = t0; - token_cache[rc] = av1_pt_energy_class[t0]; - } - - for (i = eob; i-- > 0;) { - int base_bits, dx; - int64_t d2; - const int rc = scan[i]; - int x = qcoeff[rc]; -#if CONFIG_AOM_QM - int iwt = iqmatrix[rc]; - dqv = dequant_ptr[rc != 0]; - dqv = ((iwt * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; -#else - dqv = dequant_ptr[rc != 0]; -#endif - next_shortcut = shortcut; - - /* Only add a trellis state for non-zero coefficients. */ - if (UNLIKELY(x)) { - error0 = tokens[next][0].error; - error1 = tokens[next][1].error; - /* Evaluate the first possibility for this state. */ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - - if (next_shortcut) { - /* Consider both possible successor states. */ - if (next < default_eob) { - pt = get_coef_context(nb, token_cache, i + 1); - rate0 += - get_token_bit_costs(*token_costs, 0, pt, tokens[next][0].token); - rate1 += - get_token_bit_costs(*token_costs, 0, pt, tokens[next][1].token); - } - UPDATE_RD_COST(); - /* And pick the best. */ - best = rd_cost1 < rd_cost0; - } else { - if (next < default_eob) { - pt = get_coef_context(nb, token_cache, i + 1); - rate0 += - get_token_bit_costs(*token_costs, 0, pt, tokens[next][0].token); - } - best = 0; - } - - dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift); -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx >>= xd->bd - 8; - } -#endif // CONFIG_HIGHBITDEPTH - d2 = (int64_t)dx * dx; - tokens[i][0].rate += (best ? rate1 : rate0); - tokens[i][0].error = d2 + (best ? error1 : error0); - tokens[i][0].next = next; - tokens[i][0].qc = x; - tokens[i][0].dqc = dqcoeff[rc]; - tokens[i][0].best_index = best; - - /* Evaluate the second possibility for this state. */ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - - // The threshold of 3 is empirically obtained. 
- if (UNLIKELY(abs(x) > 3)) { - shortcut = 0; - } else { -#if CONFIG_NEW_QUANT - shortcut = ((av1_dequant_abscoeff_nuq(abs(x), dqv, - dequant_val[band_translate[i]]) > - (abs(coeff[rc]) << shift)) && - (av1_dequant_abscoeff_nuq(abs(x) - 1, dqv, - dequant_val[band_translate[i]]) < - (abs(coeff[rc]) << shift))); -#else // CONFIG_NEW_QUANT -#if CONFIG_AOM_QM - if ((abs(x) * dequant_ptr[rc != 0] * iwt > - ((abs(coeff[rc]) << shift) << AOM_QM_BITS)) && - (abs(x) * dequant_ptr[rc != 0] * iwt < - (((abs(coeff[rc]) << shift) + dequant_ptr[rc != 0]) - << AOM_QM_BITS))) -#else - if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) && - (abs(x) * dequant_ptr[rc != 0] < - (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) -#endif // CONFIG_AOM_QM - shortcut = 1; - else - shortcut = 0; -#endif // CONFIG_NEW_QUANT - } - - if (shortcut) { - sz = -(x < 0); - x -= 2 * sz + 1; - } else { - tokens[i][1] = tokens[i][0]; - next = i; - - if (UNLIKELY(!(--band_left))) { - --band_counts; - band_left = *band_counts; - --token_costs; - } - continue; - } - - /* Consider both possible successor states. */ - if (!x) { - /* If we reduced this coefficient to zero, check to see if - * we need to move the EOB back here. - */ - t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; - t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; - base_bits = 0; - } else { - base_bits = av1_get_token_cost(x, &t0, cat6_bits); - t1 = t0; - } - - if (next_shortcut) { - if (LIKELY(next < default_eob)) { - if (t0 != EOB_TOKEN) { - token_cache[rc] = av1_pt_energy_class[t0]; - pt = get_coef_context(nb, token_cache, i + 1); - rate0 += get_token_bit_costs(*token_costs, !x, pt, - tokens[next][0].token); - } - if (t1 != EOB_TOKEN) { - token_cache[rc] = av1_pt_energy_class[t1]; - pt = get_coef_context(nb, token_cache, i + 1); - rate1 += get_token_bit_costs(*token_costs, !x, pt, - tokens[next][1].token); - } - } - - UPDATE_RD_COST(); - /* And pick the best. */ - best = rd_cost1 < rd_cost0; - } else { - // The two states in next stage are identical. - if (next < default_eob && t0 != EOB_TOKEN) { - token_cache[rc] = av1_pt_energy_class[t0]; - pt = get_coef_context(nb, token_cache, i + 1); - rate0 += - get_token_bit_costs(*token_costs, !x, pt, tokens[next][0].token); - } - best = 0; - } - -#if CONFIG_NEW_QUANT - dx = av1_dequant_coeff_nuq(x, dqv, dequant_val[band_translate[i]]) - - (coeff[rc] << shift); -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx >>= xd->bd - 8; - } -#endif // CONFIG_HIGHBITDEPTH -#else // CONFIG_NEW_QUANT -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz; - } else { - dx -= (dqv + sz) ^ sz; - } -#else - dx -= (dqv + sz) ^ sz; -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_NEW_QUANT - d2 = (int64_t)dx * dx; - - tokens[i][1].rate = base_bits + (best ? rate1 : rate0); - tokens[i][1].error = d2 + (best ? error1 : error0); - tokens[i][1].next = next; - tokens[i][1].token = best ? t1 : t0; - tokens[i][1].qc = x; - - if (x) { -#if CONFIG_NEW_QUANT - tokens[i][1].dqc = av1_dequant_abscoeff_nuq( - abs(x), dqv, dequant_val[band_translate[i]]); - tokens[i][1].dqc = shift ? 
ROUND_POWER_OF_TWO(tokens[i][1].dqc, shift) - : tokens[i][1].dqc; - if (sz) tokens[i][1].dqc = -tokens[i][1].dqc; -#else - if (x < 0) - tokens[i][1].dqc = -((-x * dqv) >> shift); - else - tokens[i][1].dqc = (x * dqv) >> shift; -#endif // CONFIG_NEW_QUANT - } else { - tokens[i][1].dqc = 0; - } - - tokens[i][1].best_index = best; - /* Finally, make this the new head of the trellis. */ - next = i; - } else { - /* There's no choice to make for a zero coefficient, so we don't - * add a new trellis node, but we do need to update the costs. - */ - t0 = tokens[next][0].token; - t1 = tokens[next][1].token; - pt = get_coef_context(nb, token_cache, i + 1); - /* Update the cost of each path if we're past the EOB token. */ - if (t0 != EOB_TOKEN) { - tokens[next][0].rate += get_token_bit_costs(*token_costs, 1, pt, t0); - tokens[next][0].token = ZERO_TOKEN; - } - if (t1 != EOB_TOKEN) { - tokens[next][1].rate += get_token_bit_costs(*token_costs, 1, pt, t1); - tokens[next][1].token = ZERO_TOKEN; - } - tokens[i][0].best_index = tokens[i][1].best_index = 0; - shortcut = (tokens[next][0].rate != tokens[next][1].rate); - /* Don't update next, because we didn't add a new node. */ - } - - if (UNLIKELY(!(--band_left))) { - --band_counts; - band_left = *band_counts; - --token_costs; - } - } - - /* Now pick the best path through the whole trellis. */ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - error0 = tokens[next][0].error; - error1 = tokens[next][1].error; - t0 = tokens[next][0].token; - t1 = tokens[next][1].token; - rate0 += get_token_bit_costs(*token_costs, 0, ctx, t0); - rate1 += get_token_bit_costs(*token_costs, 0, ctx, t1); - UPDATE_RD_COST(); - best = rd_cost1 < rd_cost0; - - final_eob = -1; - - for (i = next; i < eob; i = next) { - const int x = tokens[i][best].qc; - const int rc = scan[i]; - if (x) final_eob = i; - qcoeff[rc] = x; - dqcoeff[rc] = tokens[i][best].dqc; - - next = tokens[i][best].next; - best = tokens[i][best].best_index; - } - final_eob++; - - mb->plane[plane].eobs[block] = final_eob; - assert(final_eob <= default_eob); - return final_eob; -} - -#endif // USE_GREEDY_OPTIMIZE_B #endif // !CONFIG_LV_MAP -int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l) { +int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int blk_row, + int blk_col, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { MACROBLOCKD *const xd = &mb->e_mbd; struct macroblock_plane *const p = &mb->plane[plane]; const int eob = p->eobs[block]; assert((mb->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)); if (eob == 0) return eob; if (xd->lossless[xd->mi[0]->mbmi.segment_id]) return eob; + #if CONFIG_PVQ (void)cm; (void)tx_size; @@ -823,26 +453,24 @@ int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, #if !CONFIG_LV_MAP (void)plane_bsize; + (void)blk_row; + (void)blk_col; #if CONFIG_VAR_TX int ctx = get_entropy_context(tx_size, a, l); #else int ctx = combine_entropy_contexts(*a, *l); -#endif - -#if USE_GREEDY_OPTIMIZE_B - return optimize_b_greedy(cm, mb, plane, block, tx_size, ctx); -#else // USE_GREEDY_OPTIMIZE_B - return optimize_b_org(cm, mb, plane, block, tx_size, ctx); -#endif // USE_GREEDY_OPTIMIZE_B +#endif // CONFIG_VAR_TX + return optimize_b_greedy(cm, mb, plane, blk_row, blk_col, block, tx_size, + ctx); #else // !CONFIG_LV_MAP TXB_CTX txb_ctx; 
get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); - return av1_optimize_txb(cm, mb, plane, block, tx_size, &txb_ctx); + return av1_optimize_txb(cm, mb, plane, blk_row, blk_col, block, tx_size, + &txb_ctx); #endif // !CONFIG_LV_MAP } #if !CONFIG_PVQ -#if CONFIG_HIGHBITDEPTH typedef enum QUANT_FUNC { QUANT_FUNC_LOWBD = 0, QUANT_FUNC_HIGHBD = 1, @@ -862,29 +490,12 @@ static AV1_QUANT_FACADE #endif // !CONFIG_NEW_QUANT { NULL, NULL } }; +#endif // !CONFIG_PVQ -#else - -typedef enum QUANT_FUNC { - QUANT_FUNC_LOWBD = 0, - QUANT_FUNC_TYPES = 1 -} QUANT_FUNC; - -static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES] - [QUANT_FUNC_TYPES] = { -#if !CONFIG_NEW_QUANT - { av1_quantize_fp_facade }, - { av1_quantize_b_facade }, - { av1_quantize_dc_facade }, -#else // !CONFIG_NEW_QUANT - { av1_quantize_fp_nuq_facade }, - { av1_quantize_b_nuq_facade }, - { av1_quantize_dc_nuq_facade }, -#endif // !CONFIG_NEW_QUANT - { NULL } - }; -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_PVQ +typedef void (*fwdTxfmFunc)(const int16_t *diff, tran_low_t *coeff, int stride, + TxfmParam *txfm_param); +static const fwdTxfmFunc fwd_txfm_func[2] = { av1_fwd_txfm, + av1_highbd_fwd_txfm }; void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, @@ -892,7 +503,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, AV1_XFORM_QUANT xform_quant_idx) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; -#if !(CONFIG_PVQ || CONFIG_DAALA_DIST) +#if !(CONFIG_PVQ || CONFIG_DIST_8X8) const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; #else @@ -900,9 +511,14 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, struct macroblockd_plane *const pd = &xd->plane[plane]; #endif PLANE_TYPE plane_type = get_plane_type(plane); - TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + TX_TYPE tx_type = + av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); + +#if CONFIG_AOM_QM || CONFIG_NEW_QUANT const int is_inter = is_inter_block(mbmi); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, is_inter); +#endif + + const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); @@ -910,19 +526,28 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, const int diff_stride = block_size_wide[plane_bsize]; #if CONFIG_AOM_QM int seg_id = mbmi->segment_id; - const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][!is_inter][tx_size]; - const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!is_inter][tx_size]; + // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms + const qm_val_t *qmatrix = + IS_2D_TRANSFORM(tx_type) ? pd->seg_qmatrix[seg_id][!is_inter][tx_size] + : cm->gqmatrix[NUM_QM_LEVELS - 1][0][0][tx_size]; + const qm_val_t *iqmatrix = + IS_2D_TRANSFORM(tx_type) + ? 
pd->seg_iqmatrix[seg_id][!is_inter][tx_size] + : cm->giqmatrix[NUM_QM_LEVELS - 1][0][0][tx_size]; #endif - FWD_TXFM_PARAM fwd_txfm_param; + TxfmParam txfm_param; -#if CONFIG_PVQ || CONFIG_DAALA_DIST +#if CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT || CONFIG_MRC_TX uint8_t *dst; - int16_t *pred; const int dst_stride = pd->dst.stride; - int tx_blk_size; +#if CONFIG_PVQ || CONFIG_DIST_8X8 + int16_t *pred; + const int txw = tx_size_wide[tx_size]; + const int txh = tx_size_high[tx_size]; int i, j; #endif +#endif #if !CONFIG_PVQ const int tx2d_size = tx_size_2d[tx_size]; @@ -960,79 +585,68 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, src_int16 = &p->src_int16[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; - // transform block size in pixels - tx_blk_size = tx_size_wide[tx_size]; #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < tx_blk_size; j++) - for (i = 0; i < tx_blk_size; i++) + for (j = 0; j < txh; j++) + for (i = 0; i < txw; i++) src_int16[diff_stride * j + i] = CONVERT_TO_SHORTPTR(src)[src_stride * j + i]; } else { #endif // CONFIG_HIGHBITDEPTH - for (j = 0; j < tx_blk_size; j++) - for (i = 0; i < tx_blk_size; i++) + for (j = 0; j < txh; j++) + for (i = 0; i < txw; i++) src_int16[diff_stride * j + i] = src[src_stride * j + i]; #if CONFIG_HIGHBITDEPTH } #endif // CONFIG_HIGHBITDEPTH #endif -#if CONFIG_PVQ || CONFIG_DAALA_DIST +#if CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT || CONFIG_MRC_TX dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; +#if CONFIG_PVQ || CONFIG_DIST_8X8 pred = &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; - // transform block size in pixels - tx_blk_size = tx_size_wide[tx_size]; - // copy uint8 orig and predicted block to int16 buffer // in order to use existing VP10 transform functions #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - for (j = 0; j < tx_blk_size; j++) - for (i = 0; i < tx_blk_size; i++) + for (j = 0; j < txh; j++) + for (i = 0; i < txw; i++) pred[diff_stride * j + i] = CONVERT_TO_SHORTPTR(dst)[dst_stride * j + i]; } else { #endif // CONFIG_HIGHBITDEPTH - for (j = 0; j < tx_blk_size; j++) - for (i = 0; i < tx_blk_size; i++) + for (j = 0; j < txh; j++) + for (i = 0; i < txw; i++) pred[diff_stride * j + i] = dst[dst_stride * j + i]; #if CONFIG_HIGHBITDEPTH } #endif // CONFIG_HIGHBITDEPTH -#endif +#endif // CONFIG_PVQ || CONFIG_DIST_8X8 +#endif // CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT || CONFIG_MRC_TX (void)ctx; - fwd_txfm_param.tx_type = tx_type; - fwd_txfm_param.tx_size = tx_size; - fwd_txfm_param.lossless = xd->lossless[mbmi->segment_id]; + txfm_param.tx_type = tx_type; + txfm_param.tx_size = tx_size; + txfm_param.lossless = xd->lossless[mbmi->segment_id]; +#if CONFIG_MRC_TX || CONFIG_LGT + txfm_param.dst = dst; + txfm_param.stride = dst_stride; +#endif // CONFIG_MRC_TX || CONFIG_LGT +#if CONFIG_LGT + txfm_param.is_inter = is_inter_block(mbmi); + txfm_param.mode = get_prediction_mode(xd->mi[0], plane, tx_size, block); +#endif #if !CONFIG_PVQ -#if CONFIG_HIGHBITDEPTH - fwd_txfm_param.bd = xd->bd; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param); - if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) { - if (LIKELY(!x->skip_block)) { - quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD]( - coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam); - } else { - av1_quantize_skip(tx2d_size, qcoeff, 
dqcoeff, eob); - } - } -#if CONFIG_LV_MAP - p->txb_entropy_ctx[block] = - (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob); -#endif // CONFIG_LV_MAP - return; - } -#endif // CONFIG_HIGHBITDEPTH - av1_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param); + txfm_param.bd = xd->bd; + const int is_hbd = get_bitdepth_data_path_index(xd); + fwd_txfm_func[is_hbd](src_diff, coeff, diff_stride, &txfm_param); + if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) { if (LIKELY(!x->skip_block)) { - quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD]( + quant_func_list[xform_quant_idx][is_hbd]( coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam); } else { av1_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob); @@ -1042,17 +656,18 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, p->txb_entropy_ctx[block] = (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob); #endif // CONFIG_LV_MAP -#else // #if !CONFIG_PVQ + return; +#else // CONFIG_PVQ (void)xform_quant_idx; #if CONFIG_HIGHBITDEPTH - fwd_txfm_param.bd = xd->bd; + txfm_param.bd = xd->bd; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - av1_highbd_fwd_txfm(src_int16, coeff, diff_stride, &fwd_txfm_param); - av1_highbd_fwd_txfm(pred, ref_coeff, diff_stride, &fwd_txfm_param); + av1_highbd_fwd_txfm(src_int16, coeff, diff_stride, &txfm_param); + av1_highbd_fwd_txfm(pred, ref_coeff, diff_stride, &txfm_param); } else { #endif - av1_fwd_txfm(src_int16, coeff, diff_stride, &fwd_txfm_param); - av1_fwd_txfm(pred, ref_coeff, diff_stride, &fwd_txfm_param); + av1_fwd_txfm(src_int16, coeff, diff_stride, &txfm_param); + av1_fwd_txfm(pred, ref_coeff, diff_stride, &txfm_param); #if CONFIG_HIGHBITDEPTH } #endif @@ -1130,7 +745,8 @@ static void encode_block(int plane, int block, int blk_row, int blk_col, #endif #if !CONFIG_PVQ - av1_optimize_b(cm, x, plane, block, plane_bsize, tx_size, a, l); + av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, a, + l); av1_set_txb_context(x, plane, block, tx_size, a, l); @@ -1143,9 +759,16 @@ static void encode_block(int plane, int block, int blk_row, int blk_col, if (x->pvq_skip[plane]) return; #endif - TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block, tx_size); + TX_TYPE tx_type = + av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col, block, tx_size); +#if CONFIG_LGT + PREDICTION_MODE mode = get_prediction_mode(xd->mi[0], plane, tx_size, block); + av1_inverse_transform_block(xd, dqcoeff, mode, tx_type, tx_size, dst, + pd->dst.stride, p->eobs[block]); +#else av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, dst, pd->dst.stride, p->eobs[block]); +#endif } #if CONFIG_VAR_TX @@ -1174,16 +797,32 @@ static void encode_block_inter(int plane, int block, int blk_row, int blk_col, encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); } else { assert(tx_size < TX_SIZES_ALL); +#if CONFIG_RECT_TX_EXT + int is_qttx = plane_tx_size == quarter_txsize_lookup[plane_bsize]; + const TX_SIZE sub_txs = is_qttx ? plane_tx_size : sub_tx_size_map[tx_size]; + if (is_qttx) assert(blk_row == 0 && blk_col == 0 && block == 0); +#else const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; assert(sub_txs < tx_size); +#endif // This is the square transform block partition entry point. int bsl = tx_size_wide_unit[sub_txs]; int i; assert(bsl > 0); for (i = 0; i < 4; ++i) { +#if CONFIG_RECT_TX_EXT + int is_wide_tx = tx_size_wide_unit[sub_txs] > tx_size_high_unit[sub_txs]; + const int offsetr = + is_qttx ? (is_wide_tx ? 
i * tx_size_high_unit[sub_txs] : 0) + : blk_row + ((i >> 1) * bsl); + const int offsetc = + is_qttx ? (is_wide_tx ? 0 : i * tx_size_wide_unit[sub_txs]) + : blk_col + ((i & 0x01) * bsl); +#else const int offsetr = blk_row + ((i >> 1) * bsl); const int offsetc = blk_col + ((i & 0x01) * bsl); +#endif int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; @@ -1211,6 +850,7 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col, struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + TxfmParam txfm_param; uint8_t *dst; int ctx = 0; dst = &pd->dst @@ -1246,22 +886,20 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col, #endif // CONFIG_HIGHBITDEPTH } #endif // !CONFIG_PVQ + txfm_param.bd = xd->bd; + txfm_param.tx_type = DCT_DCT; + txfm_param.eob = p->eobs[block]; + txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id]; #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - if (xd->lossless[xd->mi[0]->mbmi.segment_id]) { - av1_highbd_iwht4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], - xd->bd); - } else { - av1_highbd_idct4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], - xd->bd); - } + av1_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, &txfm_param); return; } #endif // CONFIG_HIGHBITDEPTH if (xd->lossless[xd->mi[0]->mbmi.segment_id]) { - av1_iwht4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + av1_iwht4x4_add(dqcoeff, dst, pd->dst.stride, &txfm_param); } else { - av1_idct4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + av1_idct4x4_add(dqcoeff, dst, pd->dst.stride, &txfm_param); } } } @@ -1316,7 +954,7 @@ void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, av1_get_entropy_contexts(bsize, 0, pd, ctx.ta[plane], ctx.tl[plane]); #else const struct macroblockd_plane *const pd = &xd->plane[plane]; - const TX_SIZE tx_size = get_tx_size(plane, xd); + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); av1_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); #endif @@ -1327,11 +965,27 @@ void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, arg.tl = ctx.tl[plane]; #if CONFIG_VAR_TX - for (idy = 0; idy < mi_height; idy += bh) { - for (idx = 0; idx < mi_width; idx += bw) { - encode_block_inter(plane, block, idy, idx, plane_bsize, max_tx_size, - &arg); - block += step; + const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd); + int mu_blocks_wide = + block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; + int mu_blocks_high = + block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; + + mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide); + mu_blocks_high = AOMMIN(mi_height, mu_blocks_high); + + for (idy = 0; idy < mi_height; idy += mu_blocks_high) { + for (idx = 0; idx < mi_width; idx += mu_blocks_wide) { + int blk_row, blk_col; + const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height); + const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width); + for (blk_row = idy; blk_row < unit_height; blk_row += bh) { + for (blk_col = idx; blk_col < unit_width; blk_col += bw) { + encode_block_inter(plane, block, blk_row, blk_col, plane_bsize, + max_tx_size, &arg); + block += step; + } + } } } #else @@ -1357,7 +1011,7 @@ void av1_encode_sb_supertx(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) { 
#if CONFIG_VAR_TX const TX_SIZE tx_size = TX_4X4; #else - const TX_SIZE tx_size = get_tx_size(plane, xd); + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); #endif av1_subtract_plane(x, bsize, plane); av1_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); @@ -1435,6 +1089,24 @@ static void quantize_scaler(int coeff, int16_t zbin, int16_t round_value, } } +#if CONFIG_HIGHBITDEPTH +typedef void (*hbd_dpcm_fwd_tx_func)(const int16_t *input, int stride, + TX_TYPE_1D tx_type, tran_low_t *output, + int dir); + +static hbd_dpcm_fwd_tx_func get_hbd_dpcm_fwd_tx_func(int tx_length) { + switch (tx_length) { + case 4: return av1_hbd_dpcm_ft4_c; + case 8: return av1_hbd_dpcm_ft8_c; + case 16: return av1_hbd_dpcm_ft16_c; + case 32: + return av1_hbd_dpcm_ft32_c; + // TODO(huisu): add support for TX_64X64. + default: assert(0); return NULL; + } +} +#endif // CONFIG_HIGHBITDEPTH + typedef void (*dpcm_fwd_tx_func)(const int16_t *input, int stride, TX_TYPE_1D tx_type, tran_low_t *output); @@ -1539,7 +1211,7 @@ static void hbd_process_block_dpcm_vert( int16_t *src_diff, int diff_stride, tran_low_t *coeff, tran_low_t *qcoeff, tran_low_t *dqcoeff) { const int tx1d_width = tx_size_wide[tx_size]; - dpcm_fwd_tx_func forward_tx = get_dpcm_fwd_tx_func(tx1d_width); + hbd_dpcm_fwd_tx_func forward_tx = get_hbd_dpcm_fwd_tx_func(tx1d_width); hbd_dpcm_inv_txfm_add_func inverse_tx = av1_get_hbd_dpcm_inv_txfm_add_func(tx1d_width); uint16_t *src = CONVERT_TO_SHORTPTR(src8); @@ -1553,7 +1225,7 @@ static void hbd_process_block_dpcm_vert( // Subtraction. for (int c = 0; c < tx1d_width; ++c) src_diff[c] = src[c] - dst[c]; // Forward transform. - forward_tx(src_diff, 1, tx_type_1d, coeff); + forward_tx(src_diff, 1, tx_type_1d, coeff, 1); // Quantization. for (int c = 0; c < tx1d_width; ++c) { quantize_scaler(coeff[c], p->zbin[q_idx], p->round[q_idx], @@ -1562,7 +1234,7 @@ static void hbd_process_block_dpcm_vert( q_idx = 1; } // Inverse transform. - inverse_tx(dqcoeff, 1, tx_type_1d, bd, dst); + inverse_tx(dqcoeff, 1, tx_type_1d, bd, dst, 1); // Move to the next row. coeff += tx1d_width; qcoeff += tx1d_width; @@ -1580,7 +1252,7 @@ static void hbd_process_block_dpcm_horz( int16_t *src_diff, int diff_stride, tran_low_t *coeff, tran_low_t *qcoeff, tran_low_t *dqcoeff) { const int tx1d_height = tx_size_high[tx_size]; - dpcm_fwd_tx_func forward_tx = get_dpcm_fwd_tx_func(tx1d_height); + hbd_dpcm_fwd_tx_func forward_tx = get_hbd_dpcm_fwd_tx_func(tx1d_height); hbd_dpcm_inv_txfm_add_func inverse_tx = av1_get_hbd_dpcm_inv_txfm_add_func(tx1d_height); uint16_t *src = CONVERT_TO_SHORTPTR(src8); @@ -1597,7 +1269,7 @@ static void hbd_process_block_dpcm_horz( } // Forward transform. tran_low_t tx_buff[64]; - forward_tx(src_diff, diff_stride, tx_type_1d, tx_buff); + forward_tx(src_diff, diff_stride, tx_type_1d, tx_buff, 0); for (int r = 0; r < tx1d_height; ++r) coeff[r * tx1d_width] = tx_buff[r]; // Quantization. for (int r = 0; r < tx1d_height; ++r) { @@ -1609,7 +1281,7 @@ static void hbd_process_block_dpcm_horz( } // Inverse transform. for (int r = 0; r < tx1d_height; ++r) tx_buff[r] = dqcoeff[r * tx1d_width]; - inverse_tx(tx_buff, dst_stride, tx_type_1d, bd, dst); + inverse_tx(tx_buff, dst_stride, tx_type_1d, bd, dst, 0); // Move to the next column. 
++coeff, ++qcoeff, ++dqcoeff, ++src_diff, ++dst, ++src; } @@ -1631,7 +1303,8 @@ void av1_encode_block_intra_dpcm(const AV1_COMMON *cm, MACROBLOCK *x, const int dst_stride = pd->dst.stride; const int tx1d_width = tx_size_wide[tx_size]; const int tx1d_height = tx_size_high[tx_size]; - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, 0); + const SCAN_ORDER *const scan_order = + get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); uint8_t *dst = @@ -1711,30 +1384,20 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, struct macroblockd_plane *const pd = &xd->plane[plane]; tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); PLANE_TYPE plane_type = get_plane_type(plane); - const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + const TX_TYPE tx_type = + av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); uint16_t *eob = &p->eobs[block]; const int dst_stride = pd->dst.stride; uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; -#if CONFIG_CFL - -#if CONFIG_EC_ADAPT - FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; -#else - FRAME_CONTEXT *const ec_ctx = cm->fc; -#endif // CONFIG_EC_ADAPT - av1_predict_intra_block_encoder_facade(x, ec_ctx, plane, block, blk_col, - blk_row, tx_size, plane_bsize); -#else av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size); -#endif +#if CONFIG_DPCM_INTRA || CONFIG_LGT + const PREDICTION_MODE mode = + get_prediction_mode(xd->mi[0], plane, tx_size, block); #if CONFIG_DPCM_INTRA - const int block_raster_idx = av1_block_index_to_raster_order(tx_size, block); const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const PREDICTION_MODE mode = - (plane == 0) ? 
get_y_mode(xd->mi[0], block_raster_idx) : mbmi->uv_mode; if (av1_use_dpcm_intra(plane, mode, tx_type, mbmi)) { av1_encode_block_intra_dpcm(cm, x, mode, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type, args->ta, @@ -1742,6 +1405,7 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, return; } #endif // CONFIG_DPCM_INTRA +#endif // CONFIG_DPCM_INTRA || CONFIG_LGT av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); @@ -1751,7 +1415,8 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, if (args->enable_optimize_b) { av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, ctx, AV1_XFORM_QUANT_FP); - av1_optimize_b(cm, x, plane, block, plane_bsize, tx_size, a, l); + av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, + a, l); } else { av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, ctx, AV1_XFORM_QUANT_B); @@ -1763,220 +1428,25 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, if (x->pvq_skip[plane]) return; #endif // CONFIG_PVQ - av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, dst, dst_stride, - *eob); + av1_inverse_transform_block(xd, dqcoeff, +#if CONFIG_LGT + mode, +#endif + tx_type, tx_size, dst, dst_stride, *eob); #if !CONFIG_PVQ if (*eob) *(args->skip) = 0; #else // Note : *(args->skip) == mbmi->skip #endif #if CONFIG_CFL - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; if (plane == AOM_PLANE_Y && x->cfl_store_y) { - cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size); - } - - if (mbmi->uv_mode == DC_PRED) { - // TODO(ltrudeau) find a cleaner way to detect last transform block - if (plane == AOM_PLANE_U) { - xd->cfl->num_tx_blk[CFL_PRED_U] = - (blk_row == 0 && blk_col == 0) ? 1 - : xd->cfl->num_tx_blk[CFL_PRED_U] + 1; - } - - if (plane == AOM_PLANE_V) { - xd->cfl->num_tx_blk[CFL_PRED_V] = - (blk_row == 0 && blk_col == 0) ? 1 - : xd->cfl->num_tx_blk[CFL_PRED_V] + 1; - - if (mbmi->skip && - xd->cfl->num_tx_blk[CFL_PRED_U] == xd->cfl->num_tx_blk[CFL_PRED_V]) { - assert(plane_bsize != BLOCK_INVALID); - const int block_width = block_size_wide[plane_bsize]; - const int block_height = block_size_high[plane_bsize]; - - // if SKIP is chosen at the block level, and ind != 0, we must change - // the prediction - if (mbmi->cfl_alpha_idx != 0) { - const struct macroblockd_plane *const pd_cb = &xd->plane[AOM_PLANE_U]; - uint8_t *const dst_cb = pd_cb->dst.buf; - const int dst_stride_cb = pd_cb->dst.stride; - uint8_t *const dst_cr = pd->dst.buf; - const int dst_stride_cr = pd->dst.stride; - for (int j = 0; j < block_height; j++) { - for (int i = 0; i < block_width; i++) { - dst_cb[dst_stride_cb * j + i] = - (uint8_t)(xd->cfl->dc_pred[CFL_PRED_U] + 0.5); - dst_cr[dst_stride_cr * j + i] = - (uint8_t)(xd->cfl->dc_pred[CFL_PRED_V] + 0.5); - } - } - mbmi->cfl_alpha_idx = 0; - mbmi->cfl_alpha_signs[CFL_PRED_U] = CFL_SIGN_POS; - mbmi->cfl_alpha_signs[CFL_PRED_V] = CFL_SIGN_POS; - } - } - } + // TODO (ltrudeau) Store sub-8x8 inter blocks when bottom right block is + // intra predicted. 
+ cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size, plane_bsize); } #endif } -#if CONFIG_CFL -static int cfl_alpha_dist(const uint8_t *y_pix, int y_stride, double y_avg, - const uint8_t *src, int src_stride, int blk_width, - int blk_height, double dc_pred, double alpha, - int *dist_neg_out) { - const double dc_pred_bias = dc_pred + 0.5; - int dist = 0; - int diff; - - if (alpha == 0.0) { - const int dc_pred_i = (int)dc_pred_bias; - for (int j = 0; j < blk_height; j++) { - for (int i = 0; i < blk_width; i++) { - diff = src[i] - dc_pred_i; - dist += diff * diff; - } - src += src_stride; - } - - if (dist_neg_out) *dist_neg_out = dist; - - return dist; - } - - int dist_neg = 0; - for (int j = 0; j < blk_height; j++) { - for (int i = 0; i < blk_width; i++) { - const double scaled_luma = alpha * (y_pix[i] - y_avg); - const int uv = src[i]; - diff = uv - (int)(scaled_luma + dc_pred_bias); - dist += diff * diff; - diff = uv + (int)(scaled_luma - dc_pred_bias); - dist_neg += diff * diff; - } - y_pix += y_stride; - src += src_stride; - } - - if (dist_neg_out) *dist_neg_out = dist_neg; - - return dist; -} - -static int cfl_compute_alpha_ind(MACROBLOCK *const x, const CFL_CTX *const cfl, - BLOCK_SIZE bsize, - CFL_SIGN_TYPE signs_out[CFL_SIGNS]) { - const struct macroblock_plane *const p_u = &x->plane[AOM_PLANE_U]; - const struct macroblock_plane *const p_v = &x->plane[AOM_PLANE_V]; - const uint8_t *const src_u = p_u->src.buf; - const uint8_t *const src_v = p_v->src.buf; - const int src_stride_u = p_u->src.stride; - const int src_stride_v = p_v->src.stride; - const int block_width = block_size_wide[bsize]; - const int block_height = block_size_high[bsize]; - const double dc_pred_u = cfl->dc_pred[CFL_PRED_U]; - const double dc_pred_v = cfl->dc_pred[CFL_PRED_V]; - - // Temporary pixel buffer used to store the CfL prediction when we compute the - // alpha index. 
- uint8_t tmp_pix[MAX_SB_SQUARE]; - // Load CfL Prediction over the entire block - const double y_avg = - cfl_load(cfl, tmp_pix, MAX_SB_SIZE, 0, 0, block_width, block_height); - - int sse[CFL_PRED_PLANES][CFL_MAGS_SIZE]; - sse[CFL_PRED_U][0] = - cfl_alpha_dist(tmp_pix, MAX_SB_SIZE, y_avg, src_u, src_stride_u, - block_width, block_height, dc_pred_u, 0, NULL); - sse[CFL_PRED_V][0] = - cfl_alpha_dist(tmp_pix, MAX_SB_SIZE, y_avg, src_v, src_stride_v, - block_width, block_height, dc_pred_v, 0, NULL); - for (int m = 1; m < CFL_MAGS_SIZE; m += 2) { - assert(cfl_alpha_mags[m + 1] == -cfl_alpha_mags[m]); - sse[CFL_PRED_U][m] = cfl_alpha_dist( - tmp_pix, MAX_SB_SIZE, y_avg, src_u, src_stride_u, block_width, - block_height, dc_pred_u, cfl_alpha_mags[m], &sse[CFL_PRED_U][m + 1]); - sse[CFL_PRED_V][m] = cfl_alpha_dist( - tmp_pix, MAX_SB_SIZE, y_avg, src_v, src_stride_v, block_width, - block_height, dc_pred_v, cfl_alpha_mags[m], &sse[CFL_PRED_V][m + 1]); - } - - int dist; - int64_t cost; - int64_t best_cost; - - // Compute least squares parameter of the entire block - // IMPORTANT: We assume that the first code is 0,0 - int ind = 0; - signs_out[CFL_PRED_U] = CFL_SIGN_POS; - signs_out[CFL_PRED_V] = CFL_SIGN_POS; - - dist = sse[CFL_PRED_U][0] + sse[CFL_PRED_V][0]; - dist *= 16; - best_cost = RDCOST(x->rdmult, x->rddiv, cfl->costs[0], dist); - - for (int c = 1; c < CFL_ALPHABET_SIZE; c++) { - const int idx_u = cfl_alpha_codes[c][CFL_PRED_U]; - const int idx_v = cfl_alpha_codes[c][CFL_PRED_V]; - for (CFL_SIGN_TYPE sign_u = idx_u == 0; sign_u < CFL_SIGNS; sign_u++) { - for (CFL_SIGN_TYPE sign_v = idx_v == 0; sign_v < CFL_SIGNS; sign_v++) { - dist = sse[CFL_PRED_U][idx_u + (sign_u == CFL_SIGN_NEG)] + - sse[CFL_PRED_V][idx_v + (sign_v == CFL_SIGN_NEG)]; - dist *= 16; - cost = RDCOST(x->rdmult, x->rddiv, cfl->costs[c], dist); - if (cost < best_cost) { - best_cost = cost; - ind = c; - signs_out[CFL_PRED_U] = sign_u; - signs_out[CFL_PRED_V] = sign_v; - } - } - } - } - - return ind; -} - -static inline void cfl_update_costs(CFL_CTX *cfl, FRAME_CONTEXT *ec_ctx) { - assert(ec_ctx->cfl_alpha_cdf[CFL_ALPHABET_SIZE - 1] == - AOM_ICDF(CDF_PROB_TOP)); - const int prob_den = CDF_PROB_TOP; - - int prob_num = AOM_ICDF(ec_ctx->cfl_alpha_cdf[0]); - cfl->costs[0] = av1_cost_zero(get_prob(prob_num, prob_den)); - - for (int c = 1; c < CFL_ALPHABET_SIZE; c++) { - int sign_bit_cost = (cfl_alpha_codes[c][CFL_PRED_U] != 0) + - (cfl_alpha_codes[c][CFL_PRED_V] != 0); - prob_num = AOM_ICDF(ec_ctx->cfl_alpha_cdf[c]) - - AOM_ICDF(ec_ctx->cfl_alpha_cdf[c - 1]); - cfl->costs[c] = av1_cost_zero(get_prob(prob_num, prob_den)) + - av1_cost_literal(sign_bit_cost); - } -} - -void av1_predict_intra_block_encoder_facade(MACROBLOCK *x, - FRAME_CONTEXT *ec_ctx, int plane, - int block_idx, int blk_col, - int blk_row, TX_SIZE tx_size, - BLOCK_SIZE plane_bsize) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - if (plane != AOM_PLANE_Y && mbmi->uv_mode == DC_PRED) { - if (blk_col == 0 && blk_row == 0 && plane == AOM_PLANE_U) { - CFL_CTX *const cfl = xd->cfl; - cfl_update_costs(cfl, ec_ctx); - cfl_dc_pred(xd, plane_bsize, tx_size); - mbmi->cfl_alpha_idx = - cfl_compute_alpha_ind(x, cfl, plane_bsize, mbmi->cfl_alpha_signs); - } - } - av1_predict_intra_block_facade(xd, plane, block_idx, blk_col, blk_row, - tx_size); -} -#endif - void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int plane, int enable_optimize_b, int mi_row, @@ -2001,7 +1471,7 @@ void av1_encode_intra_block_plane(AV1_COMMON 
*cm, MACROBLOCK *x, if (enable_optimize_b) { const struct macroblockd_plane *const pd = &xd->plane[plane]; - const TX_SIZE tx_size = get_tx_size(plane, xd); + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); av1_get_entropy_contexts(bsize, tx_size, pd, ta, tl); } av1_foreach_transformed_block_in_plane( diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h index 35a2c1570..65476bcae 100644 --- a/third_party/aom/av1/encoder/encodemb.h +++ b/third_party/aom/av1/encoder/encodemb.h @@ -53,9 +53,10 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int ctx, AV1_XFORM_QUANT xform_quant_idx); -int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l); +int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int blk_row, + int blk_col, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l); void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int blk_col, int blk_row, TX_SIZE tx_size); @@ -86,14 +87,6 @@ void av1_store_pvq_enc_info(PVQ_INFO *pvq_info, int *qg, int *theta, int *k, int *size, int skip_rest, int skip_dir, int bs); #endif -#if CONFIG_CFL -void av1_predict_intra_block_encoder_facade(MACROBLOCK *x, - FRAME_CONTEXT *ec_ctx, int plane, - int block_idx, int blk_col, - int blk_row, TX_SIZE tx_size, - BLOCK_SIZE plane_bsize); -#endif - #if CONFIG_DPCM_INTRA void av1_encode_block_intra_dpcm(const AV1_COMMON *cm, MACROBLOCK *x, PREDICTION_MODE mode, int plane, int block, diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c index eb0ff88c4..fd61fe6b2 100644 --- a/third_party/aom/av1/encoder/encodemv.c +++ b/third_party/aom/av1/encoder/encodemv.c @@ -31,7 +31,7 @@ void av1_entropy_mv_init(void) { } static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, - int usehp) { + MvSubpelPrecision precision) { int offset; const int sign = comp < 0; const int mag = sign ? -comp : comp; @@ -42,34 +42,53 @@ static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, assert(comp != 0); - // Sign +// Sign +#if CONFIG_NEW_MULTISYMBOL + aom_write_bit(w, sign); +#else aom_write(w, sign, mvcomp->sign); +#endif // Class aom_write_symbol(w, mv_class, mvcomp->class_cdf, MV_CLASSES); // Integer bits if (mv_class == MV_CLASS_0) { +#if CONFIG_NEW_MULTISYMBOL + aom_write_symbol(w, d, mvcomp->class0_cdf, CLASS0_SIZE); +#else aom_write(w, d, mvcomp->class0[0]); +#endif } else { int i; const int n = mv_class + CLASS0_BITS - 1; // number of bits for (i = 0; i < n; ++i) aom_write(w, (d >> i) & 1, mvcomp->bits[i]); } - // Fractional bits - aom_write_symbol( - w, fr, mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf, - MV_FP_SIZE); +// Fractional bits +#if CONFIG_INTRABC + if (precision > MV_SUBPEL_NONE) +#endif // CONFIG_INTRABC + { + aom_write_symbol(w, fr, mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] + : mvcomp->fp_cdf, + MV_FP_SIZE); + } // High precision bit - if (usehp) + if (precision > MV_SUBPEL_LOW_PRECISION) +#if CONFIG_NEW_MULTISYMBOL + aom_write_symbol( + w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, + 2); +#else aom_write(w, hp, mv_class == MV_CLASS_0 ? 
mvcomp->class0_hp : mvcomp->hp); +#endif } static void build_nmv_component_cost_table(int *mvcost, const nmv_component *const mvcomp, - int usehp) { + MvSubpelPrecision precision) { int i, v; int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE]; int bits_cost[MV_OFFSET_BITS][2]; @@ -89,7 +108,7 @@ static void build_nmv_component_cost_table(int *mvcost, av1_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], av1_mv_fp_tree); av1_cost_tokens(fp_cost, mvcomp->fp, av1_mv_fp_tree); - if (usehp) { + if (precision > MV_SUBPEL_LOW_PRECISION) { class0_hp_cost[0] = av1_cost_zero(mvcomp->class0_hp); class0_hp_cost[1] = av1_cost_one(mvcomp->class0_hp); hp_cost[0] = av1_cost_zero(mvcomp->hp); @@ -110,16 +129,21 @@ static void build_nmv_component_cost_table(int *mvcost, const int b = c + CLASS0_BITS - 1; /* number of bits */ for (i = 0; i < b; ++i) cost += bits_cost[i][((d >> i) & 1)]; } - if (c == MV_CLASS_0) { - cost += class0_fp_cost[d][f]; - } else { - cost += fp_cost[f]; - } - if (usehp) { +#if CONFIG_INTRABC + if (precision > MV_SUBPEL_NONE) +#endif // CONFIG_INTRABC + { if (c == MV_CLASS_0) { - cost += class0_hp_cost[e]; + cost += class0_fp_cost[d][f]; } else { - cost += hp_cost[e]; + cost += fp_cost[f]; + } + if (precision > MV_SUBPEL_LOW_PRECISION) { + if (c == MV_CLASS_0) { + cost += class0_hp_cost[e]; + } else { + cost += hp_cost[e]; + } } } mvcost[v] = cost + sign_cost[0]; @@ -127,36 +151,16 @@ static void build_nmv_component_cost_table(int *mvcost, } } +#if !CONFIG_NEW_MULTISYMBOL static void update_mv(aom_writer *w, const unsigned int ct[2], aom_prob *cur_p, aom_prob upd_p) { (void)upd_p; -#if CONFIG_TILE_GROUPS // Just use the default maximum number of tile groups to avoid passing in the // actual // number av1_cond_prob_diff_update(w, cur_p, ct, DEFAULT_MAX_NUM_TG); -#else - av1_cond_prob_diff_update(w, cur_p, ct, 1); -#endif } -#if !CONFIG_EC_ADAPT -static void write_mv_update(const aom_tree_index *tree, - aom_prob probs[/*n - 1*/], - const unsigned int counts[/*n - 1*/], int n, - aom_writer *w) { - int i; - unsigned int branch_ct[32][2]; - - // Assuming max number of probabilities <= 32 - assert(n <= 32); - - av1_tree_probs_from_distribution(tree, branch_ct, counts); - for (i = 0; i < n - 1; ++i) - update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB); -} -#endif - void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w, nmv_context_counts *const nmv_counts) { int i; @@ -164,34 +168,6 @@ void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w, for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) { nmv_context *const mvc = &cm->fc->nmvc[nmv_ctx]; nmv_context_counts *const counts = &nmv_counts[nmv_ctx]; -#if !CONFIG_EC_ADAPT - write_mv_update(av1_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, - w); - - for (i = 0; i < 2; ++i) { - int j; - nmv_component *comp = &mvc->comps[i]; - nmv_component_counts *comp_counts = &counts->comps[i]; - - update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB); - write_mv_update(av1_mv_class_tree, comp->classes, comp_counts->classes, - MV_CLASSES, w); - write_mv_update(av1_mv_class0_tree, comp->class0, comp_counts->class0, - CLASS0_SIZE, w); - for (j = 0; j < MV_OFFSET_BITS; ++j) - update_mv(w, comp_counts->bits[j], &comp->bits[j], MV_UPDATE_PROB); - } - - for (i = 0; i < 2; ++i) { - int j; - for (j = 0; j < CLASS0_SIZE; ++j) - write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j], - counts->comps[i].class0_fp[j], MV_FP_SIZE, w); - - write_mv_update(av1_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp, - 
MV_FP_SIZE, w); - } -#endif if (usehp) { for (i = 0; i < 2; ++i) { @@ -202,6 +178,7 @@ void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w, } } } +#endif void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, nmv_context *mvctx, int usehp) { @@ -230,18 +207,19 @@ void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS); if (mv_joint_vertical(j)) - encode_mv_component(w, diff.row, &mvctx->comps[0], 0); + encode_mv_component(w, diff.row, &mvctx->comps[0], MV_SUBPEL_NONE); if (mv_joint_horizontal(j)) - encode_mv_component(w, diff.col, &mvctx->comps[1], 0); + encode_mv_component(w, diff.col, &mvctx->comps[1], MV_SUBPEL_NONE); } #endif // CONFIG_INTRABC void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], - const nmv_context *ctx, int usehp) { + const nmv_context *ctx, + MvSubpelPrecision precision) { av1_cost_tokens(mvjoint, ctx->joints, av1_mv_joint_tree); - build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], usehp); - build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp); + build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], precision); + build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], precision); } #if CONFIG_EXT_INTER @@ -284,6 +262,27 @@ static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext, mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); nmv_context_counts *counts = &nmv_counts[nmv_ctx]; av1_inc_mv(&diff, counts, 1); +#if CONFIG_COMPOUND_SINGLEREF + } else { + assert( // mode == SR_NEAREST_NEWMV || + mode == SR_NEAR_NEWMV || mode == SR_ZERO_NEWMV || mode == SR_NEW_NEWMV); + const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv; + int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame); + int nmv_ctx = + av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type], + mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx); + nmv_context_counts *counts = &nmv_counts[nmv_ctx]; + (void)pred_mvs; + MV diff; + if (mode == SR_NEW_NEWMV) { + diff.row = mvs[0].as_mv.row - ref->row; + diff.col = mvs[0].as_mv.col - ref->col; + av1_inc_mv(&diff, counts, 1); + } + diff.row = mvs[1].as_mv.row - ref->row; + diff.col = mvs[1].as_mv.col - ref->col; + av1_inc_mv(&diff, counts, 1); +#endif // CONFIG_COMPOUND_SINGLEREF } } @@ -328,7 +327,7 @@ static void inc_mvs_sub8x8(const MODE_INFO *mi, int block, const int_mv mvs[2], av1_inc_mv(&diff, counts, 1); } } -#else +#else // !CONFIG_EXT_INTER static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext, const int_mv mvs[2], const int_mv pred_mvs[2], nmv_context_counts *nmv_counts) { diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h index 6d442147f..8689cec27 100644 --- a/third_party/aom/av1/encoder/encodemv.h +++ b/third_party/aom/av1/encoder/encodemv.h @@ -20,14 +20,17 @@ extern "C" { void av1_entropy_mv_init(void); +#if !CONFIG_NEW_MULTISYMBOL void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w, nmv_context_counts *const counts); +#endif void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, nmv_context *mvctx, int usehp); void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], - const nmv_context *mvctx, int usehp); + const nmv_context *mvctx, + MvSubpelPrecision precision); void av1_update_mv_count(ThreadData *td); diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c index 4782ce2b7..943e2c6a0 100644 --- a/third_party/aom/av1/encoder/encoder.c +++ 
b/third_party/aom/av1/encoder/encoder.c @@ -18,7 +18,6 @@ #include "av1/common/alloccommon.h" #if CONFIG_CDEF #include "av1/common/cdef.h" -#include "av1/common/clpf.h" #endif // CONFIG_CDEF #include "av1/common/filter.h" #include "av1/common/idct.h" @@ -31,6 +30,9 @@ #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/aq_variance.h" #include "av1/encoder/bitstream.h" +#if CONFIG_BGSPRITE +#include "av1/encoder/bgsprite.h" +#endif // CONFIG_BGSPRITE #if CONFIG_ANS #include "aom_dsp/buf_ans.h" #endif @@ -73,6 +75,8 @@ #if CONFIG_ENTROPY_STATS FRAME_COUNTS aggregate_fc; +// Aggregate frame counts per frame context type +FRAME_COUNTS aggregate_fc_per_type[FRAME_CONTEXTS]; #endif // CONFIG_ENTROPY_STATS #define AM_SEGMENT_ID_INACTIVE 7 @@ -421,7 +425,6 @@ void av1_initialize_enc(void) { static void dealloc_compressor_data(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; - int i; aom_free(cpi->mbmi_ext_base); cpi->mbmi_ext_base = NULL; @@ -467,10 +470,6 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { cpi->td.mb.mask_buf = NULL; #endif - // Free up-sampled reference buffers. - for (i = 0; i < (REF_FRAMES + 1); i++) - aom_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf); - av1_free_ref_frame_buffers(cm->buffer_pool); #if CONFIG_LV_MAP av1_free_txb_buf(cpi); @@ -483,8 +482,11 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { aom_free_frame_buffer(&cpi->last_frame_db); aom_free_frame_buffer(&cpi->trial_frame_rst); aom_free(cpi->extra_rstbuf); - for (i = 0; i < MAX_MB_PLANE; ++i) - av1_free_restoration_struct(&cpi->rst_search[i]); + { + int i; + for (i = 0; i < MAX_MB_PLANE; ++i) + av1_free_restoration_struct(&cpi->rst_search[i]); + } #endif // CONFIG_LOOP_RESTORATION aom_free_frame_buffer(&cpi->scaled_source); aom_free_frame_buffer(&cpi->scaled_last_source); @@ -497,8 +499,7 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { av1_free_pc_tree(&cpi->td); #if CONFIG_PALETTE - if (cpi->common.allow_screen_content_tools) - aom_free(cpi->td.mb.palette_buffer); + aom_free(cpi->td.mb.palette_buffer); #endif // CONFIG_PALETTE #if CONFIG_ANS @@ -735,13 +736,18 @@ static void alloc_util_frame_buffers(AV1_COMP *cpi) { NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate last frame deblocked buffer"); - if (aom_realloc_frame_buffer(&cpi->trial_frame_rst, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, + if (aom_realloc_frame_buffer( + &cpi->trial_frame_rst, +#if CONFIG_FRAME_SUPERRES + cm->superres_upscaled_width, cm->superres_upscaled_height, +#else + cm->width, cm->height, +#endif // CONFIG_FRAME_SUPERRES + cm->subsampling_x, cm->subsampling_y, #if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, + cm->use_highbitdepth, #endif - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, - NULL, NULL)) + AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate trial restored frame buffer"); int extra_rstbuf_sz = RESTORATION_EXTBUF_SIZE; @@ -821,93 +827,107 @@ void av1_new_framerate(AV1_COMP *cpi, double framerate) { static void set_tile_info(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; -#if CONFIG_TILE_GROUPS && CONFIG_DEPENDENT_HORZTILES +#if CONFIG_DEPENDENT_HORZTILES int tile_row, tile_col, num_tiles_in_tg; int tg_row_start, tg_col_start; #endif #if CONFIG_EXT_TILE + if (cpi->oxcf.large_scale_tile) { #if CONFIG_EXT_PARTITION - if (cpi->oxcf.superblock_size != AOM_SUPERBLOCK_SIZE_64X64) { - cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 32); - 
cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 32); - cm->tile_width <<= MAX_MIB_SIZE_LOG2; - cm->tile_height <<= MAX_MIB_SIZE_LOG2; - } else { + if (cpi->oxcf.superblock_size != AOM_SUPERBLOCK_SIZE_64X64) { + cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 32); + cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 32); + cm->tile_width <<= MAX_MIB_SIZE_LOG2; + cm->tile_height <<= MAX_MIB_SIZE_LOG2; + } else { + cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64); + cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64); + cm->tile_width <<= MAX_MIB_SIZE_LOG2 - 1; + cm->tile_height <<= MAX_MIB_SIZE_LOG2 - 1; + } +#else cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64); cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64); - cm->tile_width <<= MAX_MIB_SIZE_LOG2 - 1; - cm->tile_height <<= MAX_MIB_SIZE_LOG2 - 1; - } -#else - cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64); - cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64); - cm->tile_width <<= MAX_MIB_SIZE_LOG2; - cm->tile_height <<= MAX_MIB_SIZE_LOG2; + cm->tile_width <<= MAX_MIB_SIZE_LOG2; + cm->tile_height <<= MAX_MIB_SIZE_LOG2; #endif // CONFIG_EXT_PARTITION - cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols); - cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows); + cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols); + cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows); - assert(cm->tile_width >> MAX_MIB_SIZE <= 32); - assert(cm->tile_height >> MAX_MIB_SIZE <= 32); + assert(cm->tile_width >> MAX_MIB_SIZE <= 32); + assert(cm->tile_height >> MAX_MIB_SIZE <= 32); - // Get the number of tiles - cm->tile_cols = 1; - while (cm->tile_cols * cm->tile_width < cm->mi_cols) ++cm->tile_cols; + // Get the number of tiles + cm->tile_cols = 1; + while (cm->tile_cols * cm->tile_width < cm->mi_cols) ++cm->tile_cols; - cm->tile_rows = 1; - while (cm->tile_rows * cm->tile_height < cm->mi_rows) ++cm->tile_rows; -#else - int min_log2_tile_cols, max_log2_tile_cols; - av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + cm->tile_rows = 1; + while (cm->tile_rows * cm->tile_height < cm->mi_rows) ++cm->tile_rows; + } else { +#endif // CONFIG_EXT_TILE + int min_log2_tile_cols, max_log2_tile_cols; + av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - cm->log2_tile_cols = - clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); - cm->log2_tile_rows = cpi->oxcf.tile_rows; + cm->log2_tile_cols = + clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); + cm->log2_tile_rows = cpi->oxcf.tile_rows; - cm->tile_cols = 1 << cm->log2_tile_cols; - cm->tile_rows = 1 << cm->log2_tile_rows; + cm->tile_cols = 1 << cm->log2_tile_cols; + cm->tile_rows = 1 << cm->log2_tile_rows; - cm->tile_width = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2); - cm->tile_width >>= cm->log2_tile_cols; - cm->tile_height = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2); - cm->tile_height >>= cm->log2_tile_rows; + cm->tile_width = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2); + cm->tile_width >>= cm->log2_tile_cols; + cm->tile_height = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2); + cm->tile_height >>= cm->log2_tile_rows; - // round to integer multiples of max superblock size - cm->tile_width = ALIGN_POWER_OF_TWO(cm->tile_width, MAX_MIB_SIZE_LOG2); - cm->tile_height = ALIGN_POWER_OF_TWO(cm->tile_height, MAX_MIB_SIZE_LOG2); + // round to integer multiples of max superblock size + cm->tile_width = ALIGN_POWER_OF_TWO(cm->tile_width, MAX_MIB_SIZE_LOG2); + cm->tile_height = 
ALIGN_POWER_OF_TWO(cm->tile_height, MAX_MIB_SIZE_LOG2); +#if CONFIG_EXT_TILE + } #endif // CONFIG_EXT_TILE #if CONFIG_DEPENDENT_HORZTILES cm->dependent_horz_tiles = cpi->oxcf.dependent_horz_tiles; #if CONFIG_EXT_TILE - if (cm->tile_rows <= 1) cm->dependent_horz_tiles = 0; -#else - if (cm->log2_tile_rows == 0) cm->dependent_horz_tiles = 0; -#endif -#if CONFIG_TILE_GROUPS - if (cpi->oxcf.mtu == 0) { - cm->num_tg = cpi->oxcf.num_tile_groups; + if (cm->large_scale_tile) { + // May not needed since cpi->oxcf.dependent_horz_tiles is already adjusted. + cm->dependent_horz_tiles = 0; } else { - // Use a default value for the purposes of weighting costs in probability - // updates - cm->num_tg = DEFAULT_MAX_NUM_TG; +#endif // CONFIG_EXT_TILE + if (cm->log2_tile_rows == 0) cm->dependent_horz_tiles = 0; +#if CONFIG_EXT_TILE } - num_tiles_in_tg = - (cm->tile_cols * cm->tile_rows + cm->num_tg - 1) / cm->num_tg; - tg_row_start = 0; - tg_col_start = 0; - for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { - for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { - if ((tile_row * cm->tile_cols + tile_col) % num_tiles_in_tg == 0) { - tg_row_start = tile_row; - tg_col_start = tile_col; +#endif // CONFIG_EXT_TILE + +#if CONFIG_EXT_TILE + if (!cm->large_scale_tile) { +#endif // CONFIG_EXT_TILE + if (cpi->oxcf.mtu == 0) { + cm->num_tg = cpi->oxcf.num_tile_groups; + } else { + // Use a default value for the purposes of weighting costs in probability + // updates + cm->num_tg = DEFAULT_MAX_NUM_TG; + } + num_tiles_in_tg = + (cm->tile_cols * cm->tile_rows + cm->num_tg - 1) / cm->num_tg; + tg_row_start = 0; + tg_col_start = 0; + for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { + if ((tile_row * cm->tile_cols + tile_col) % num_tiles_in_tg == 0) { + tg_row_start = tile_row; + tg_col_start = tile_col; + } + cm->tile_group_start_row[tile_row][tile_col] = tg_row_start; + cm->tile_group_start_col[tile_row][tile_col] = tg_col_start; } - cm->tile_group_start_row[tile_row][tile_col] = tg_row_start; - cm->tile_group_start_col[tile_row][tile_col] = tg_col_start; } +#if CONFIG_EXT_TILE } -#endif +#endif // CONFIG_EXT_TILE #endif #if CONFIG_LOOPFILTERING_ACROSS_TILES @@ -965,6 +985,10 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { cm->use_highbitdepth = oxcf->use_highbitdepth; #endif cm->color_space = oxcf->color_space; +#if CONFIG_COLORSPACE_HEADERS + cm->transfer_function = oxcf->transfer_function; + cm->chroma_sample_position = oxcf->chroma_sample_position; +#endif cm->color_range = oxcf->color_range; cm->width = oxcf->width; @@ -1175,6 +1199,21 @@ MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad4x4x3) MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad4x4x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d) +#if CONFIG_EXT_PARTITION_TYPES +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d) +#endif + #if CONFIG_EXT_INTER #define HIGHBD_MBFP(BT, MCSDF, MCSVF) \ cpi->fn_ptr[BT].msdf = MCSDF; \ @@ -1223,6 +1262,13 @@ 
MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4) + +#if CONFIG_EXT_PARTITION_TYPES +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8) +#endif #endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR @@ -1266,6 +1312,13 @@ MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4) + +#if CONFIG_EXT_PARTITION_TYPES +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8) +#endif #endif // CONFIG_MOTION_VAR static void highbd_set_var_fns(AV1_COMP *const cpi) { @@ -1273,6 +1326,32 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { if (cm->use_highbitdepth) { switch (cm->bit_depth) { case AOM_BITS_8: +#if CONFIG_EXT_PARTITION_TYPES + HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits8, + aom_highbd_sad32x8_avg_bits8, aom_highbd_8_variance32x8, + aom_highbd_8_sub_pixel_variance32x8, + aom_highbd_8_sub_pixel_avg_variance32x8, NULL, NULL, + aom_highbd_sad32x8x4d_bits8) + + HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits8, + aom_highbd_sad8x32_avg_bits8, aom_highbd_8_variance8x32, + aom_highbd_8_sub_pixel_variance8x32, + aom_highbd_8_sub_pixel_avg_variance8x32, NULL, NULL, + aom_highbd_sad8x32x4d_bits8) + + HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits8, + aom_highbd_sad16x4_avg_bits8, aom_highbd_8_variance16x4, + aom_highbd_8_sub_pixel_variance16x4, + aom_highbd_8_sub_pixel_avg_variance16x4, NULL, NULL, + aom_highbd_sad16x4x4d_bits8) + + HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits8, + aom_highbd_sad4x16_avg_bits8, aom_highbd_8_variance4x16, + aom_highbd_8_sub_pixel_variance4x16, + aom_highbd_8_sub_pixel_avg_variance4x16, NULL, NULL, + aom_highbd_sad4x16x4d_bits8) +#endif + HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8, aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16, aom_highbd_8_sub_pixel_variance32x16, @@ -1354,7 +1433,7 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_8_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits8, aom_highbd_sad4x4x8_bits8, aom_highbd_sad4x4x4d_bits8) -#if CONFIG_CB4X4 +#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_8_variance2x2, NULL, NULL, NULL, NULL, NULL) HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_8_variance4x2, NULL, NULL, @@ -1420,6 +1499,19 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_8_masked_sub_pixel_variance8x4) HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8, aom_highbd_8_masked_sub_pixel_variance4x4) +#if CONFIG_EXT_PARTITION_TYPES + HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits8, + aom_highbd_8_masked_sub_pixel_variance32x8) + + HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits8, + aom_highbd_8_masked_sub_pixel_variance8x32) + + HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits8, + aom_highbd_8_masked_sub_pixel_variance16x4) + + HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits8, + aom_highbd_8_masked_sub_pixel_variance4x16) +#endif #endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR #if CONFIG_EXT_PARTITION @@ 
-1472,10 +1564,53 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits8, aom_highbd_obmc_variance4x4, aom_highbd_obmc_sub_pixel_variance4x4) +#if CONFIG_EXT_PARTITION_TYPES + HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits8, + aom_highbd_obmc_variance32x8, + aom_highbd_obmc_sub_pixel_variance32x8) + + HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits8, + aom_highbd_obmc_variance8x32, + aom_highbd_obmc_sub_pixel_variance8x32) + + HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits8, + aom_highbd_obmc_variance16x4, + aom_highbd_obmc_sub_pixel_variance16x4) + + HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits8, + aom_highbd_obmc_variance4x16, + aom_highbd_obmc_sub_pixel_variance4x16) +#endif #endif // CONFIG_MOTION_VAR break; case AOM_BITS_10: +#if CONFIG_EXT_PARTITION_TYPES + HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits10, + aom_highbd_sad32x8_avg_bits10, aom_highbd_10_variance32x8, + aom_highbd_10_sub_pixel_variance32x8, + aom_highbd_10_sub_pixel_avg_variance32x8, NULL, NULL, + aom_highbd_sad32x8x4d_bits10) + + HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits10, + aom_highbd_sad8x32_avg_bits10, aom_highbd_10_variance8x32, + aom_highbd_10_sub_pixel_variance8x32, + aom_highbd_10_sub_pixel_avg_variance8x32, NULL, NULL, + aom_highbd_sad8x32x4d_bits10) + + HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits10, + aom_highbd_sad16x4_avg_bits10, aom_highbd_10_variance16x4, + aom_highbd_10_sub_pixel_variance16x4, + aom_highbd_10_sub_pixel_avg_variance16x4, NULL, NULL, + aom_highbd_sad16x4x4d_bits10) + + HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits10, + aom_highbd_sad4x16_avg_bits10, aom_highbd_10_variance4x16, + aom_highbd_10_sub_pixel_variance4x16, + aom_highbd_10_sub_pixel_avg_variance4x16, NULL, NULL, + aom_highbd_sad4x16x4d_bits10) +#endif + HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10, aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16, aom_highbd_10_sub_pixel_variance32x16, @@ -1559,7 +1694,7 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_10_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits10, aom_highbd_sad4x4x8_bits10, aom_highbd_sad4x4x4d_bits10) -#if CONFIG_CB4X4 +#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_10_variance2x2, NULL, NULL, NULL, NULL, NULL) HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_10_variance4x2, NULL, NULL, @@ -1627,6 +1762,19 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_10_masked_sub_pixel_variance8x4) HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10, aom_highbd_10_masked_sub_pixel_variance4x4) +#if CONFIG_EXT_PARTITION_TYPES + HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits10, + aom_highbd_10_masked_sub_pixel_variance32x8) + + HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits10, + aom_highbd_10_masked_sub_pixel_variance8x32) + + HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits10, + aom_highbd_10_masked_sub_pixel_variance16x4) + + HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits10, + aom_highbd_10_masked_sub_pixel_variance4x16) +#endif #endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR #if CONFIG_EXT_PARTITION @@ -1679,10 +1827,53 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits10, aom_highbd_10_obmc_variance4x4, aom_highbd_10_obmc_sub_pixel_variance4x4) +#if CONFIG_EXT_PARTITION_TYPES + HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits10, + aom_highbd_10_obmc_variance32x8, + 
aom_highbd_10_obmc_sub_pixel_variance32x8) + + HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits10, + aom_highbd_10_obmc_variance8x32, + aom_highbd_10_obmc_sub_pixel_variance8x32) + + HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits10, + aom_highbd_10_obmc_variance16x4, + aom_highbd_10_obmc_sub_pixel_variance16x4) + + HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits10, + aom_highbd_10_obmc_variance4x16, + aom_highbd_10_obmc_sub_pixel_variance4x16) +#endif #endif // CONFIG_MOTION_VAR break; case AOM_BITS_12: +#if CONFIG_EXT_PARTITION_TYPES + HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits12, + aom_highbd_sad32x8_avg_bits12, aom_highbd_12_variance32x8, + aom_highbd_12_sub_pixel_variance32x8, + aom_highbd_12_sub_pixel_avg_variance32x8, NULL, NULL, + aom_highbd_sad32x8x4d_bits12) + + HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits12, + aom_highbd_sad8x32_avg_bits12, aom_highbd_12_variance8x32, + aom_highbd_12_sub_pixel_variance8x32, + aom_highbd_12_sub_pixel_avg_variance8x32, NULL, NULL, + aom_highbd_sad8x32x4d_bits12) + + HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits12, + aom_highbd_sad16x4_avg_bits12, aom_highbd_12_variance16x4, + aom_highbd_12_sub_pixel_variance16x4, + aom_highbd_12_sub_pixel_avg_variance16x4, NULL, NULL, + aom_highbd_sad16x4x4d_bits12) + + HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits12, + aom_highbd_sad4x16_avg_bits12, aom_highbd_12_variance4x16, + aom_highbd_12_sub_pixel_variance4x16, + aom_highbd_12_sub_pixel_avg_variance4x16, NULL, NULL, + aom_highbd_sad4x16x4d_bits12) +#endif + HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12, aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16, aom_highbd_12_sub_pixel_variance32x16, @@ -1766,7 +1957,7 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_12_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits12, aom_highbd_sad4x4x8_bits12, aom_highbd_sad4x4x4d_bits12) -#if CONFIG_CB4X4 +#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 HIGHBD_BFP(BLOCK_2X2, NULL, NULL, aom_highbd_12_variance2x2, NULL, NULL, NULL, NULL, NULL) HIGHBD_BFP(BLOCK_4X2, NULL, NULL, aom_highbd_12_variance4x2, NULL, NULL, @@ -1834,6 +2025,19 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_12_masked_sub_pixel_variance8x4) HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12, aom_highbd_12_masked_sub_pixel_variance4x4) +#if CONFIG_EXT_PARTITION_TYPES + HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits12, + aom_highbd_12_masked_sub_pixel_variance32x8) + + HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits12, + aom_highbd_12_masked_sub_pixel_variance8x32) + + HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits12, + aom_highbd_12_masked_sub_pixel_variance16x4) + + HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits12, + aom_highbd_12_masked_sub_pixel_variance4x16) +#endif #endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR @@ -1887,6 +2091,23 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits12, aom_highbd_12_obmc_variance4x4, aom_highbd_12_obmc_sub_pixel_variance4x4) +#if CONFIG_EXT_PARTITION_TYPES + HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits12, + aom_highbd_12_obmc_variance32x8, + aom_highbd_12_obmc_sub_pixel_variance32x8) + + HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits12, + aom_highbd_12_obmc_variance8x32, + aom_highbd_12_obmc_sub_pixel_variance8x32) + + HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits12, + aom_highbd_12_obmc_variance16x4, + aom_highbd_12_obmc_sub_pixel_variance16x4) + + HIGHBD_OBFP(BLOCK_4X16, 
aom_highbd_obmc_sad4x16_bits12, + aom_highbd_12_obmc_variance4x16, + aom_highbd_12_obmc_sub_pixel_variance4x16) +#endif #endif // CONFIG_MOTION_VAR break; @@ -1933,10 +2154,15 @@ void set_compound_tools(AV1_COMMON *cm) { void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + MACROBLOCK *const x = &cpi->td.mb; if (cm->profile != oxcf->profile) cm->profile = oxcf->profile; cm->bit_depth = oxcf->bit_depth; cm->color_space = oxcf->color_space; +#if CONFIG_COLORSPACE_HEADERS + cm->transfer_function = oxcf->transfer_function; + cm->chroma_sample_position = oxcf->chroma_sample_position; +#endif cm->color_range = oxcf->color_range; if (cm->profile <= PROFILE_1) @@ -1945,9 +2171,9 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { assert(cm->bit_depth > AOM_BITS_8); cpi->oxcf = *oxcf; - cpi->td.mb.e_mbd.bd = (int)cm->bit_depth; + x->e_mbd.bd = (int)cm->bit_depth; #if CONFIG_GLOBAL_MOTION - cpi->td.mb.e_mbd.global_motion = cm->global_motion; + x->e_mbd.global_motion = cm->global_motion; #endif // CONFIG_GLOBAL_MOTION if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) { @@ -1969,17 +2195,9 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE; #if CONFIG_PALETTE - cm->allow_screen_content_tools = (cpi->oxcf.content == AOM_CONTENT_SCREEN); - if (cm->allow_screen_content_tools) { - MACROBLOCK *x = &cpi->td.mb; - if (x->palette_buffer == 0) { - CHECK_MEM_ERROR(cm, x->palette_buffer, - aom_memalign(16, sizeof(*x->palette_buffer))); - } - // Reallocate the pc_tree, as it's contents depends on - // the state of cm->allow_screen_content_tools - av1_free_pc_tree(&cpi->td); - av1_setup_pc_tree(&cpi->common, &cpi->td); + if (x->palette_buffer == NULL) { + CHECK_MEM_ERROR(cm, x->palette_buffer, + aom_memalign(16, sizeof(*x->palette_buffer))); } #endif // CONFIG_PALETTE #if CONFIG_EXT_INTER @@ -2058,15 +2276,6 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { #endif // CONFIG_ANS && ANS_MAX_SYMBOLS } -static INLINE void init_upsampled_ref_frame_bufs(AV1_COMP *cpi) { - int i; - - for (i = 0; i < (REF_FRAMES + 1); ++i) { - cpi->upsampled_ref_bufs[i].ref_count = 0; - cpi->upsampled_ref_idx[i] = INVALID_IDX; - } -} - AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, BufferPool *const pool) { unsigned int i; @@ -2099,10 +2308,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, cpi->resize_state = 0; cpi->resize_avg_qp = 0; cpi->resize_buffer_underflow = 0; - cpi->resize_scale_num = 16; - cpi->resize_scale_den = 16; - cpi->resize_next_scale_num = 16; - cpi->resize_next_scale_den = 16; cpi->common.buffer_pool = pool; @@ -2197,6 +2402,7 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, #endif #if CONFIG_ENTROPY_STATS av1_zero(aggregate_fc); + av1_zero_array(aggregate_fc_per_type, FRAME_CONTEXTS); #endif // CONFIG_ENTROPY_STATS cpi->first_time_stamp_ever = INT64_MAX; @@ -2278,8 +2484,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, #endif - init_upsampled_ref_frame_bufs(cpi); - av1_set_speed_features_framesize_independent(cpi); av1_set_speed_features_framesize_dependent(cpi); @@ -2293,6 +2497,24 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, cpi->fn_ptr[BT].sdx8f = SDX8F; \ cpi->fn_ptr[BT].sdx4df = SDX4DF; +#if CONFIG_EXT_PARTITION_TYPES + BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16, + aom_sub_pixel_variance4x16, 
aom_sub_pixel_avg_variance4x16, NULL, NULL, + aom_sad4x16x4d) + + BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4, + aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4, NULL, NULL, + aom_sad16x4x4d) + + BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32, + aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32, NULL, NULL, + aom_sad8x32x4d) + + BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8, + aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8, NULL, NULL, + aom_sad32x8x4d) +#endif + #if CONFIG_EXT_PARTITION BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128, aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128, @@ -2359,7 +2581,7 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x3, aom_sad4x4x8, aom_sad4x4x4d) -#if CONFIG_CB4X4 +#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 BFP(BLOCK_2X2, NULL, NULL, aom_variance2x2, NULL, NULL, NULL, NULL, NULL) BFP(BLOCK_2X4, NULL, NULL, aom_variance2x4, NULL, NULL, NULL, NULL, NULL) BFP(BLOCK_4X2, NULL, NULL, aom_variance4x2, NULL, NULL, NULL, NULL, NULL) @@ -2405,6 +2627,20 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, aom_obmc_sub_pixel_variance8x4) OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4, aom_obmc_sub_pixel_variance4x4) + +#if CONFIG_EXT_PARTITION_TYPES + OBFP(BLOCK_4X16, aom_obmc_sad4x16, aom_obmc_variance4x16, + aom_obmc_sub_pixel_variance4x16) + + OBFP(BLOCK_16X4, aom_obmc_sad16x4, aom_obmc_variance16x4, + aom_obmc_sub_pixel_variance16x4) + + OBFP(BLOCK_8X32, aom_obmc_sad8x32, aom_obmc_variance8x32, + aom_obmc_sub_pixel_variance8x32) + + OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8, + aom_obmc_sub_pixel_variance32x8) +#endif #endif // CONFIG_MOTION_VAR #if CONFIG_EXT_INTER @@ -2431,6 +2667,16 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_sub_pixel_variance4x8) MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4) MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4) + +#if CONFIG_EXT_PARTITION_TYPES + MBFP(BLOCK_4X16, aom_masked_sad4x16, aom_masked_sub_pixel_variance4x16) + + MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4) + + MBFP(BLOCK_8X32, aom_masked_sad8x32, aom_masked_sub_pixel_variance8x32) + + MBFP(BLOCK_32X8, aom_masked_sad32x8, aom_masked_sub_pixel_variance32x8) +#endif #endif // CONFIG_EXT_INTER #if CONFIG_HIGHBITDEPTH @@ -2449,7 +2695,9 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, av1_loop_filter_init(cm); #if CONFIG_FRAME_SUPERRES - cm->superres_scale_numerator = SUPERRES_SCALE_DENOMINATOR; + cm->superres_scale_numerator = SCALE_DENOMINATOR; + cm->superres_upscaled_width = oxcf->width; + cm->superres_upscaled_height = oxcf->height; #endif // CONFIG_FRAME_SUPERRES #if CONFIG_LOOP_RESTORATION av1_loop_restoration_precal(); @@ -2479,6 +2727,8 @@ void av1_remove_compressor(AV1_COMP *cpi) { fprintf(stderr, "Writing counts.stt\n"); FILE *f = fopen("counts.stt", "wb"); fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f); + fwrite(aggregate_fc_per_type, sizeof(aggregate_fc_per_type[0]), + FRAME_CONTEXTS, f); fclose(f); } #endif // CONFIG_ENTROPY_STATS @@ -2566,8 +2816,7 @@ void av1_remove_compressor(AV1_COMP *cpi) { // Deallocate allocated thread data. 
if (t < cpi->num_workers - 1) { #if CONFIG_PALETTE - if (cpi->common.allow_screen_content_tools) - aom_free(thread_data->td->palette_buffer); + aom_free(thread_data->td->palette_buffer); #endif // CONFIG_PALETTE #if CONFIG_MOTION_VAR aom_free(thread_data->td->above_pred_buf); @@ -2835,71 +3084,6 @@ void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) { } #endif // OUTPUT_YUV_REC -#if CONFIG_HIGHBITDEPTH -static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int planes, - int bd) { -#else -static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int planes) { -#endif // CONFIG_HIGHBITDEPTH - const int src_w = src->y_crop_width; - const int src_h = src->y_crop_height; - const int dst_w = dst->y_crop_width; - const int dst_h = dst->y_crop_height; - const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, - src->v_buffer }; - const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; - uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; - const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; - const InterpFilterParams interp_filter_params = - av1_get_interp_filter_params(EIGHTTAP_REGULAR); - const int16_t *kernel = interp_filter_params.filter_ptr; - const int taps = interp_filter_params.taps; - int x, y, i; - - assert(planes <= 3); - for (y = 0; y < dst_h; y += 16) { - for (x = 0; x < dst_w; x += 16) { - for (i = 0; i < planes; ++i) { - const int factor = (i == 0 || i == 3 ? 1 : 2); - const int x_q4 = x * (16 / factor) * src_w / dst_w; - const int y_q4 = y * (16 / factor) * src_h / dst_h; - const int src_stride = src_strides[i]; - const int dst_stride = dst_strides[i]; - const uint8_t *src_ptr = srcs[i] + - (y / factor) * src_h / dst_h * src_stride + - (x / factor) * src_w / dst_w; - uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); - -#if CONFIG_HIGHBITDEPTH - if (src->flags & YV12_FLAG_HIGHBITDEPTH) { - aom_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, - &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w, - &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h, - 16 / factor, 16 / factor, bd); - } else { - aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, - &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w, - &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h, - 16 / factor, 16 / factor); - } -#else - aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, - &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w, - &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h, - 16 / factor, 16 / factor); -#endif // CONFIG_HIGHBITDEPTH - } - } - } - - if (planes == 1) - aom_extend_frame_borders_y(dst); - else - aom_extend_frame_borders(dst); -} - #if CONFIG_GLOBAL_MOTION #define GM_RECODE_LOOP_NUM4X4_FACTOR 192 static int recode_loop_test_global_motion(AV1_COMP *cpi) { @@ -2949,52 +3133,6 @@ static int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, int q, return force_recode; } -static INLINE int get_free_upsampled_ref_buf(EncRefCntBuffer *ubufs) { - int i; - - for (i = 0; i < (REF_FRAMES + 1); i++) { - if (!ubufs[i].ref_count) { - return i; - } - } - return INVALID_IDX; -} - -// Up-sample 1 reference frame. 
-static INLINE int upsample_ref_frame(AV1_COMP *cpi, - const YV12_BUFFER_CONFIG *const ref) { - AV1_COMMON *const cm = &cpi->common; - EncRefCntBuffer *ubufs = cpi->upsampled_ref_bufs; - int new_uidx = get_free_upsampled_ref_buf(ubufs); - - if (new_uidx == INVALID_IDX) { - return INVALID_IDX; - } else { - YV12_BUFFER_CONFIG *upsampled_ref = &ubufs[new_uidx].buf; - - // Can allocate buffer for Y plane only. - if (upsampled_ref->buffer_alloc_sz < (ref->buffer_alloc_sz << 6)) - if (aom_realloc_frame_buffer(upsampled_ref, (cm->width << 3), - (cm->height << 3), cm->subsampling_x, - cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - (AOM_BORDER_IN_PIXELS << 3), - cm->byte_alignment, NULL, NULL, NULL)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, - "Failed to allocate up-sampled frame buffer"); - -// Currently, only Y plane is up-sampled, U, V are not used. -#if CONFIG_HIGHBITDEPTH - scale_and_extend_frame(ref, upsampled_ref, 1, (int)cm->bit_depth); -#else - scale_and_extend_frame(ref, upsampled_ref, 1); -#endif - return new_uidx; - } -} - #define DUMP_REF_FRAME_IMAGES 0 #if DUMP_REF_FRAME_IMAGES == 1 @@ -3068,34 +3206,50 @@ static INLINE void shift_last_ref_frames(AV1_COMP *cpi) { } #endif // CONFIG_EXT_REFS +#if CONFIG_VAR_REFS +static void enc_check_valid_ref_frames(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + MV_REFERENCE_FRAME ref_frame; + + // TODO(zoeliu): To handle ALTREF_FRAME the same way as do with other + // reference frames. Current encoder invalid ALTREF when ALTREF + // is the same as LAST, but invalid all the other references + // when they are the same as ALTREF. + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + int ref_buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - LAST_FRAME]; + + if (ref_buf_idx != INVALID_IDX) { + ref_buf->is_valid = 1; + + MV_REFERENCE_FRAME ref; + for (ref = LAST_FRAME; ref < ref_frame; ++ref) { + int buf_idx = get_ref_frame_buf_idx(cpi, ref); + RefBuffer *const buf = &cm->frame_refs[ref - LAST_FRAME]; + if (buf->is_valid && buf_idx == ref_buf_idx) { + if (ref_frame != ALTREF_FRAME || ref == LAST_FRAME) { + ref_buf->is_valid = 0; + break; + } else { + buf->is_valid = 0; + } + } + } + } else { + ref_buf->is_valid = 0; + } + } +} +#endif // CONFIG_VAR_REFS + void av1_update_reference_frames(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; BufferPool *const pool = cm->buffer_pool; - const int use_upsampled_ref = cpi->sf.use_upsampled_references; - int new_uidx = 0; // NOTE: Save the new show frame buffer index for --test-code=warn, i.e., // for the purpose to verify no mismatch between encoder and decoder. if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx; - if (use_upsampled_ref) { -#if CONFIG_EXT_REFS - if (cm->show_existing_frame) { - new_uidx = cpi->upsampled_ref_idx[cpi->existing_fb_idx_to_show]; - // TODO(zoeliu): Once following is confirmed, remove it. - assert(cpi->upsampled_ref_bufs[new_uidx].ref_count > 0); - } else { -#endif // CONFIG_EXT_REFS - // Up-sample the current encoded frame. - RefCntBuffer *bufs = pool->frame_bufs; - const YV12_BUFFER_CONFIG *const ref = &bufs[cm->new_fb_idx].buf; - - new_uidx = upsample_ref_frame(cpi, ref); -#if CONFIG_EXT_REFS - assert(new_uidx != INVALID_IDX); - } -#endif // CONFIG_EXT_REFS - } // At this point the new frame has been encoded. // If any buffer copy / swapping is signaled it should be done here. 
if (cm->frame_type == KEY_FRAME) { @@ -3107,17 +3261,6 @@ void av1_update_reference_frames(AV1_COMP *cpi) { #endif // CONFIG_EXT_REFS ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); - - if (use_upsampled_ref) { - uref_cnt_fb(cpi->upsampled_ref_bufs, - &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx); -#if CONFIG_EXT_REFS - uref_cnt_fb(cpi->upsampled_ref_bufs, - &cpi->upsampled_ref_idx[cpi->bwd_fb_idx], new_uidx); -#endif // CONFIG_EXT_REFS - uref_cnt_fb(cpi->upsampled_ref_bufs, - &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx); - } } else if (av1_preserve_existing_gf(cpi)) { // We have decided to preserve the previously existing golden frame as our // new ARF frame. However, in the short term in function @@ -3131,10 +3274,6 @@ void av1_update_reference_frames(AV1_COMP *cpi) { ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); - if (use_upsampled_ref) - uref_cnt_fb(cpi->upsampled_ref_bufs, - &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx); - tmp = cpi->alt_fb_idx; cpi->alt_fb_idx = cpi->gld_fb_idx; cpi->gld_fb_idx = tmp; @@ -3146,19 +3285,6 @@ void av1_update_reference_frames(AV1_COMP *cpi) { // TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to // cpi->interp_filter_selected[GOLDEN_FRAME]? #if CONFIG_EXT_REFS - } else if (cpi->rc.is_last_bipred_frame) { - // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the LAST3_FRAME - // by updating the virtual indices. Note that the frame BWDREF_FRAME points - // to now should be retired, and it should not be used before refreshed. - int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]; - - shift_last_ref_frames(cpi); - cpi->lst_fb_idxes[0] = cpi->bwd_fb_idx; - cpi->bwd_fb_idx = tmp; - - memcpy(cpi->interp_filter_selected[LAST_FRAME], - cpi->interp_filter_selected[BWDREF_FRAME], - sizeof(cpi->interp_filter_selected[BWDREF_FRAME])); } else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) { // Deal with the special case for showing existing internal ALTREF_FRAME // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME @@ -3195,9 +3321,6 @@ void av1_update_reference_frames(AV1_COMP *cpi) { } #endif // CONFIG_EXT_REFS ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx); - if (use_upsampled_ref) - uref_cnt_fb(cpi->upsampled_ref_bufs, &cpi->upsampled_ref_idx[arf_idx], - new_uidx); memcpy(cpi->interp_filter_selected[ALTREF_FRAME + which_arf], cpi->interp_filter_selected[0], @@ -3207,9 +3330,6 @@ void av1_update_reference_frames(AV1_COMP *cpi) { if (cpi->refresh_golden_frame) { ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx); - if (use_upsampled_ref) - uref_cnt_fb(cpi->upsampled_ref_bufs, - &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx); #if !CONFIG_EXT_REFS if (!cpi->rc.is_src_frame_alt_ref) @@ -3234,9 +3354,6 @@ void av1_update_reference_frames(AV1_COMP *cpi) { ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx], cm->new_fb_idx); - if (use_upsampled_ref) - uref_cnt_fb(cpi->upsampled_ref_bufs, - &cpi->upsampled_ref_idx[cpi->bwd_fb_idx], new_uidx); memcpy(cpi->interp_filter_selected[BWDREF_FRAME], cpi->interp_filter_selected[0], @@ -3293,11 +3410,6 @@ void av1_update_reference_frames(AV1_COMP *cpi) { ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idxes[ref_frame]], cm->new_fb_idx); - - if (use_upsampled_ref) - uref_cnt_fb(cpi->upsampled_ref_bufs, - &cpi->upsampled_ref_idx[cpi->lst_fb_idxes[ref_frame]], - new_uidx); } } else { int tmp; @@ 
-3306,30 +3418,39 @@ void av1_update_reference_frames(AV1_COMP *cpi) { &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]], cm->new_fb_idx); - if (use_upsampled_ref) - uref_cnt_fb( - cpi->upsampled_ref_bufs, - &cpi->upsampled_ref_idx[cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]], - new_uidx); - tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]; shift_last_ref_frames(cpi); cpi->lst_fb_idxes[0] = tmp; assert(cm->show_existing_frame == 0); - // NOTE: Currently only LF_UPDATE and INTNL_OVERLAY_UPDATE frames are to - // refresh the LAST_FRAME. memcpy(cpi->interp_filter_selected[LAST_FRAME], cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); + + if (cpi->rc.is_last_bipred_frame) { + // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the + // LAST3_FRAME by updating the virtual indices. + // + // NOTE: The source frame for BWDREF does not have a holding position as + // the OVERLAY frame for ALTREF's. Hence, to resolve the reference + // virtual index reshuffling for BWDREF, the encoder always + // specifies a LAST_BIPRED right before BWDREF and completes the + // reshuffling job accordingly. + tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]; + + shift_last_ref_frames(cpi); + cpi->lst_fb_idxes[0] = cpi->bwd_fb_idx; + cpi->bwd_fb_idx = tmp; + + memcpy(cpi->interp_filter_selected[LAST_FRAME], + cpi->interp_filter_selected[BWDREF_FRAME], + sizeof(cpi->interp_filter_selected[BWDREF_FRAME])); + } } #else ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx); - if (use_upsampled_ref) - uref_cnt_fb(cpi->upsampled_ref_bufs, - &cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx); if (!cpi->rc.is_src_frame_alt_ref) { memcpy(cpi->interp_filter_selected[LAST_FRAME], cpi->interp_filter_selected[0], @@ -3344,61 +3465,8 @@ void av1_update_reference_frames(AV1_COMP *cpi) { #endif // DUMP_REF_FRAME_IMAGES } -static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { - MACROBLOCKD *xd = &cpi->td.mb.e_mbd; - struct loopfilter *lf = &cm->lf; - if (is_lossless_requested(&cpi->oxcf)) { - lf->filter_level = 0; - } else { - struct aom_usec_timer timer; - - aom_clear_system_state(); - - aom_usec_timer_start(&timer); - - av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_pick); - - aom_usec_timer_mark(&timer); - cpi->time_pick_lpf += aom_usec_timer_elapsed(&timer); - } - - if (lf->filter_level > 0) { -#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4 - av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); -#else - if (cpi->num_workers > 1) - av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane, - lf->filter_level, 0, 0, cpi->workers, - cpi->num_workers, &cpi->lf_row_sync); - else - av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); -#endif - } -#if CONFIG_CDEF - if (is_lossless_requested(&cpi->oxcf)) { - cm->cdef_bits = 0; - cm->cdef_strengths[0] = 0; - cm->nb_cdef_strengths = 1; - } else { - // Find cm->dering_level, cm->clpf_strength_u and cm->clpf_strength_v - av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd); - - // Apply the filter - av1_cdef_frame(cm->frame_to_show, cm, xd); - } -#endif -#if CONFIG_LOOP_RESTORATION - av1_pick_filter_restoration(cpi->source, cpi, cpi->sf.lpf_pick); - if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || - cm->rst_info[1].frame_restoration_type != RESTORE_NONE || - cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { - av1_loop_restoration_frame(cm->frame_to_show, cm, cm->rst_info, 7, 0, NULL); - } -#endif // CONFIG_LOOP_RESTORATION - 
aom_extend_frame_inner_borders(cm->frame_to_show); -} - static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, int buffer_idx) { + assert(buffer_idx != INVALID_IDX); RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || new_fb_ptr->mi_cols < cm->mi_cols) { @@ -3458,8 +3526,8 @@ void av1_scale_references(AV1_COMP *cpi) { cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE, - (int)cm->bit_depth); + av1_resize_and_extend_frame(ref, &new_fb_ptr->buf, + (int)cm->bit_depth); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } @@ -3482,36 +3550,11 @@ void av1_scale_references(AV1_COMP *cpi) { NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE); + av1_resize_and_extend_frame(ref, &new_fb_ptr->buf); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } #endif // CONFIG_HIGHBITDEPTH - - if (cpi->sf.use_upsampled_references && - (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || - new_fb_ptr->buf.y_crop_height != cm->height)) { - const int map_idx = get_ref_frame_map_idx(cpi, ref_frame); - EncRefCntBuffer *ubuf = - &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[map_idx]]; - - if (aom_realloc_frame_buffer(&ubuf->buf, (cm->width << 3), - (cm->height << 3), cm->subsampling_x, - cm->subsampling_y, -#if CONFIG_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - (AOM_BORDER_IN_PIXELS << 3), - cm->byte_alignment, NULL, NULL, NULL)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, - "Failed to allocate up-sampled frame buffer"); -#if CONFIG_HIGHBITDEPTH - scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, 1, - (int)cm->bit_depth); -#else - scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, 1); -#endif - } } else { const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); RefCntBuffer *const buf = &pool->frame_bufs[buf_idx]; @@ -3742,66 +3785,38 @@ static void init_motion_estimation(AV1_COMP *cpi) { } #if CONFIG_LOOP_RESTORATION -static void set_restoration_tilesize(int width, int height, +#define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0 +static void set_restoration_tilesize(int width, int height, int sx, int sy, RestorationInfo *rst) { (void)width; (void)height; + (void)sx; + (void)sy; +#if COUPLED_CHROMA_FROM_LUMA_RESTORATION + int s = AOMMIN(sx, sy); +#else + int s = 0; +#endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION + rst[0].restoration_tilesize = (RESTORATION_TILESIZE_MAX >> 1); - rst[1].restoration_tilesize = rst[0].restoration_tilesize; - rst[2].restoration_tilesize = rst[0].restoration_tilesize; + rst[1].restoration_tilesize = rst[0].restoration_tilesize >> s; + rst[2].restoration_tilesize = rst[1].restoration_tilesize; } #endif // CONFIG_LOOP_RESTORATION -static void set_scaled_size(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; - AV1EncoderConfig *const oxcf = &cpi->oxcf; - - // TODO(afergs): Replace with call to av1_resize_pending? Could replace - // scaled_size_set as well. - // TODO(afergs): Realistically, if resize_pending is true, then the other - // conditions must already be satisfied. - // Try this first: - // av1_resize_pending && - // (DYNAMIC && (1 Pass CBR || 2 Pass VBR) - // STATIC && FIRST_FRAME) - // Really, av1_resize_pending should just reflect the above. 
- // TODO(afergs): Allow fixed resizing in AOM_CBR mode? - // 2 Pass VBR: Resize if fixed resize and first frame, or dynamic resize and - // a resize is pending. - // 1 Pass CBR: Resize if dynamic resize and resize pending. - if ((oxcf->pass == 2 && oxcf->rc_mode == AOM_VBR && - ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) || - (oxcf->resize_mode == RESIZE_DYNAMIC && av1_resize_pending(cpi)))) || - (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR && - oxcf->resize_mode == RESIZE_DYNAMIC && av1_resize_pending(cpi))) { - // TODO(afergs): This feels hacky... Should it just set? Should - // av1_set_next_scaled_size be a library function? - av1_calculate_next_scaled_size(cpi, &oxcf->scaled_frame_width, - &oxcf->scaled_frame_height); - } -} - static void set_frame_size(AV1_COMP *cpi, int width, int height) { - int ref_frame; AV1_COMMON *const cm = &cpi->common; - AV1EncoderConfig *const oxcf = &cpi->oxcf; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + int ref_frame; if (width != cm->width || height != cm->height) { // There has been a change in the encoded frame size av1_set_size_literal(cpi, width, height); - - // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. - // TODO(afergs): Make condition just (pass == 0) or (rc_mode == CBR) - - // UNLESS CBR starts allowing FIXED resizing. Then the resize - // mode will need to get checked too. - if (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR && - oxcf->resize_mode == RESIZE_DYNAMIC) - set_mv_search_params(cpi); // TODO(afergs): Needed? Caller calls after... + set_mv_search_params(cpi); } #if !CONFIG_XIPHRC - if (oxcf->pass == 2) { + if (cpi->oxcf.pass == 2) { av1_set_target_rate(cpi); } #endif @@ -3820,18 +3835,29 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) { "Failed to allocate frame buffer"); #if CONFIG_LOOP_RESTORATION - set_restoration_tilesize(cm->width, cm->height, cm->rst_info); + set_restoration_tilesize( +#if CONFIG_FRAME_SUPERRES + cm->superres_upscaled_width, cm->superres_upscaled_height, +#else + cm->width, cm->height, +#endif // CONFIG_FRAME_SUPERRES + cm->subsampling_x, cm->subsampling_y, cm->rst_info); for (int i = 0; i < MAX_MB_PLANE; ++i) cm->rst_info[i].frame_restoration_type = RESTORE_NONE; av1_alloc_restoration_buffers(cm); for (int i = 0; i < MAX_MB_PLANE; ++i) { cpi->rst_search[i].restoration_tilesize = cm->rst_info[i].restoration_tilesize; - av1_alloc_restoration_struct(cm, &cpi->rst_search[i], cm->width, - cm->height); + av1_alloc_restoration_struct(cm, &cpi->rst_search[i], +#if CONFIG_FRAME_SUPERRES + cm->superres_upscaled_width, + cm->superres_upscaled_height); +#else + cm->width, cm->height); +#endif // CONFIG_FRAME_SUPERRES } -#endif // CONFIG_LOOP_RESTORATION - alloc_util_frame_buffers(cpi); +#endif // CONFIG_LOOP_RESTORATION + alloc_util_frame_buffers(cpi); // TODO(afergs): Remove? Gets called anyways. 
init_motion_estimation(cpi); for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { @@ -3857,6 +3883,12 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) { ref_buf->buf = NULL; } } + +#if CONFIG_VAR_REFS + // Check duplicate reference frames + enc_check_valid_ref_frames(cpi); +#endif // CONFIG_VAR_REFS + #if CONFIG_INTRABC #if CONFIG_HIGHBITDEPTH av1_setup_scale_factors_for_frame(&xd->sf_identity, cm->width, cm->height, @@ -3872,84 +3904,167 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) { } static void setup_frame_size(AV1_COMP *cpi) { - set_scaled_size(cpi); + int encode_width = cpi->oxcf.width; + int encode_height = cpi->oxcf.height; + + uint8_t resize_num = av1_calculate_next_resize_scale(cpi); + av1_calculate_scaled_size(&encode_width, &encode_height, resize_num); + #if CONFIG_FRAME_SUPERRES - int encode_width; - int encode_height; - av1_calculate_superres_size(cpi, &encode_width, &encode_height); + AV1_COMMON *cm = &cpi->common; + cm->superres_upscaled_width = encode_width; + cm->superres_upscaled_height = encode_height; + cm->superres_scale_numerator = + av1_calculate_next_superres_scale(cpi, encode_width, encode_width); + av1_calculate_scaled_size(&encode_width, &encode_height, + cm->superres_scale_numerator); +#endif // CONFIG_FRAME_SUPERRES + set_frame_size(cpi, encode_width, encode_height); +} + +#if CONFIG_FRAME_SUPERRES +static void superres_post_encode(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + + if (av1_superres_unscaled(cm)) return; + + av1_superres_upscale(cm, NULL); + + // If regular resizing is occurring the source will need to be downscaled to + // match the upscaled superres resolution. Otherwise the original source is + // used. + if (av1_resize_unscaled(cm)) { + cpi->source = cpi->unscaled_source; + if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source; + } else { + assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width); + assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height); + // Do downscale. cm->(width|height) has been updated by av1_superres_upscale + if (aom_realloc_frame_buffer( + &cpi->scaled_source, cm->superres_upscaled_width, + cm->superres_upscaled_height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_HIGHBITDEPTH + cm->use_highbitdepth, +#endif // CONFIG_HIGHBITDEPTH + AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) + aom_internal_error( + &cm->error, AOM_CODEC_MEM_ERROR, + "Failed to reallocate scaled source buffer for superres"); + assert(cpi->scaled_source.y_crop_width == cm->superres_upscaled_width); + assert(cpi->scaled_source.y_crop_height == cm->superres_upscaled_height); +#if CONFIG_HIGHBITDEPTH + av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source, + (int)cm->bit_depth); #else - set_frame_size(cpi, cpi->oxcf.scaled_frame_width, - cpi->oxcf.scaled_frame_height); -#endif // CONFIG_FRAME_SUPERRES + av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source); +#endif // CONFIG_HIGHBITDEPTH + cpi->source = &cpi->scaled_source; + } } +#endif // CONFIG_FRAME_SUPERRES -static void reset_use_upsampled_references(AV1_COMP *cpi) { - MV_REFERENCE_FRAME ref_frame; +static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + struct loopfilter *lf = &cm->lf; + int no_loopfilter = 0; - // reset up-sampled reference buffer structure. 
- init_upsampled_ref_frame_bufs(cpi); + if (is_lossless_requested(&cpi->oxcf)) no_loopfilter = 1; - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, ref_frame); - int new_uidx = upsample_ref_frame(cpi, ref); +#if CONFIG_EXT_TILE + // 0 loopfilter level is only necessary if individual tile + // decoding is required. + if (cm->single_tile_decoding) no_loopfilter = 1; +#endif // CONFIG_EXT_TILE + + if (no_loopfilter) { + lf->filter_level = 0; + } else { + struct aom_usec_timer timer; + + aom_clear_system_state(); + + aom_usec_timer_start(&timer); + + av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_pick); + + aom_usec_timer_mark(&timer); + cpi->time_pick_lpf += aom_usec_timer_elapsed(&timer); + } - // Update the up-sampled reference index. - cpi->upsampled_ref_idx[get_ref_frame_map_idx(cpi, ref_frame)] = new_uidx; - cpi->upsampled_ref_bufs[new_uidx].ref_count++; + if (lf->filter_level > 0) { +#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4 +#if CONFIG_UV_LVL + av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); + av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level_u, 1, 0); + av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level_v, 2, 0); +#else + av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); +#endif // CONFIG_UV_LVL +#else + if (cpi->num_workers > 1) + av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane, + lf->filter_level, 0, 0, cpi->workers, + cpi->num_workers, &cpi->lf_row_sync); + else + av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); +#endif + } +#if CONFIG_CDEF + if (is_lossless_requested(&cpi->oxcf)) { + cm->cdef_bits = 0; + cm->cdef_strengths[0] = 0; + cm->nb_cdef_strengths = 1; + } else { + // Find cm->dering_level, cm->clpf_strength_u and cm->clpf_strength_v + av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd, + cpi->oxcf.speed > 0); + + // Apply the filter + av1_cdef_frame(cm->frame_to_show, cm, xd); } +#endif + +#if CONFIG_FRAME_SUPERRES + superres_post_encode(cpi); +#endif // CONFIG_FRAME_SUPERRES + +#if CONFIG_LOOP_RESTORATION + av1_pick_filter_restoration(cpi->source, cpi, cpi->sf.lpf_pick); + if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { + av1_loop_restoration_frame(cm->frame_to_show, cm, cm->rst_info, 7, 0, NULL); + } +#endif // CONFIG_LOOP_RESTORATION + // TODO(debargha): Fix mv search range on encoder side + // aom_extend_frame_inner_borders(cm->frame_to_show); + aom_extend_frame_borders(cm->frame_to_show); } static void encode_without_recode_loop(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. - const int use_upsampled_ref = cpi->sf.use_upsampled_references; aom_clear_system_state(); -#if CONFIG_FRAME_SUPERRES - // TODO(afergs): Figure out when is actually a good time to do superres - cm->superres_scale_numerator = SUPERRES_SCALE_DENOMINATOR; - // (uint8_t)(rand() % 9 + SUPERRES_SCALE_NUMERATOR_MIN); - cpi->superres_pending = cpi->oxcf.superres_enabled && 0; -#endif // CONFIG_FRAME_SUPERRES - + set_size_independent_vars(cpi); setup_frame_size(cpi); - av1_resize_step(cpi); - - // For 1 pass CBR under dynamic resize mode: use faster scaling for source. - // Only for 2x2 scaling for now. 
- if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == AOM_CBR && - cpi->oxcf.resize_mode == RESIZE_DYNAMIC && - cpi->un_scaled_source->y_width == (cm->width << 1) && - cpi->un_scaled_source->y_height == (cm->height << 1)) { - cpi->source = av1_scale_if_required_fast(cm, cpi->un_scaled_source, - &cpi->scaled_source); - if (cpi->unscaled_last_source != NULL) - cpi->last_source = av1_scale_if_required_fast( - cm, cpi->unscaled_last_source, &cpi->scaled_last_source); - } else { - cpi->source = - av1_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source); - if (cpi->unscaled_last_source != NULL) - cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source, - &cpi->scaled_last_source); - } + assert(cm->width == cpi->scaled_source.y_crop_width); + assert(cm->height == cpi->scaled_source.y_crop_height); + + set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + + cpi->source = + av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source); + if (cpi->unscaled_last_source != NULL) + cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source, + &cpi->scaled_last_source); if (frame_is_intra_only(cm) == 0) { av1_scale_references(cpi); } - set_size_independent_vars(cpi); - set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); - - // cpi->sf.use_upsampled_references can be different from frame to frame. - // Every time when cpi->sf.use_upsampled_references is changed from 0 to 1. - // The reference frames for this frame have to be up-sampled before encoding. - if (!use_upsampled_ref && cpi->sf.use_upsampled_references && - cm->frame_type != KEY_FRAME) - reset_use_upsampled_references(cpi); - av1_set_quantizer(cm, q); setup_frame(cpi); suppress_active_map(cpi); @@ -3968,11 +4083,6 @@ static void encode_without_recode_loop(AV1_COMP *cpi) { // transform / motion compensation build reconstruction frame av1_encode_frame(cpi); -#if CONFIG_FRAME_SUPERRES - // TODO(afergs): Upscale the frame to show - cpi->superres_pending = 0; -#endif // CONFIG_FRAME_SUPERRES - // Update some stats from cyclic refresh, and check if we should not update // golden reference, for 1 pass CBR. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->frame_type != KEY_FRAME && @@ -4000,7 +4110,6 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, int frame_over_shoot_limit; int frame_under_shoot_limit; int q = 0, q_low = 0, q_high = 0; - const int use_upsampled_ref = cpi->sf.use_upsampled_references; set_size_independent_vars(cpi); @@ -4009,22 +4118,9 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, setup_frame_size(cpi); -#if CONFIG_FRAME_SUPERRES - if (loop_count == 0 || av1_resize_pending(cpi) || cpi->superres_pending) { -#else - if (loop_count == 0 || av1_resize_pending(cpi)) { -#endif // CONFIG_FRAME_SUPERRES + if (loop_count == 0) { set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); - // cpi->sf.use_upsampled_references can be different from frame to frame. - // Every time when cpi->sf.use_upsampled_references is changed from 0 to - // 1. - // The reference frames for this frame have to be up-sampled before - // encoding. - if (!use_upsampled_ref && cpi->sf.use_upsampled_references && - cm->frame_type != KEY_FRAME) - reset_use_upsampled_references(cpi); - // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. 
set_mv_search_params(cpi); @@ -4034,9 +4130,6 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, undershoot_seen = 0; #endif - // Advance resize to next state now that updates are done - av1_resize_step(cpi); - q_low = bottom_index; q_high = top_index; @@ -4051,8 +4144,7 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, } cpi->source = - av1_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source); - + av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source); if (cpi->unscaled_last_source != NULL) cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source, &cpi->scaled_last_source); @@ -4174,8 +4266,6 @@ static void encode_with_recode_loop(AV1_COMP *cpi, size_t *size, #if !CONFIG_XIPHRC int retries = 0; - // TODO(afergs): Replace removed recode when av1_resize_pending is true - // Frame size out of permitted range: // Update correction factor & compute new Q to try... // Frame is too large @@ -4285,7 +4375,7 @@ static int get_ref_frame_flags(const AV1_COMP *cpi) { map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]]; const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]]; const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]]; -#else +#else // !CONFIG_ONE_SIDED_COMPOUND const int bwd_is_last = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[0]]; const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]]; @@ -4299,12 +4389,12 @@ static int get_ref_frame_flags(const AV1_COMP *cpi) { const int bwd_is_gld = map[cpi->bwd_fb_idx] == map[cpi->gld_fb_idx]; -#endif +#endif // CONFIG_ONE_SIDED_COMPOUND const int last2_is_alt = map[cpi->lst_fb_idxes[1]] == map[cpi->alt_fb_idx]; const int last3_is_alt = map[cpi->lst_fb_idxes[2]] == map[cpi->alt_fb_idx]; const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx]; const int bwd_is_alt = map[cpi->bwd_fb_idx] == map[cpi->alt_fb_idx]; -#else +#else // !CONFIG_EXT_REFS const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx]; const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx]; const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx]; @@ -4476,11 +4566,14 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) { } printf( "\nFrame=%5d, encode_update_type[%5d]=%1d, show_existing_frame=%d, " - "y_stride=%4d, uv_stride=%4d, width=%4d, height=%4d\n", + "source_alt_ref_active=%d, refresh_alt_ref_frame=%d, rf_level=%d, " + "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n", cm->current_video_frame, cpi->twopass.gf_group.index, cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index], - cm->show_existing_frame, recon_buf->y_stride, recon_buf->uv_stride, - cm->width, cm->height); + cm->show_existing_frame, cpi->rc.source_alt_ref_active, + cpi->refresh_alt_ref_frame, + cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index], + recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height); // --- Y --- for (h = 0; h < cm->height; ++h) { @@ -4502,8 +4595,6 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) { } #endif // DUMP_RECON_FRAMES -#if CONFIG_EC_ADAPT - static void make_update_tile_list_enc(AV1_COMP *cpi, const int tile_rows, const int tile_cols, FRAME_CONTEXT *ec_ctxs[]) { @@ -4512,7 +4603,6 @@ static void make_update_tile_list_enc(AV1_COMP *cpi, const int tile_rows, ec_ctxs[i] = &cpi->tile_data[i].tctx; } -#endif static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, int skip_adapt, unsigned int *frame_flags) { @@ -4520,13 +4610,11 
@@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, const AV1EncoderConfig *const oxcf = &cpi->oxcf; struct segmentation *const seg = &cm->seg; TX_SIZE t; -#if CONFIG_EC_ADAPT FRAME_CONTEXT **tile_ctxs = aom_malloc(cm->tile_rows * cm->tile_cols * sizeof(&cpi->tile_data[0].tctx)); aom_cdf_prob **cdf_ptrs = aom_malloc(cm->tile_rows * cm->tile_cols * sizeof(&cpi->tile_data[0].tctx.partition_cdf[0][0])); -#endif #if CONFIG_XIPHRC int frame_type; int drop_this_frame = 0; @@ -4610,15 +4698,10 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, #endif } - cm->last_width = cm->width; - cm->last_height = cm->height; - ++cm->current_video_frame; -#if CONFIG_EC_ADAPT aom_free(tile_ctxs); aom_free(cdf_ptrs); -#endif return; } #endif // CONFIG_EXT_REFS @@ -4654,7 +4737,6 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cm->reset_frame_context = RESET_FRAME_CONTEXT_CURRENT; } } -#if CONFIG_TILE_GROUPS if (cpi->oxcf.mtu == 0) { cm->num_tg = cpi->oxcf.num_tile_groups; } else { @@ -4662,20 +4744,18 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, // updates cm->num_tg = DEFAULT_MAX_NUM_TG; } -#endif #if CONFIG_EXT_TILE - cm->tile_encoding_mode = cpi->oxcf.tile_encoding_mode; + cm->large_scale_tile = cpi->oxcf.large_scale_tile; + cm->single_tile_decoding = cpi->oxcf.single_tile_decoding; #endif // CONFIG_EXT_TILE #if CONFIG_XIPHRC if (drop_this_frame) { av1_rc_postencode_update_drop_frame(cpi); ++cm->current_video_frame; -#if CONFIG_EC_ADAPT aom_free(tile_ctxs); aom_free(cdf_ptrs); -#endif return; } #else @@ -4686,10 +4766,8 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, if (av1_rc_drop_frame(cpi)) { av1_rc_postencode_update_drop_frame(cpi); ++cm->current_video_frame; -#if CONFIG_EC_ADAPT aom_free(tile_ctxs); aom_free(cdf_ptrs); -#endif return; } } @@ -4770,6 +4848,10 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cm->frame_to_show = get_frame_new_buffer(cm); cm->frame_to_show->color_space = cm->color_space; +#if CONFIG_COLORSPACE_HEADERS + cm->frame_to_show->transfer_function = cm->transfer_function; + cm->frame_to_show->chroma_sample_position = cm->chroma_sample_position; +#endif cm->frame_to_show->color_range = cm->color_range; cm->frame_to_show->render_width = cm->render_width; cm->frame_to_show->render_height = cm->render_height; @@ -4786,10 +4868,8 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, av1_pack_bitstream(cpi, dest, size); if (skip_adapt) { -#if CONFIG_EC_ADAPT aom_free(tile_ctxs); aom_free(cdf_ptrs); -#endif return; } @@ -4823,11 +4903,13 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cpi->td.rd_counts.coef_counts[t]); #if CONFIG_ENTROPY_STATS av1_accumulate_frame_counts(&aggregate_fc, &cm->counts); + assert(cm->frame_context_idx < FRAME_CONTEXTS); + av1_accumulate_frame_counts(&aggregate_fc_per_type[cm->frame_context_idx], + &cm->counts); #endif // CONFIG_ENTROPY_STATS if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { av1_adapt_coef_probs(cm); av1_adapt_intra_frame_probs(cm); -#if CONFIG_EC_ADAPT make_update_tile_list_enc(cpi, cm->tile_rows, cm->tile_cols, tile_ctxs); av1_average_tile_coef_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs, cm->tile_rows * cm->tile_cols); @@ -4837,7 +4919,6 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, av1_average_tile_pvq_cdfs(cpi->common.fc, tile_ctxs, cm->tile_rows * cm->tile_cols); #endif // CONFIG_PVQ -#endif // CONFIG_EC_ADAPT #if CONFIG_ADAPT_SCAN 
av1_adapt_scan_order(cm); #endif // CONFIG_ADAPT_SCAN @@ -4847,12 +4928,10 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { av1_adapt_inter_frame_probs(cm); av1_adapt_mv_probs(cm, cm->allow_high_precision_mv); -#if CONFIG_EC_ADAPT av1_average_tile_inter_cdfs(&cpi->common, cpi->common.fc, tile_ctxs, cdf_ptrs, cm->tile_rows * cm->tile_cols); av1_average_tile_mv_cdfs(cpi->common.fc, tile_ctxs, cdf_ptrs, cm->tile_rows * cm->tile_cols); -#endif } } @@ -4888,10 +4967,8 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, if (drop_this_frame) { av1_rc_postencode_update_drop_frame(cpi); ++cm->current_video_frame; -#if CONFIG_EC_ADAPT aom_free(tile_ctxs); aom_free(cdf_ptrs); -#endif return; } #else // !CONFIG_XIPHRC @@ -4915,13 +4992,6 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cm->seg.update_data = 0; cm->lf.mode_ref_delta_update = 0; - // keep track of the last coded dimensions - cm->last_width = cm->width; - cm->last_height = cm->height; - - // reset to normal state now that we are done. - if (!cm->show_existing_frame) cm->last_show_frame = cm->show_frame; - if (cm->show_frame) { #if CONFIG_EXT_REFS // TODO(zoeliu): We may only swamp mi and prev_mi for those frames that are @@ -4935,13 +5005,20 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, #if CONFIG_EXT_REFS // NOTE: Shall not refer to any frame not used as reference. - if (cm->is_reference_frame) + if (cm->is_reference_frame) { #endif // CONFIG_EXT_REFS cm->prev_frame = cm->cur_frame; -#if CONFIG_EC_ADAPT + // keep track of the last coded dimensions + cm->last_width = cm->width; + cm->last_height = cm->height; + + // reset to normal state now that we are done. + cm->last_show_frame = cm->show_frame; +#if CONFIG_EXT_REFS + } +#endif // CONFIG_EXT_REFS aom_free(tile_ctxs); aom_free(cdf_ptrs); -#endif } static void Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, @@ -5220,12 +5297,17 @@ static void adjust_image_stat(double y, double u, double v, double all, s->worst = AOMMIN(s->worst, all); } -static void compute_internal_stats(AV1_COMP *cpi) { +static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { AV1_COMMON *const cm = &cpi->common; double samples = 0.0; uint32_t in_bit_depth = 8; uint32_t bit_depth = 8; +#if CONFIG_INTER_STATS_ONLY + if (cm->frame_type == KEY_FRAME) return; // skip key frame +#endif + cpi->bytes += frame_bytes; + #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) { in_bit_depth = cpi->oxcf.input_bit_depth; @@ -5413,8 +5495,7 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, if (cpi->b_calculate_psnr) generate_psnr_packet(cpi); #if CONFIG_INTERNAL_STATS - compute_internal_stats(cpi); - cpi->bytes += (int)(*size); + compute_internal_stats(cpi, (int)(*size)); #endif // CONFIG_INTERNAL_STATS // Clear down mmx registers @@ -5448,8 +5529,17 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, cpi->alt_ref_source = source; if (oxcf->arnr_max_frames > 0) { - // Produce the filtered ARF frame. - av1_temporal_filter(cpi, arf_src_index); +// Produce the filtered ARF frame. +#if CONFIG_BGSPRITE + int bgsprite_ret = av1_background_sprite(cpi, arf_src_index); + // Do temporal filter if bgsprite not generated. 
+ if (bgsprite_ret != 0) +#endif // CONFIG_BGSPRITE + av1_temporal_filter(cpi, +#if CONFIG_BGSPRITE + NULL, +#endif // CONFIG_BGSPRITE + arf_src_index); aom_extend_frame_borders(&cpi->alt_ref_buffer); force_src_buffer = &cpi->alt_ref_buffer; } @@ -5489,7 +5579,7 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, if ((last_source = av1_lookahead_peek(cpi->lookahead, -1)) == NULL) return -1; } - + if (cm->current_video_frame > 0) assert(last_source != NULL); // Read in the source frame. source = av1_lookahead_pop(cpi->lookahead, flush); @@ -5501,11 +5591,9 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, check_src_altref(cpi, source); } } - if (source) { - cpi->un_scaled_source = cpi->source = + cpi->unscaled_source = cpi->source = force_src_buffer ? force_src_buffer : &source->img; - cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL; *time_stamp = source->ts_start; @@ -5576,7 +5664,6 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, av1_rc_get_second_pass_params(cpi); } else if (oxcf->pass == 1) { setup_frame_size(cpi); - av1_resize_step(cpi); } #endif @@ -5645,8 +5732,7 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, #if CONFIG_INTERNAL_STATS if (oxcf->pass != 1) { - compute_internal_stats(cpi); - cpi->bytes += (int)(*size); + compute_internal_stats(cpi, (int)(*size)); } #endif // CONFIG_INTERNAL_STATS @@ -5712,9 +5798,10 @@ int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode, int av1_set_size_literal(AV1_COMP *cpi, int width, int height) { AV1_COMMON *cm = &cpi->common; #if CONFIG_HIGHBITDEPTH - check_initial_width(cpi, cm->use_highbitdepth, 1, 1); + check_initial_width(cpi, cm->use_highbitdepth, cm->subsampling_x, + cm->subsampling_y); #else - check_initial_width(cpi, 1, 1); + check_initial_width(cpi, cm->subsampling_x, cm->subsampling_y); #endif // CONFIG_HIGHBITDEPTH if (width <= 0 || height <= 0) return 1; diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h index ee1257c2d..9b98975b7 100644 --- a/third_party/aom/av1/encoder/encoder.h +++ b/third_party/aom/av1/encoder/encoder.h @@ -21,6 +21,7 @@ #include "av1/common/entropymode.h" #include "av1/common/thread_common.h" #include "av1/common/onyxc_int.h" +#include "av1/common/resize.h" #include "av1/encoder/aq_cyclicrefresh.h" #if CONFIG_ANS #include "aom_dsp/ans.h" @@ -52,6 +53,10 @@ extern "C" { #endif +#if CONFIG_SPEED_REFS +#define MIN_SPEED_REFS_BLKSIZE BLOCK_16X16 +#endif // CONFIG_SPEED_REFS + typedef struct { int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS]; int nmv_costs[NMV_CONTEXTS][2][MV_VALS]; @@ -128,7 +133,14 @@ typedef enum { RESIZE_NONE = 0, // No frame resizing allowed. RESIZE_FIXED = 1, // All frames are coded at the specified dimension. RESIZE_DYNAMIC = 2 // Coded size of each frame is determined by the codec. -} RESIZE_TYPE; +} RESIZE_MODE; +#if CONFIG_FRAME_SUPERRES +typedef enum { + SUPERRES_NONE = 0, + SUPERRES_FIXED = 1, + SUPERRES_DYNAMIC = 2 +} SUPERRES_MODE; +#endif // CONFIG_FRAME_SUPERRES typedef struct AV1EncoderConfig { BITSTREAM_PROFILE profile; @@ -190,22 +202,22 @@ typedef struct AV1EncoderConfig { int qm_minlevel; int qm_maxlevel; #endif -#if CONFIG_TILE_GROUPS unsigned int num_tile_groups; unsigned int mtu; -#endif #if CONFIG_TEMPMV_SIGNALING unsigned int disable_tempmv; #endif // Internal frame size scaling. 
- RESIZE_TYPE resize_mode; - int scaled_frame_width; - int scaled_frame_height; + RESIZE_MODE resize_mode; + uint8_t resize_scale_numerator; + uint8_t resize_kf_scale_numerator; #if CONFIG_FRAME_SUPERRES - // Frame Super-Resolution size scaling - int superres_enabled; + // Frame Super-Resolution size scaling. + SUPERRES_MODE superres_mode; + uint8_t superres_scale_numerator; + uint8_t superres_kf_scale_numerator; #endif // CONFIG_FRAME_SUPERRES // Enable feature to reduce the frame quantization every x frames. @@ -265,6 +277,10 @@ typedef struct AV1EncoderConfig { int use_highbitdepth; #endif aom_color_space_t color_space; +#if CONFIG_COLORSPACE_HEADERS + aom_transfer_function_t transfer_function; + aom_chroma_sample_position_t chroma_sample_position; +#endif int color_range; int render_width; int render_height; @@ -276,7 +292,8 @@ typedef struct AV1EncoderConfig { int ans_window_size_log2; #endif // CONFIG_ANS && ANS_MAX_SYMBOLS #if CONFIG_EXT_TILE - unsigned int tile_encoding_mode; + unsigned int large_scale_tile; + unsigned int single_tile_decoding; #endif // CONFIG_EXT_TILE unsigned int motion_vector_unit_test; @@ -289,8 +306,8 @@ static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) { // TODO(jingning) All spatially adaptive variables should go to TileDataEnc. typedef struct TileDataEnc { TileInfo tile_info; - int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; - int mode_map[BLOCK_SIZES][MAX_MODES]; + int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES]; + int mode_map[BLOCK_SIZES_ALL][MAX_MODES]; int m_search_count; int ex_search_count; #if CONFIG_PVQ @@ -299,9 +316,7 @@ typedef struct TileDataEnc { #if CONFIG_CFL CFL_CTX cfl; #endif -#if CONFIG_EC_ADAPT DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); -#endif } TileDataEnc; typedef struct RD_COUNTS { @@ -311,6 +326,8 @@ typedef struct RD_COUNTS { // Stores number of 4x4 blocks using global motion per reference frame. int global_motion_used[TOTAL_REFS_PER_FRAME]; #endif // CONFIG_GLOBAL_MOTION + int single_ref_used_flag; + int compound_ref_used_flag; } RD_COUNTS; typedef struct ThreadData { @@ -372,18 +389,11 @@ typedef struct AV1_COMP { YV12_BUFFER_CONFIG *source; YV12_BUFFER_CONFIG *last_source; // NULL for first frame and alt_ref frames - YV12_BUFFER_CONFIG *un_scaled_source; + YV12_BUFFER_CONFIG *unscaled_source; YV12_BUFFER_CONFIG scaled_source; YV12_BUFFER_CONFIG *unscaled_last_source; YV12_BUFFER_CONFIG scaled_last_source; - // Up-sampled reference buffers - // NOTE(zoeliu): It is needed to allocate sufficient space to the up-sampled - // reference buffers, which should include the up-sampled version of all the - // possibly stored references plus the currently coded frame itself. - EncRefCntBuffer upsampled_ref_bufs[REF_FRAMES + 1]; - int upsampled_ref_idx[REF_FRAMES + 1]; - // For a still frame, this flag is set to 1 to skip partition search. int partition_search_skippable_frame; @@ -471,7 +481,7 @@ typedef struct AV1_COMP { fractional_mv_step_fp *find_fractional_mv_step; av1_full_search_fn_t full_search_sad; // It is currently unused. 
av1_diamond_search_fn_t diamond_search_sad; - aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; + aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL]; uint64_t time_receive_data; uint64_t time_compress_data; uint64_t time_pick_lpf; @@ -538,17 +548,24 @@ typedef struct AV1_COMP { #if CONFIG_EXT_INTER unsigned int inter_compound_mode_cost[INTER_MODE_CONTEXTS] [INTER_COMPOUND_MODES]; +#if CONFIG_COMPOUND_SINGLEREF + unsigned int inter_singleref_comp_mode_cost[INTER_MODE_CONTEXTS] + [INTER_SINGLEREF_COMP_MODES]; +#endif // CONFIG_COMPOUND_SINGLEREF #if CONFIG_INTERINTRA unsigned int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; #endif // CONFIG_INTERINTRA #endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - int motion_mode_cost[BLOCK_SIZES][MOTION_MODES]; + int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES]; #if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - int motion_mode_cost1[BLOCK_SIZES][2]; + int motion_mode_cost1[BLOCK_SIZES_ALL][2]; #endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION +#if CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT + int ncobmc_mode_cost[ADAPT_OVERLAP_BLOCKS][MAX_NCOBMC_MODES]; +#endif // CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - int intra_uv_mode_cost[INTRA_MODES][INTRA_MODES]; + int intra_uv_mode_cost[INTRA_MODES][UV_INTRA_MODES]; int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; #if CONFIG_EXT_PARTITION_TYPES @@ -601,18 +618,10 @@ typedef struct AV1_COMP { TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; int resize_state; - int resize_scale_num; - int resize_scale_den; - int resize_next_scale_num; - int resize_next_scale_den; int resize_avg_qp; int resize_buffer_underflow; int resize_count; -#if CONFIG_FRAME_SUPERRES - int superres_pending; -#endif // CONFIG_FRAME_SUPERRES - // VARIANCE_AQ segment map refresh int vaq_refresh; @@ -640,6 +649,15 @@ typedef struct AV1_COMP { #if CONFIG_LV_MAP tran_low_t *tcoeff_buf[MAX_MB_PLANE]; #endif + +#if CONFIG_SPEED_REFS + int sb_scanning_pass_idx; +#endif // CONFIG_SPEED_REFS + +#if CONFIG_FLEX_REFS + int extra_arf_allowed; + int bwd_ref_allowed; +#endif // CONFIG_FLEX_REFS } AV1_COMP; void av1_initialize_enc(void); @@ -729,14 +747,6 @@ static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( : NULL; } -static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref( - const AV1_COMP *cpi, const MV_REFERENCE_FRAME ref_frame) { - // Use up-sampled reference frames. - const int buf_idx = - cpi->upsampled_ref_idx[get_ref_frame_map_idx(cpi, ref_frame)]; - return &cpi->upsampled_ref_bufs[buf_idx].buf; -} - #if CONFIG_EXT_REFS || CONFIG_TEMPMV_SIGNALING static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) { MV_REFERENCE_FRAME ref_frame; @@ -831,23 +841,22 @@ static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx, ubufs[new_uidx].ref_count++; } -// Returns 1 if a resize is pending and 0 otherwise. -static INLINE int av1_resize_pending(const struct AV1_COMP *cpi) { - return cpi->resize_scale_num != cpi->resize_next_scale_num || - cpi->resize_scale_den != cpi->resize_next_scale_den; -} - // Returns 1 if a frame is unscaled and 0 otherwise. 
-static INLINE int av1_resize_unscaled(const struct AV1_COMP *cpi) { - return cpi->resize_scale_num == cpi->resize_scale_den; +static INLINE int av1_resize_unscaled(const AV1_COMMON *cm) { +#if CONFIG_FRAME_SUPERRES + return cm->superres_upscaled_width == cm->render_width && + cm->superres_upscaled_height == cm->render_height; +#else + return cm->width == cm->render_width && cm->height == cm->render_height; +#endif // CONFIG_FRAME_SUPERRES } -// Moves resizing to the next state. This is just setting the numerator and -// denominator to the next numerator and denominator, causing -// av1_resize_pending to subsequently return false. -static INLINE void av1_resize_step(struct AV1_COMP *cpi) { - cpi->resize_scale_num = cpi->resize_next_scale_num; - cpi->resize_scale_den = cpi->resize_next_scale_den; +static INLINE int av1_frame_unscaled(const AV1_COMMON *cm) { +#if CONFIG_FRAME_SUPERRES + return av1_superres_unscaled(cm) && av1_resize_unscaled(cm); +#else + return av1_resize_unscaled(cm); +#endif // CONFIG_FRAME_SUPERRES } #ifdef __cplusplus diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c index 731642064..3aa4c183e 100644 --- a/third_party/aom/av1/encoder/encodetxb.c +++ b/third_party/aom/av1/encoder/encodetxb.c @@ -70,38 +70,43 @@ static void write_golomb(aom_writer *w, int level) { } void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, - aom_writer *w, int block, int plane, - const tran_low_t *tcoeff, uint16_t eob, - TXB_CTX *txb_ctx) { + aom_writer *w, int blk_row, int blk_col, int block, + int plane, TX_SIZE tx_size, const tran_low_t *tcoeff, + uint16_t eob, TXB_CTX *txb_ctx) { aom_prob *nz_map; aom_prob *eob_flag; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_SIZE tx_size = get_tx_size(plane, xd); - const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); - const SCAN_ORDER *const scan_order = - get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + const TX_SIZE txs_ctx = get_txsize_context(tx_size); + const TX_TYPE tx_type = + av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); + const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); const int16_t *scan = scan_order->scan; + const int16_t *iscan = scan_order->iscan; int c; int is_nz; const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; + const int height = tx_size_high[tx_size]; const int seg_eob = tx_size_2d[tx_size]; - uint8_t txb_mask[32 * 32] = { 0 }; uint16_t update_eob = 0; - aom_write(w, eob == 0, cm->fc->txb_skip[tx_size][txb_ctx->txb_skip_ctx]); + (void)blk_row; + (void)blk_col; + + aom_write(w, eob == 0, cm->fc->txb_skip[txs_ctx][txb_ctx->txb_skip_ctx]); if (eob == 0) return; #if CONFIG_TXK_SEL - av1_write_tx_type(cm, xd, block, plane, w); + av1_write_tx_type(cm, xd, blk_row, blk_col, block, plane, + get_min_tx_size(tx_size), w); #endif - nz_map = cm->fc->nz_map[tx_size][plane_type]; - eob_flag = cm->fc->eob_flag[tx_size][plane_type]; + nz_map = cm->fc->nz_map[txs_ctx][plane_type]; + eob_flag = cm->fc->eob_flag[txs_ctx][plane_type]; for (c = 0; c < eob; ++c) { - int coeff_ctx = get_nz_map_ctx(tcoeff, txb_mask, scan[c], bwl); - int eob_ctx = get_eob_ctx(tcoeff, scan[c], bwl); + int coeff_ctx = get_nz_map_ctx(tcoeff, scan[c], bwl, height, iscan); + int eob_ctx = get_eob_ctx(tcoeff, scan[c], txs_ctx); tran_low_t v = tcoeff[scan[c]]; is_nz = (v != 0); @@ -113,12 +118,11 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, if 
(is_nz) { aom_write(w, c == (eob - 1), eob_flag[eob_ctx]); } - txb_mask[scan[c]] = 1; } int i; for (i = 0; i < NUM_BASE_LEVELS; ++i) { - aom_prob *coeff_base = cm->fc->coeff_base[tx_size][plane_type][i]; + aom_prob *coeff_base = cm->fc->coeff_base[txs_ctx][plane_type][i]; update_eob = 0; for (c = eob - 1; c >= 0; --c) { @@ -129,7 +133,7 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, if (level <= i) continue; - ctx = get_base_ctx(tcoeff, scan[c], bwl, i + 1); + ctx = get_base_ctx(tcoeff, scan[c], bwl, height, i + 1); if (level == i + 1) { aom_write(w, 1, coeff_base[ctx]); @@ -161,13 +165,13 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, } // level is above 1. - ctx = get_br_ctx(tcoeff, scan[c], bwl); + ctx = get_br_ctx(tcoeff, scan[c], bwl, height); for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) { if (level == (idx + 1 + NUM_BASE_LEVELS)) { - aom_write(w, 1, cm->fc->coeff_lps[tx_size][plane_type][ctx]); + aom_write(w, 1, cm->fc->coeff_lps[txs_ctx][plane_type][ctx]); break; } - aom_write(w, 0, cm->fc->coeff_lps[tx_size][plane_type][ctx]); + aom_write(w, 0, cm->fc->coeff_lps[txs_ctx][plane_type][ctx]); } if (idx < COEFF_BASE_RANGE) continue; @@ -183,7 +187,10 @@ void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE bsize = mbmi->sb_type; struct macroblockd_plane *pd = &xd->plane[plane]; -#if CONFIG_CB4X4 +#if CONFIG_CHROMA_SUB8X8 + const BLOCK_SIZE plane_bsize = + AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); +#elif CONFIG_CB4X4 const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); #else const BLOCK_SIZE plane_bsize = @@ -191,7 +198,7 @@ void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, #endif const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); const int max_blocks_high = max_block_high(xd, plane_bsize, plane); - TX_SIZE tx_size = get_tx_size(plane, xd); + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); const int bkw = tx_size_wide_unit[tx_size]; const int bkh = tx_size_high_unit[tx_size]; const int step = tx_size_wide_unit[tx_size] * tx_size_high_unit[tx_size]; @@ -203,7 +210,8 @@ void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, uint16_t eob = x->mbmi_ext->eobs[plane][block]; TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block], x->mbmi_ext->dc_sign_ctx[plane][block] }; - av1_write_coeffs_txb(cm, xd, w, block, plane, tcoeff, eob, &txb_ctx); + av1_write_coeffs_txb(cm, xd, w, row, col, block, plane, tx_size, tcoeff, + eob, &txb_ctx); block += step; } } @@ -211,7 +219,7 @@ void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, static INLINE void get_base_ctx_set(const tran_low_t *tcoeffs, int c, // raster order - const int bwl, + const int bwl, const int height, int ctx_set[NUM_BASE_LEVELS]) { const int row = c >> bwl; const int col = c - (row << bwl); @@ -226,7 +234,7 @@ static INLINE void get_base_ctx_set(const tran_low_t *tcoeffs, int ref_col = col + base_ref_offset[idx][1]; int pos = (ref_row << bwl) + ref_col; - if (ref_row < 0 || ref_col < 0 || ref_row >= stride || ref_col >= stride) + if (ref_row < 0 || ref_col < 0 || ref_row >= height || ref_col >= stride) continue; abs_coeff = abs(tcoeffs[pos]); @@ -280,12 +288,14 @@ static INLINE int get_base_cost(tran_low_t abs_qc, int ctx, } int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, - int block, TXB_CTX *txb_ctx) { + int blk_row, int blk_col, int block, TX_SIZE tx_size, + TXB_CTX *txb_ctx) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD 
*const xd = &x->e_mbd; - const TX_SIZE tx_size = get_tx_size(plane, xd); + TX_SIZE txs_ctx = get_txsize_context(tx_size); const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + const TX_TYPE tx_type = + av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; const struct macroblock_plane *p = &x->plane[plane]; const int eob = p->eobs[block]; @@ -293,27 +303,26 @@ int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, int c, cost; const int seg_eob = AOMMIN(eob, tx_size_2d[tx_size] - 1); int txb_skip_ctx = txb_ctx->txb_skip_ctx; - aom_prob *nz_map = xd->fc->nz_map[tx_size][plane_type]; + aom_prob *nz_map = xd->fc->nz_map[txs_ctx][plane_type]; const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - // txb_mask is only initialized for once here. After that, it will be set when - // coding zero map and then reset when coding level 1 info. - uint8_t txb_mask[32 * 32] = { 0 }; + const int height = tx_size_high[tx_size]; + aom_prob(*coeff_base)[COEFF_BASE_CONTEXTS] = - xd->fc->coeff_base[tx_size][plane_type]; + xd->fc->coeff_base[txs_ctx][plane_type]; - const SCAN_ORDER *const scan_order = - get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); const int16_t *scan = scan_order->scan; + const int16_t *iscan = scan_order->iscan; cost = 0; if (eob == 0) { - cost = av1_cost_bit(xd->fc->txb_skip[tx_size][txb_skip_ctx], 1); + cost = av1_cost_bit(xd->fc->txb_skip[txs_ctx][txb_skip_ctx], 1); return cost; } - cost = av1_cost_bit(xd->fc->txb_skip[tx_size][txb_skip_ctx], 0); + cost = av1_cost_bit(xd->fc->txb_skip[txs_ctx][txb_skip_ctx], 0); #if CONFIG_TXK_SEL cost += av1_tx_type_cost(cpi, xd, mbmi->sb_type, plane, tx_size, tx_type); @@ -325,7 +334,7 @@ int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, int level = abs(v); if (c < seg_eob) { - int coeff_ctx = get_nz_map_ctx(qcoeff, txb_mask, scan[c], bwl); + int coeff_ctx = get_nz_map_ctx(qcoeff, scan[c], bwl, height, iscan); cost += av1_cost_bit(nz_map[coeff_ctx], is_nz); } @@ -342,7 +351,7 @@ int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, cost += av1_cost_bit(128, sign); } - get_base_ctx_set(qcoeff, scan[c], bwl, ctx_ls); + get_base_ctx_set(qcoeff, scan[c], bwl, height, ctx_ls); int i; for (i = 0; i < NUM_BASE_LEVELS; ++i) { @@ -359,15 +368,15 @@ int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, int idx; int ctx; - ctx = get_br_ctx(qcoeff, scan[c], bwl); + ctx = get_br_ctx(qcoeff, scan[c], bwl, height); for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) { if (level == (idx + 1 + NUM_BASE_LEVELS)) { cost += - av1_cost_bit(xd->fc->coeff_lps[tx_size][plane_type][ctx], 1); + av1_cost_bit(xd->fc->coeff_lps[txs_ctx][plane_type][ctx], 1); break; } - cost += av1_cost_bit(xd->fc->coeff_lps[tx_size][plane_type][ctx], 0); + cost += av1_cost_bit(xd->fc->coeff_lps[txs_ctx][plane_type][ctx], 0); } if (idx >= COEFF_BASE_RANGE) { @@ -389,13 +398,11 @@ int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, } if (c < seg_eob) { - int eob_ctx = get_eob_ctx(qcoeff, scan[c], bwl); - cost += av1_cost_bit(xd->fc->eob_flag[tx_size][plane_type][eob_ctx], + int eob_ctx = get_eob_ctx(qcoeff, scan[c], txs_ctx); + cost += av1_cost_bit(xd->fc->eob_flag[txs_ctx][plane_type][eob_ctx], c == (eob - 1)); } } - - txb_mask[scan[c]] = 1; } return cost; @@ -409,26 
+416,26 @@ static INLINE int has_base(tran_low_t qc, int base_idx) { static void gen_base_count_mag_arr(int (*base_count_arr)[MAX_TX_SQUARE], int (*base_mag_arr)[2], const tran_low_t *qcoeff, int stride, - int eob, const int16_t *scan) { + int height, int eob, const int16_t *scan) { for (int c = 0; c < eob; ++c) { const int coeff_idx = scan[c]; // raster order if (!has_base(qcoeff[coeff_idx], 0)) continue; const int row = coeff_idx / stride; const int col = coeff_idx % stride; int *mag = base_mag_arr[coeff_idx]; - get_mag(mag, qcoeff, stride, row, col, base_ref_offset, + get_mag(mag, qcoeff, stride, height, row, col, base_ref_offset, BASE_CONTEXT_POSITION_NUM); for (int i = 0; i < NUM_BASE_LEVELS; ++i) { if (!has_base(qcoeff[coeff_idx], i)) continue; int *count = base_count_arr[i] + coeff_idx; - *count = get_level_count(qcoeff, stride, row, col, i, base_ref_offset, - BASE_CONTEXT_POSITION_NUM); + *count = get_level_count(qcoeff, stride, height, row, col, i, + base_ref_offset, BASE_CONTEXT_POSITION_NUM); } } } static void gen_nz_count_arr(int(*nz_count_arr), const tran_low_t *qcoeff, - int stride, int eob, + int stride, int height, int eob, const SCAN_ORDER *scan_order) { const int16_t *scan = scan_order->scan; const int16_t *iscan = scan_order->iscan; @@ -436,7 +443,8 @@ static void gen_nz_count_arr(int(*nz_count_arr), const tran_low_t *qcoeff, const int coeff_idx = scan[c]; // raster order const int row = coeff_idx / stride; const int col = coeff_idx % stride; - nz_count_arr[coeff_idx] = get_nz_count(qcoeff, stride, row, col, iscan); + nz_count_arr[coeff_idx] = + get_nz_count(qcoeff, stride, height, row, col, iscan); } } @@ -478,8 +486,8 @@ static INLINE int has_br(tran_low_t qc) { } static void gen_br_count_mag_arr(int *br_count_arr, int (*br_mag_arr)[2], - const tran_low_t *qcoeff, int stride, int eob, - const int16_t *scan) { + const tran_low_t *qcoeff, int stride, + int height, int eob, const int16_t *scan) { for (int c = 0; c < eob; ++c) { const int coeff_idx = scan[c]; // raster order if (!has_br(qcoeff[coeff_idx])) continue; @@ -487,9 +495,9 @@ static void gen_br_count_mag_arr(int *br_count_arr, int (*br_mag_arr)[2], const int col = coeff_idx % stride; int *count = br_count_arr + coeff_idx; int *mag = br_mag_arr[coeff_idx]; - *count = get_level_count(qcoeff, stride, row, col, NUM_BASE_LEVELS, + *count = get_level_count(qcoeff, stride, height, row, col, NUM_BASE_LEVELS, br_ref_offset, BR_CONTEXT_POSITION_NUM); - get_mag(mag, qcoeff, stride, row, col, br_ref_offset, + get_mag(mag, qcoeff, stride, height, row, col, br_ref_offset, BR_CONTEXT_POSITION_NUM); } } @@ -543,18 +551,19 @@ static INLINE int get_golomb_cost(int abs_qc) { void gen_txb_cache(TxbCache *txb_cache, TxbInfo *txb_info) { const int16_t *scan = txb_info->scan_order->scan; gen_nz_count_arr(txb_cache->nz_count_arr, txb_info->qcoeff, txb_info->stride, - txb_info->eob, txb_info->scan_order); + txb_info->height, txb_info->eob, txb_info->scan_order); gen_nz_ctx_arr(txb_cache->nz_ctx_arr, txb_cache->nz_count_arr, txb_info->qcoeff, txb_info->bwl, txb_info->eob, txb_info->scan_order); gen_base_count_mag_arr(txb_cache->base_count_arr, txb_cache->base_mag_arr, - txb_info->qcoeff, txb_info->stride, txb_info->eob, - scan); + txb_info->qcoeff, txb_info->stride, txb_info->height, + txb_info->eob, scan); gen_base_ctx_arr(txb_cache->base_ctx_arr, txb_cache->base_count_arr, txb_cache->base_mag_arr, txb_info->qcoeff, txb_info->stride, txb_info->eob, scan); gen_br_count_mag_arr(txb_cache->br_count_arr, txb_cache->br_mag_arr, - 
txb_info->qcoeff, txb_info->stride, txb_info->eob, scan); + txb_info->qcoeff, txb_info->stride, txb_info->height, + txb_info->eob, scan); gen_br_ctx_arr(txb_cache->br_ctx_arr, txb_cache->br_count_arr, txb_cache->br_mag_arr, txb_info->qcoeff, txb_info->stride, txb_info->eob, scan); @@ -781,7 +790,7 @@ static int try_self_level_down(tran_low_t *low_coeff, int coeff_idx, if (scan_idx < txb_info->seg_eob) { const int eob_ctx = - get_eob_ctx(txb_info->qcoeff, coeff_idx, txb_info->bwl); + get_eob_ctx(txb_info->qcoeff, coeff_idx, txb_info->txs_ctx); cost_diff -= av1_cost_bit(txb_probs->eob_flag[eob_ctx], scan_idx == (txb_info->eob - 1)); } @@ -853,9 +862,13 @@ int try_level_down(int coeff_idx, const TxbCache *txb_cache, const int nb_row = row - sig_ref_offset[i][0]; const int nb_col = col - sig_ref_offset[i][1]; const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + + if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height && + nb_col < txb_info->stride)) + continue; + const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < eob && nb_row >= 0 && nb_col >= 0 && - nb_row < txb_info->stride && nb_col < txb_info->stride) { + if (nb_scan_idx < eob) { const int cost_diff = try_neighbor_level_down_nz( nb_coeff_idx, coeff_idx, txb_cache, txb_probs, txb_info); if (cost_map) @@ -871,9 +884,13 @@ int try_level_down(int coeff_idx, const TxbCache *txb_cache, const int nb_row = row - base_ref_offset[i][0]; const int nb_col = col - base_ref_offset[i][1]; const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + + if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height && + nb_col < txb_info->stride)) + continue; + const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < eob && nb_row >= 0 && nb_col >= 0 && - nb_row < txb_info->stride && nb_col < txb_info->stride) { + if (nb_scan_idx < eob) { const int cost_diff = try_neighbor_level_down_base( nb_coeff_idx, coeff_idx, txb_cache, txb_probs, txb_info); if (cost_map) @@ -889,9 +906,13 @@ int try_level_down(int coeff_idx, const TxbCache *txb_cache, const int nb_row = row - br_ref_offset[i][0]; const int nb_col = col - br_ref_offset[i][1]; const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + + if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height && + nb_col < txb_info->stride)) + continue; + const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < eob && nb_row >= 0 && nb_col >= 0 && - nb_row < txb_info->stride && nb_col < txb_info->stride) { + if (nb_scan_idx < eob) { const int cost_diff = try_neighbor_level_down_br( nb_coeff_idx, coeff_idx, txb_cache, txb_probs, txb_info); if (cost_map) @@ -925,7 +946,7 @@ static int get_low_coeff_cost(int coeff_idx, const TxbCache *txb_cache, cost += get_base_cost(abs_qc, ctx, txb_probs->coeff_base, base_idx); if (scan_idx < txb_info->seg_eob) { const int eob_ctx = - get_eob_ctx(txb_info->qcoeff, coeff_idx, txb_info->bwl); + get_eob_ctx(txb_info->qcoeff, coeff_idx, txb_info->txs_ctx); cost += av1_cost_bit(txb_probs->eob_flag[eob_ctx], scan_idx == (txb_info->eob - 1)); } @@ -982,7 +1003,7 @@ int try_change_eob(int *new_eob, int coeff_idx, const TxbCache *txb_cache, // Note that get_eob_ctx does NOT actually account for qcoeff, so we don't // need to lower down the qcoeff here const int eob_ctx = - get_eob_ctx(txb_info->qcoeff, scan[*new_eob - 1], txb_info->bwl); + get_eob_ctx(txb_info->qcoeff, scan[*new_eob - 1], txb_info->txs_ctx); cost_diff -= av1_cost_bit(txb_probs->eob_flag[eob_ctx], 0); cost_diff += av1_cost_bit(txb_probs->eob_flag[eob_ctx], 1); } else { @@ 
-1016,10 +1037,14 @@ void update_level_down(int coeff_idx, TxbCache *txb_cache, TxbInfo *txb_info) { for (int i = 0; i < SIG_REF_OFFSET_NUM; ++i) { const int nb_row = row - sig_ref_offset[i][0]; const int nb_col = col - sig_ref_offset[i][1]; + + if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height && + nb_col < txb_info->stride)) + continue; + const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < eob && nb_row >= 0 && nb_col >= 0 && - nb_row < txb_info->stride && nb_col < txb_info->stride) { + if (nb_scan_idx < eob) { const int scan_idx = iscan[coeff_idx]; if (scan_idx < nb_scan_idx) { const int level = 1; @@ -1030,7 +1055,7 @@ void update_level_down(int coeff_idx, TxbCache *txb_cache, TxbInfo *txb_info) { const int count = txb_cache->nz_count_arr[nb_coeff_idx]; txb_cache->nz_ctx_arr[nb_coeff_idx][0] = get_nz_map_ctx_from_count( count, txb_info->qcoeff, nb_coeff_idx, txb_info->bwl, iscan); - // int ref_ctx = get_nz_map_ctx2(txb_info->qcoeff, nb_coeff_idx, + // int ref_ctx = get_nz_map_ctx(txb_info->qcoeff, nb_coeff_idx, // txb_info->bwl, iscan); // if (ref_ctx != txb_cache->nz_ctx_arr[nb_coeff_idx][0]) // printf("nz ctx %d ref_ctx %d\n", @@ -1043,11 +1068,15 @@ void update_level_down(int coeff_idx, TxbCache *txb_cache, TxbInfo *txb_info) { const int nb_row = row - base_ref_offset[i][0]; const int nb_col = col - base_ref_offset[i][1]; const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + + if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height && + nb_col < txb_info->stride)) + continue; + const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; if (!has_base(nb_coeff, 0)) continue; const int nb_scan_idx = iscan[nb_coeff_idx]; - if (nb_scan_idx < eob && nb_row >= 0 && nb_col >= 0 && - nb_row < txb_info->stride && nb_col < txb_info->stride) { + if (nb_scan_idx < eob) { if (row >= nb_row && col >= nb_col) update_mag_arr(txb_cache->base_mag_arr[nb_coeff_idx], abs_qc); const int mag = @@ -1076,11 +1105,15 @@ void update_level_down(int coeff_idx, TxbCache *txb_cache, TxbInfo *txb_info) { const int nb_row = row - br_ref_offset[i][0]; const int nb_col = col - br_ref_offset[i][1]; const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + + if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height && + nb_col < txb_info->stride)) + continue; + const int nb_scan_idx = iscan[nb_coeff_idx]; const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; if (!has_br(nb_coeff)) continue; - if (nb_scan_idx < eob && nb_row >= 0 && nb_col >= 0 && - nb_row < txb_info->stride && nb_col < txb_info->stride) { + if (nb_scan_idx < eob) { const int level = 1 + NUM_BASE_LEVELS; if (abs_qc == level) { txb_cache->br_count_arr[nb_coeff_idx] -= 1; @@ -1112,8 +1145,8 @@ static int get_coeff_cost(tran_low_t qc, int scan_idx, TxbInfo *txb_info, const int16_t *iscan = txb_info->scan_order->iscan; if (scan_idx < txb_info->seg_eob) { - int coeff_ctx = - get_nz_map_ctx2(txb_info->qcoeff, scan[scan_idx], txb_info->bwl, iscan); + int coeff_ctx = get_nz_map_ctx(txb_info->qcoeff, scan[scan_idx], + txb_info->bwl, txb_info->height, iscan); cost += av1_cost_bit(txb_probs->nz_map[coeff_ctx], is_nz); } @@ -1122,7 +1155,8 @@ static int get_coeff_cost(tran_low_t qc, int scan_idx, TxbInfo *txb_info, txb_ctx->dc_sign_ctx); int ctx_ls[NUM_BASE_LEVELS] = { 0 }; - get_base_ctx_set(txb_info->qcoeff, scan[scan_idx], txb_info->bwl, ctx_ls); + get_base_ctx_set(txb_info->qcoeff, scan[scan_idx], txb_info->bwl, + txb_info->height, ctx_ls); int 
i; for (i = 0; i < NUM_BASE_LEVELS; ++i) { @@ -1130,14 +1164,15 @@ static int get_coeff_cost(tran_low_t qc, int scan_idx, TxbInfo *txb_info, } if (abs_qc > NUM_BASE_LEVELS) { - int ctx = get_br_ctx(txb_info->qcoeff, scan[scan_idx], txb_info->bwl); + int ctx = get_br_ctx(txb_info->qcoeff, scan[scan_idx], txb_info->bwl, + txb_info->height); cost += get_br_cost(abs_qc, ctx, txb_probs->coeff_lps); cost += get_golomb_cost(abs_qc); } if (scan_idx < txb_info->seg_eob) { int eob_ctx = - get_eob_ctx(txb_info->qcoeff, scan[scan_idx], txb_info->bwl); + get_eob_ctx(txb_info->qcoeff, scan[scan_idx], txb_info->txs_ctx); cost += av1_cost_bit(txb_probs->eob_flag[eob_ctx], scan_idx == (txb_info->eob - 1)); } @@ -1323,8 +1358,7 @@ void try_level_down_facade(LevelDownStats *stats, int scan_idx, test_level_down(coeff_idx, txb_cache, txb_probs, txb_info); #endif } - stats->rd_diff = RDCOST(txb_info->rdmult, txb_info->rddiv, stats->cost_diff, - stats->dist_diff); + stats->rd_diff = RDCOST(txb_info->rdmult, stats->cost_diff, stats->dist_diff); if (stats->rd_diff < 0) stats->update = 1; return; } @@ -1424,18 +1458,17 @@ static int optimize_txb(TxbInfo *txb_info, const TxbProbs *txb_probs, // These numbers are empirically obtained. static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { -#if CONFIG_EC_ADAPT { 17, 13 }, { 16, 10 }, -#else - { 20, 12 }, { 16, 12 }, -#endif }; -int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, - TX_SIZE tx_size, TXB_CTX *txb_ctx) { +int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane, + int blk_row, int blk_col, int block, TX_SIZE tx_size, + TXB_CTX *txb_ctx) { MACROBLOCKD *const xd = &x->e_mbd; const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + const TX_SIZE txs_ctx = get_txsize_context(tx_size); + const TX_TYPE tx_type = + av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; const struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; @@ -1445,34 +1478,34 @@ int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block); const int16_t *dequant = pd->dequant; const int seg_eob = AOMMIN(eob, tx_size_2d[tx_size] - 1); - const aom_prob *nz_map = xd->fc->nz_map[tx_size][plane_type]; + const aom_prob *nz_map = xd->fc->nz_map[txs_ctx][plane_type]; const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; const int stride = 1 << bwl; + const int height = tx_size_high[tx_size]; aom_prob(*coeff_base)[COEFF_BASE_CONTEXTS] = - xd->fc->coeff_base[tx_size][plane_type]; + xd->fc->coeff_base[txs_ctx][plane_type]; - const aom_prob *coeff_lps = xd->fc->coeff_lps[tx_size][plane_type]; + const aom_prob *coeff_lps = xd->fc->coeff_lps[txs_ctx][plane_type]; const int is_inter = is_inter_block(mbmi); - const SCAN_ORDER *const scan_order = - get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); const TxbProbs txb_probs = { xd->fc->dc_sign[plane_type], nz_map, coeff_base, coeff_lps, - xd->fc->eob_flag[tx_size][plane_type], - xd->fc->txb_skip[tx_size] }; + xd->fc->eob_flag[txs_ctx][plane_type], + xd->fc->txb_skip[txs_ctx] }; const int shift = av1_get_tx_scale(tx_size); const int64_t rdmult = (x->rdmult * plane_rd_mult[is_inter][plane_type] + 2) >> 2; - const int64_t rddiv = x->rddiv; - TxbInfo txb_info = { qcoeff, dqcoeff, tcoeff, 
dequant, shift, - tx_size, bwl, stride, eob, seg_eob, - scan_order, txb_ctx, rdmult, rddiv }; + TxbInfo txb_info = { qcoeff, dqcoeff, tcoeff, dequant, shift, + tx_size, txs_ctx, bwl, stride, height, + eob, seg_eob, scan_order, txb_ctx, rdmult }; + TxbCache txb_cache; gen_txb_cache(&txb_cache, &txb_info); @@ -1510,9 +1543,9 @@ void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col, const uint16_t eob = p->eobs[block]; const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); const PLANE_TYPE plane_type = pd->plane_type; - const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); - const SCAN_ORDER *const scan_order = - get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + const TX_TYPE tx_type = + av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); + const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); (void)plane_bsize; int cul_level = av1_get_txb_entropy_context(qcoeff, scan_order, eob); @@ -1536,25 +1569,28 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row, const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); const int segment_id = mbmi->segment_id; - const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); - const SCAN_ORDER *const scan_order = - get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + const TX_TYPE tx_type = + av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); + const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); const int16_t *scan = scan_order->scan; + const int16_t *iscan = scan_order->iscan; const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size); int c, i; TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + blk_col, pd->left_context + blk_row, &txb_ctx); const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; + const int height = tx_size_high[tx_size]; int cul_level = 0; unsigned int(*nz_map_count)[SIG_COEF_CONTEXTS][2]; - uint8_t txb_mask[32 * 32] = { 0 }; - nz_map_count = &td->counts->nz_map[tx_size][plane_type]; + TX_SIZE txsize_ctx = get_txsize_context(tx_size); + + nz_map_count = &td->counts->nz_map[txsize_ctx][plane_type]; memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob); - ++td->counts->txb_skip[tx_size][txb_ctx.txb_skip_ctx][eob == 0]; + ++td->counts->txb_skip[txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0]; x->mbmi_ext->txb_skip_ctx[plane][block] = txb_ctx.txb_skip_ctx; x->mbmi_ext->eobs[plane][block] = eob; @@ -1565,24 +1601,23 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row, } #if CONFIG_TXK_SEL - av1_update_tx_type_count(cm, xd, block, plane, mbmi->sb_type, tx_size, - td->counts); + av1_update_tx_type_count(cm, xd, blk_row, blk_col, block, plane, + mbmi->sb_type, get_min_tx_size(tx_size), td->counts); #endif for (c = 0; c < eob; ++c) { tran_low_t v = qcoeff[scan[c]]; int is_nz = (v != 0); - int coeff_ctx = get_nz_map_ctx(tcoeff, txb_mask, scan[c], bwl); - int eob_ctx = get_eob_ctx(tcoeff, scan[c], bwl); + int coeff_ctx = get_nz_map_ctx(tcoeff, scan[c], bwl, height, iscan); + int eob_ctx = get_eob_ctx(tcoeff, scan[c], txsize_ctx); if (c == seg_eob - 1) break; ++(*nz_map_count)[coeff_ctx][is_nz]; if (is_nz) { - ++td->counts->eob_flag[tx_size][plane_type][eob_ctx][c == (eob - 1)]; + ++td->counts->eob_flag[txsize_ctx][plane_type][eob_ctx][c == (eob - 1)]; } - txb_mask[scan[c]] = 1; } // Reverse process order to handle coefficient level and sign. 
@@ -1595,10 +1630,10 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row, if (level <= i) continue; - ctx = get_base_ctx(tcoeff, scan[c], bwl, i + 1); + ctx = get_base_ctx(tcoeff, scan[c], bwl, height, i + 1); if (level == i + 1) { - ++td->counts->coeff_base[tx_size][plane_type][i][ctx][1]; + ++td->counts->coeff_base[txsize_ctx][plane_type][i][ctx][1]; if (c == 0) { int dc_sign_ctx = txb_ctx.dc_sign_ctx; @@ -1608,7 +1643,7 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row, cul_level += level; continue; } - ++td->counts->coeff_base[tx_size][plane_type][i][ctx][0]; + ++td->counts->coeff_base[txsize_ctx][plane_type][i][ctx][0]; update_eob = AOMMAX(update_eob, c); } } @@ -1630,13 +1665,13 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row, } // level is above 1. - ctx = get_br_ctx(tcoeff, scan[c], bwl); + ctx = get_br_ctx(tcoeff, scan[c], bwl, height); for (idx = 0; idx < COEFF_BASE_RANGE; ++idx) { if (level == (idx + 1 + NUM_BASE_LEVELS)) { - ++td->counts->coeff_lps[tx_size][plane_type][ctx][1]; + ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][1]; break; } - ++td->counts->coeff_lps[tx_size][plane_type][ctx][0]; + ++td->counts->coeff_lps[txsize_ctx][plane_type][ctx][0]; } if (idx < COEFF_BASE_RANGE) continue; @@ -1835,46 +1870,74 @@ int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, TX_TYPE txk_end = TX_TYPES - 1; TX_TYPE best_tx_type = txk_start; int64_t best_rd = INT64_MAX; + uint8_t best_eob = 0; const int coeff_ctx = combine_entropy_contexts(*a, *l); + RD_STATS best_rd_stats; TX_TYPE tx_type; + + av1_invalid_rd_stats(&best_rd_stats); + for (tx_type = txk_start; tx_type <= txk_end; ++tx_type) { - if (plane == 0) mbmi->txk_type[block] = tx_type; - TX_TYPE ref_tx_type = - get_tx_type(get_plane_type(plane), xd, block, tx_size); + if (plane == 0) mbmi->txk_type[(blk_row << 4) + blk_col] = tx_type; + TX_TYPE ref_tx_type = av1_get_tx_type(get_plane_type(plane), xd, blk_row, + blk_col, block, tx_size); if (tx_type != ref_tx_type) { - // use get_tx_type() to check if the tx_type is valid for the current mode - // if it's not, we skip it here. + // use av1_get_tx_type() to check if the tx_type is valid for the current + // mode if it's not, we skip it here. 
continue; } + +#if CONFIG_EXT_TX + int is_inter = is_inter_block(mbmi); + int ext_tx_set = get_ext_tx_set(get_min_tx_size(tx_size), mbmi->sb_type, + is_inter, cm->reduced_tx_set_used); + if (!(is_inter && ext_tx_used_inter[ext_tx_set][tx_type]) && + !(!is_inter && ext_tx_used_intra[ext_tx_set][tx_type])) + continue; +#endif // CONFIG_EXT_TX + RD_STATS this_rd_stats; av1_invalid_rd_stats(&this_rd_stats); av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - av1_optimize_b(cm, x, plane, block, plane_bsize, tx_size, a, l); + av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, + a, l); av1_dist_block(cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size, &this_rd_stats.dist, &this_rd_stats.sse, OUTPUT_HAS_PREDICTED_PIXELS); - const SCAN_ORDER *scan_order = - get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); - this_rd_stats.rate = av1_cost_coeffs( - cpi, x, plane, block, tx_size, scan_order, a, l, use_fast_coef_costing); - int rd = - RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist); + const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, mbmi); + this_rd_stats.rate = + av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, tx_size, + scan_order, a, l, use_fast_coef_costing); + int rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + if (rd < best_rd) { best_rd = rd; - *rd_stats = this_rd_stats; + best_rd_stats = this_rd_stats; best_tx_type = tx_type; + best_eob = x->plane[plane].txb_entropy_ctx[block]; } } - if (plane == 0) mbmi->txk_type[block] = best_tx_type; - // TODO(angiebird): Instead of re-call av1_xform_quant and av1_optimize_b, - // copy the best result in the above tx_type search for loop - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - coeff_ctx, AV1_XFORM_QUANT_FP); - av1_optimize_b(cm, x, plane, block, plane_bsize, tx_size, a, l); + + av1_merge_rd_stats(rd_stats, &best_rd_stats); + + // if (x->plane[plane].eobs[block] == 0) + // if (best_tx_type != DCT_DCT) + // exit(0); + + if (best_eob == 0 && is_inter_block(mbmi)) best_tx_type = DCT_DCT; + + if (plane == 0) mbmi->txk_type[(blk_row << 4) + blk_col] = best_tx_type; + x->plane[plane].txb_entropy_ctx[block] = best_eob; + if (!is_inter_block(mbmi)) { // intra mode needs decoded result such that the next transform block // can use it for prediction. 
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + coeff_ctx, AV1_XFORM_QUANT_FP); + av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, + a, l); + av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col, x->plane[plane].eobs[block]); } diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h index 836033a54..cbafe59c9 100644 --- a/third_party/aom/av1/encoder/encodetxb.h +++ b/third_party/aom/av1/encoder/encodetxb.h @@ -30,14 +30,15 @@ typedef struct TxbInfo { const int16_t *dequant; int shift; TX_SIZE tx_size; + TX_SIZE txs_ctx; int bwl; int stride; + int height; int eob; int seg_eob; const SCAN_ORDER *scan_order; TXB_CTX *txb_ctx; int64_t rdmult; - int64_t rddiv; } TxbInfo; typedef struct TxbCache { @@ -66,11 +67,12 @@ typedef struct TxbProbs { void av1_alloc_txb_buf(AV1_COMP *cpi); void av1_free_txb_buf(AV1_COMP *cpi); int av1_cost_coeffs_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, - int block, TXB_CTX *txb_ctx); + int blk_row, int blk_col, int block, TX_SIZE tx_size, + TXB_CTX *txb_ctx); void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, - aom_writer *w, int block, int plane, - const tran_low_t *tcoeff, uint16_t eob, - TXB_CTX *txb_ctx); + aom_writer *w, int blk_row, int blk_col, int block, + int plane, TX_SIZE tx_size, const tran_low_t *tcoeff, + uint16_t eob, TXB_CTX *txb_ctx); void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, aom_writer *w, int plane); int av1_get_txb_entropy_context(const tran_low_t *qcoeff, @@ -95,8 +97,9 @@ int64_t av1_search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l, int use_fast_coef_costing, RD_STATS *rd_stats); #endif -int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, - TX_SIZE tx_size, TXB_CTX *txb_ctx); +int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane, + int blk_row, int blk_col, int block, TX_SIZE tx_size, + TXB_CTX *txb_ctx); #ifdef __cplusplus } #endif diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c index 7af5f78b6..1aa1d52a2 100644 --- a/third_party/aom/av1/encoder/ethread.c +++ b/third_party/aom/av1/encoder/ethread.c @@ -26,6 +26,10 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { td_t->rd_counts.global_motion_used[i]; #endif // CONFIG_GLOBAL_MOTION + td->rd_counts.compound_ref_used_flag |= + td_t->rd_counts.compound_ref_used_flag; + td->rd_counts.single_ref_used_flag |= td_t->rd_counts.single_ref_used_flag; + for (i = 0; i < TX_SIZES; i++) for (j = 0; j < PLANE_TYPES; j++) for (k = 0; k < REF_TYPES; k++) @@ -122,11 +126,9 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { #if CONFIG_PALETTE // Allocate buffers used by palette coding mode. 
- if (cpi->common.allow_screen_content_tools) { - CHECK_MEM_ERROR( - cm, thread_data->td->palette_buffer, - aom_memalign(16, sizeof(*thread_data->td->palette_buffer))); - } + CHECK_MEM_ERROR( + cm, thread_data->td->palette_buffer, + aom_memalign(16, sizeof(*thread_data->td->palette_buffer))); #endif // CONFIG_PALETTE // Create threads @@ -168,7 +170,7 @@ void av1_encode_tiles_mt(AV1_COMP *cpi) { } #if CONFIG_PALETTE - if (cpi->common.allow_screen_content_tools && i < num_workers - 1) + if (i < num_workers - 1) thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer; #endif // CONFIG_PALETTE } diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c index 7a0abba2d..e7d78d83e 100644 --- a/third_party/aom/av1/encoder/firstpass.c +++ b/third_party/aom/av1/encoder/firstpass.c @@ -456,6 +456,31 @@ static void set_first_pass_params(AV1_COMP *cpi) { cpi->rc.frames_to_key = INT_MAX; } +#if CONFIG_FLEX_REFS +static double raw_motion_error_stdev(int *raw_motion_err_list, + int raw_motion_err_counts) { + int64_t sum_raw_err = 0; + double raw_err_avg = 0; + double raw_err_stdev = 0; + if (raw_motion_err_counts == 0) return 0; + + int i; + for (i = 0; i < raw_motion_err_counts; i++) { + sum_raw_err += raw_motion_err_list[i]; + } + raw_err_avg = sum_raw_err / raw_motion_err_counts; + for (i = 0; i < raw_motion_err_counts; i++) { + raw_err_stdev += (raw_motion_err_list[i] - raw_err_avg) * + (raw_motion_err_list[i] - raw_err_avg); + } + // Calculate the standard deviation for the motion error of all the inter + // blocks of the 0,0 motion using the last source + // frame as the reference. + raw_err_stdev = sqrt(raw_err_stdev / raw_motion_err_counts); + return raw_err_stdev; +} +#endif // CONFIG_FLEX_REFS + #define UL_INTRA_THRESH 50 #define INVALID_ROW -1 void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { @@ -506,6 +531,13 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { od_adapt_ctx pvq_context; #endif +#if CONFIG_FLEX_REFS + int *raw_motion_err_list; + int raw_motion_err_counts = 0; + CHECK_MEM_ERROR( + cm, raw_motion_err_list, + aom_calloc(cm->mb_rows * cm->mb_cols, sizeof(*raw_motion_err_list))); +#endif // CONFIG_FLEX_REFS // First pass code requires valid last and new frame buffers. assert(new_yv12 != NULL); assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); @@ -968,6 +1000,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { } } } +#if CONFIG_FLEX_REFS + raw_motion_err_list[raw_motion_err_counts++] = raw_motion_error; +#endif // CONFIG_FLEX_REFS } else { sr_coded_error += (int64_t)this_error; } @@ -981,7 +1016,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { recon_yoffset += 16; recon_uvoffset += uv_mb_height; } - // Adjust to the next row of MBs. 
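Note on the new raw_motion_error_stdev() helper introduced in this firstpass.c hunk: it reduces the per-macroblock zero-motion errors collected in raw_motion_err_list to a single spread statistic for the frame. A minimal standalone sketch of the same mean/standard-deviation reduction follows; the function name, buffer name, and the use of double for the mean are illustrative assumptions rather than the patch's exact arithmetic.

#include <math.h>

/* Sketch: population standard deviation of per-MB zero-motion errors.
 * 'errors' and 'count' stand in for raw_motion_err_list and
 * raw_motion_err_counts from the first-pass MB loop. */
static double motion_error_stdev_sketch(const int *errors, int count) {
  if (count == 0) return 0.0;
  double sum = 0.0;
  for (int i = 0; i < count; ++i) sum += errors[i];
  const double mean = sum / count;
  double var = 0.0;
  for (int i = 0; i < count; ++i) {
    const double d = errors[i] - mean;
    var += d * d;
  }
  return sqrt(var / count);  /* divide by count, as the patch does */
}

A low value means the (0,0)-motion prediction error is nearly uniform across the frame, which is the "still scene" signal the CONFIG_FLEX_REFS logic later keys on.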
x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols; x->plane[1].src.buf += @@ -991,7 +1025,10 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { aom_clear_system_state(); } - +#if CONFIG_FLEX_REFS + const double raw_err_stdev = + raw_motion_error_stdev(raw_motion_err_list, raw_motion_err_counts); +#endif // CONFIG_FLEX_REFS #if CONFIG_PVQ #if !CONFIG_ANS od_ec_enc_clear(&x->daala_enc.w.ec); @@ -1045,6 +1082,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { fps.intra_skip_pct = (double)intra_skip_count / num_mbs; fps.inactive_zone_rows = (double)image_data_start_row; fps.inactive_zone_cols = (double)0; // TODO(paulwilkins): fix +#if CONFIG_FLEX_REFS + fps.raw_error_stdev = raw_err_stdev; +#endif // CONFIG_FLEX_REFS if (mvcount > 0) { fps.MVr = (double)sum_mvr / mvcount; @@ -1231,27 +1271,6 @@ static void setup_rf_level_maxq(AV1_COMP *cpi) { } } -void av1_calculate_next_scaled_size(const AV1_COMP *cpi, - int *scaled_frame_width, - int *scaled_frame_height) { - *scaled_frame_width = - cpi->oxcf.width * cpi->resize_next_scale_num / cpi->resize_next_scale_den; - *scaled_frame_height = cpi->oxcf.height * cpi->resize_next_scale_num / - cpi->resize_next_scale_den; -} - -#if CONFIG_FRAME_SUPERRES -void av1_calculate_superres_size(const AV1_COMP *cpi, int *encoded_width, - int *encoded_height) { - *encoded_width = cpi->oxcf.scaled_frame_width * - cpi->common.superres_scale_numerator / - SUPERRES_SCALE_DENOMINATOR; - *encoded_height = cpi->oxcf.scaled_frame_height * - cpi->common.superres_scale_numerator / - SUPERRES_SCALE_DENOMINATOR; -} -#endif // CONFIG_FRAME_SUPERRES - void av1_init_second_pass(AV1_COMP *cpi) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; TWO_PASS *const twopass = &cpi->twopass; @@ -1673,6 +1692,9 @@ static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, // (3) The bi-predictive group interval is strictly smaller than the // golden group interval. const int is_bipred_enabled = +#if CONFIG_FLEX_REFS + cpi->bwd_ref_allowed && +#endif rc->source_alt_ref_pending && rc->bipred_group_interval && rc->bipred_group_interval <= (rc->baseline_gf_interval - rc->source_alt_ref_pending); @@ -2046,6 +2068,11 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { const int is_key_frame = frame_is_intra_only(cm); const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; +#if CONFIG_FLEX_REFS + cpi->extra_arf_allowed = 1; + cpi->bwd_ref_allowed = 1; +#endif + // Reset the GF group data structures unless this is a key // frame in which case it will already have been done. 
if (is_key_frame == 0) { @@ -2106,6 +2133,12 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } +#if CONFIG_FLEX_REFS + double avg_sr_coded_error = 0; + double avg_raw_err_stdev = 0; + int non_zero_stdev_count = 0; +#endif // CONFIG_FLEX_REFS + i = 0; while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) { ++i; @@ -2129,6 +2162,14 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { accumulate_frame_motion_stats( &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); +#if CONFIG_FLEX_REFS + // sum up the metric values of current gf group + avg_sr_coded_error += next_frame.sr_coded_error; + if (next_frame.raw_error_stdev) { + non_zero_stdev_count++; + avg_raw_err_stdev += next_frame.raw_error_stdev; + } +#endif // CONFIG_FLEX_REFS // Accumulate the effect of prediction quality decay. if (!flash_detected) { @@ -2175,7 +2216,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { *this_frame = next_frame; old_boost_score = boost_score; } - twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0); // Was the group length constrained by the requirement for a new KF? @@ -2202,11 +2242,35 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Set the interval until the next gf. rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); - #if CONFIG_EXT_REFS - // Compute how many extra alt_refs we can have - cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval, - rc->source_alt_ref_pending); +#if CONFIG_FLEX_REFS + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs + : cpi->common.MBs; + if (i) avg_sr_coded_error /= i; + if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count; + + // Disable extra alter refs and backward ref for "still" gf group + // zero_motion_accumulator indicates the minimum percentage of (0, 0) motion + // in gf group + // avg_sr_coded_error indicates the average of the sse per pixel of each frame + // in gf group + // avg_raw_err_stdev indicates the average of the standard deviation of (0, 0) + // motion error per block of each frame in gf group + assert(num_mbs > 0); + const int disable_bwd_extarf = + (zero_motion_accumulator > MIN_ZERO_MOTION && + avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR && + avg_raw_err_stdev < MAX_RAW_ERR_VAR); + + if (disable_bwd_extarf) cpi->extra_arf_allowed = cpi->bwd_ref_allowed = 0; + + if (!cpi->extra_arf_allowed) + cpi->num_extra_arfs = 0; + else +#endif // CONFIG_FLEX_REFS + // Compute how many extra alt_refs we can have + cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval, + rc->source_alt_ref_pending); // Currently at maximum two extra ARFs' are allowed assert(cpi->num_extra_arfs <= MAX_EXT_ARFS); #endif // CONFIG_EXT_REFS @@ -2291,12 +2355,6 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->section_intra_rating = calculate_section_intra_ratio( start_pos, twopass->stats_in_end, rc->baseline_gf_interval); } - - if (oxcf->resize_mode == RESIZE_DYNAMIC) { - // Default to starting GF groups at normal frame size. - // TODO(afergs): Make a function for this - cpi->resize_next_scale_num = cpi->resize_next_scale_den; - } } // Threshold for use of the lagging second reference frame. 
High second ref @@ -2638,12 +2696,6 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // The count of bits left is adjusted elsewhere based on real coded frame // sizes. twopass->modified_error_left -= kf_group_err; - - if (oxcf->resize_mode == RESIZE_DYNAMIC) { - // Default to normal-sized frame on keyframes. - // TODO(afergs): Make a function for this - cpi->resize_next_scale_num = cpi->resize_next_scale_den; - } } // Define the reference buffers that will be updated post encode. @@ -2741,7 +2793,7 @@ static void configure_buffer_updates(AV1_COMP *cpi) { break; case LAST_BIPRED_UPDATE: - cpi->refresh_last_frame = 0; + cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 0; cpi->refresh_bwd_ref_frame = 0; cpi->refresh_alt_ref_frame = 0; diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h index 43104454c..266766d99 100644 --- a/third_party/aom/av1/encoder/firstpass.h +++ b/third_party/aom/av1/encoder/firstpass.h @@ -52,6 +52,13 @@ typedef struct { #define MIN_EXT_ARF_INTERVAL 4 #endif // CONFIG_EXT_REFS +#if CONFIG_FLEX_REFS +#define MIN_ZERO_MOTION 0.95 +#define MAX_SR_CODED_ERROR 40 +#define MAX_RAW_ERR_VAR 2000 +#define MIN_MV_IN_OUT 0.4 +#endif // CONFIG_FLEX_REFS + #define VLOW_MOTION_THRESHOLD 950 typedef struct { @@ -77,6 +84,10 @@ typedef struct { double new_mv_count; double duration; double count; +#if CONFIG_FLEX_REFS + // standard deviation for (0, 0) motion prediction error + double raw_error_stdev; +#endif // CONFIG_FLEX_REFS } FIRSTPASS_STATS; typedef enum { @@ -177,18 +188,6 @@ void av1_twopass_postencode_update(struct AV1_COMP *cpi); // Post encode update of the rate control parameters for 2-pass void av1_twopass_postencode_update(struct AV1_COMP *cpi); -void av1_calculate_next_scaled_size(const struct AV1_COMP *cpi, - int *scaled_frame_width, - int *scaled_frame_height); - -#if CONFIG_FRAME_SUPERRES -// This is the size after superress scaling, which could be 1:1. -// Superres scaling happens after regular downscaling. 
-// TODO(afergs): Limit overall reduction to 1/2 of the original size -void av1_calculate_superres_size(const struct AV1_COMP *cpi, int *encoded_width, - int *encoded_height); -#endif // CONFIG_FRAME_SUPERRES - #if CONFIG_EXT_REFS static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) { if (arf_pending && MAX_EXT_ARFS > 0) diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c index 74cbc8ae7..661a1feb4 100644 --- a/third_party/aom/av1/encoder/global_motion.c +++ b/third_party/aom/av1/encoder/global_motion.c @@ -131,8 +131,8 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, #endif // CONFIG_HIGHBITDEPTH uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst, int d_width, - int d_height, int d_stride, - int n_refinements) { + int d_height, int d_stride, int n_refinements, + int64_t best_frame_error) { static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6, 8, 8, 8 }; @@ -147,15 +147,16 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, int32_t best_param; force_wmtype(wm, wmtype); - best_error = av1_warp_error(wm, + best_error = av1_warp_error( + wm, #if CONFIG_HIGHBITDEPTH - use_hbd, bd, + use_hbd, bd, #endif // CONFIG_HIGHBITDEPTH - ref, r_width, r_height, r_stride, - dst + border * d_stride + border, border, border, - d_width - 2 * border, d_height - 2 * border, - d_stride, 0, 0, 16, 16); - step = 1 << (n_refinements + 1); + ref, r_width, r_height, r_stride, dst + border * d_stride + border, + border, border, d_width - 2 * border, d_height - 2 * border, d_stride, 0, + 0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS, best_frame_error); + best_error = AOMMIN(best_error, best_frame_error); + step = 1 << (n_refinements - 1); for (i = 0; i < n_refinements; i++, step >>= 1) { for (p = 0; p < n_params; ++p) { int step_dir = 0; @@ -174,7 +175,7 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, #endif // CONFIG_HIGHBITDEPTH ref, r_width, r_height, r_stride, dst + border * d_stride + border, border, border, d_width - 2 * border, d_height - 2 * border, d_stride, - 0, 0, 16, 16); + 0, 0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS, best_error); if (step_error < best_error) { best_error = step_error; best_param = *param; @@ -190,7 +191,7 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, #endif // CONFIG_HIGHBITDEPTH ref, r_width, r_height, r_stride, dst + border * d_stride + border, border, border, d_width - 2 * border, d_height - 2 * border, d_stride, - 0, 0, 16, 16); + 0, 0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS, best_error); if (step_error < best_error) { best_error = step_error; best_param = *param; @@ -209,7 +210,8 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, #endif // CONFIG_HIGHBITDEPTH ref, r_width, r_height, r_stride, dst + border * d_stride + border, border, border, d_width - 2 * border, d_height - 2 * border, - d_stride, 0, 0, 16, 16); + d_stride, 0, 0, SCALE_SUBPEL_SHIFTS, SCALE_SUBPEL_SHIFTS, + best_error); if (step_error < best_error) { best_error = step_error; best_param = *param; diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h index 38509df6a..7fca5327f 100644 --- a/third_party/aom/av1/encoder/global_motion.h +++ b/third_party/aom/av1/encoder/global_motion.h @@ -36,7 +36,8 @@ int64_t refine_integerized_param(WarpedMotionParams *wm, #endif // CONFIG_HIGHBITDEPTH uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst, int d_width, - int d_height, int d_stride, int n_refinements); 
+ int d_height, int d_stride, int n_refinements, + int64_t best_frame_error); /* Computes "num_motions" candidate global motion parameters between two frames. diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c index c57deed84..85f4b7d9b 100644 --- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c +++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c @@ -18,7 +18,7 @@ #if CONFIG_CHROMA_2X2 static void fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, int lossless) { + int diff_stride, TxfmParam *txfm_param) { tran_high_t a1 = src_diff[0]; tran_high_t b1 = src_diff[1]; tran_high_t c1 = src_diff[diff_stride]; @@ -39,134 +39,151 @@ static void fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff, coeff[2] = (tran_low_t)(4 * c1); coeff[3] = (tran_low_t)(4 * d1); - (void)tx_type; - (void)lossless; + (void)txfm_param; } #endif static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, int lossless) { - if (lossless) { - assert(tx_type == DCT_DCT); + int diff_stride, TxfmParam *txfm_param) { + if (txfm_param->lossless) { + assert(txfm_param->tx_type == DCT_DCT); av1_fwht4x4(src_diff, coeff, diff_stride); return; } - av1_fht4x4(src_diff, coeff, diff_stride, tx_type); +#if CONFIG_LGT + // only C version has LGTs + av1_fht4x4_c(src_diff, coeff, diff_stride, txfm_param); +#else + av1_fht4x4(src_diff, coeff, diff_stride, txfm_param); +#endif } static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt) { - (void)fwd_txfm_opt; - av1_fht4x8(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { +#if CONFIG_LGT + av1_fht4x8_c(src_diff, coeff, diff_stride, txfm_param); +#else + av1_fht4x8(src_diff, coeff, diff_stride, txfm_param); +#endif } static void fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt) { - (void)fwd_txfm_opt; - av1_fht8x4(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { +#if CONFIG_LGT + av1_fht8x4_c(src_diff, coeff, diff_stride, txfm_param); +#else + av1_fht8x4(src_diff, coeff, diff_stride, txfm_param); +#endif } static void fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt) { - (void)fwd_txfm_opt; - av1_fht8x16(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { +#if CONFIG_LGT + av1_fht8x16_c(src_diff, coeff, diff_stride, txfm_param); +#else + av1_fht8x16(src_diff, coeff, diff_stride, txfm_param); +#endif } static void fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt) { - (void)fwd_txfm_opt; - av1_fht16x8(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { +#if CONFIG_LGT + av1_fht16x8_c(src_diff, coeff, diff_stride, txfm_param); +#else + av1_fht16x8(src_diff, coeff, diff_stride, txfm_param); +#endif } static void fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt) { - (void)fwd_txfm_opt; - av1_fht16x32(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { + av1_fht16x32(src_diff, coeff, diff_stride, txfm_param); } static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT 
fwd_txfm_opt) { - (void)fwd_txfm_opt; - av1_fht32x16(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { + av1_fht32x16(src_diff, coeff, diff_stride, txfm_param); } static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt) { - (void)fwd_txfm_opt; - av1_fht8x8(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { +#if CONFIG_LGT + av1_fht8x8_c(src_diff, coeff, diff_stride, txfm_param); +#else + av1_fht8x8(src_diff, coeff, diff_stride, txfm_param); +#endif } static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt) { - (void)fwd_txfm_opt; - av1_fht16x16(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { + av1_fht16x16(src_diff, coeff, diff_stride, txfm_param); } static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt) { - (void)fwd_txfm_opt; - av1_fht32x32(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { +#if CONFIG_MRC_TX + // MRC_DCT currently only has a C implementation + if (txfm_param->tx_type == MRC_DCT) { + av1_fht32x32_c(src_diff, coeff, diff_stride, txfm_param); + return; + } +#endif // CONFIG_MRC_TX + av1_fht32x32(src_diff, coeff, diff_stride, txfm_param); } #if CONFIG_TX64X64 static void fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt) { - (void)fwd_txfm_opt; + int diff_stride, TxfmParam *txfm_param) { #if CONFIG_EXT_TX - if (tx_type == IDTX) - av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); + if (txfm_param->tx_type == IDTX) + av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, txfm_param->tx_type); else #endif - av1_fht64x64(src_diff, coeff, diff_stride, tx_type); + av1_fht64x64(src_diff, coeff, diff_stride, txfm_param); } #endif // CONFIG_TX64X64 -#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) static void fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt) { - (void)fwd_txfm_opt; - av1_fht16x4(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { +#if CONFIG_LGT + av1_fht16x4_c(src_diff, coeff, diff_stride, txfm_param); +#else + av1_fht16x4(src_diff, coeff, diff_stride, txfm_param); +#endif } static void fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt) { - (void)fwd_txfm_opt; - av1_fht4x16(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { +#if CONFIG_LGT + av1_fht4x16_c(src_diff, coeff, diff_stride, txfm_param); +#else + av1_fht4x16(src_diff, coeff, diff_stride, txfm_param); +#endif } static void fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt) { - (void)fwd_txfm_opt; - av1_fht32x8(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { +#if CONFIG_LGT + av1_fht32x8_c(src_diff, coeff, diff_stride, txfm_param); +#else + av1_fht32x8(src_diff, coeff, diff_stride, txfm_param); +#endif } static void fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt) { - (void)fwd_txfm_opt; - av1_fht8x32(src_diff, 
coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { +#if CONFIG_LGT + av1_fht8x32_c(src_diff, coeff, diff_stride, txfm_param); +#else + av1_fht8x32(src_diff, coeff, diff_stride, txfm_param); +#endif } -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#endif -#if CONFIG_HIGHBITDEPTH #if CONFIG_CHROMA_2X2 static void highbd_fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, int lossless, - const int bd) { + int diff_stride, TxfmParam *txfm_param) { tran_high_t a1 = src_diff[0]; tran_high_t b1 = src_diff[1]; tran_high_t c1 = src_diff[diff_stride]; @@ -187,27 +204,27 @@ static void highbd_fwd_txfm_2x2(const int16_t *src_diff, tran_low_t *coeff, coeff[2] = (tran_low_t)(4 * c1); coeff[3] = (tran_low_t)(4 * d1); - (void)tx_type; - (void)lossless; - (void)bd; + (void)txfm_param; } #endif static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, int lossless, - const int bd) { - if (lossless) { + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const int tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + if (txfm_param->lossless) { assert(tx_type == DCT_DCT); av1_highbd_fwht4x4(src_diff, coeff, diff_stride); return; } - switch (tx_type) { case DCT_DCT: case ADST_DCT: case DCT_ADST: case ADST_ADST: - av1_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd); + // fallthrough intended + av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd); break; #if CONFIG_EXT_TX case FLIPADST_DCT: @@ -215,80 +232,79 @@ static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, case FLIPADST_FLIPADST: case ADST_FLIPADST: case FLIPADST_ADST: - av1_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd); + // fallthrough intended + av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd); break; + // use the c version for anything including identity for now case V_DCT: case H_DCT: case V_ADST: case H_ADST: case V_FLIPADST: case H_FLIPADST: - av1_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type); + case IDTX: + // fallthrough intended + av1_fwd_txfm2d_4x4_c(src_diff, dst_coeff, diff_stride, tx_type, bd); break; - case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type); break; #endif // CONFIG_EXT_TX default: assert(0); } } static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt, const int bd) { - (void)fwd_txfm_opt; - (void)bd; - av1_highbd_fht4x8(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_4x8_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); } static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt, const int bd) { - (void)fwd_txfm_opt; - (void)bd; - av1_highbd_fht8x4(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_8x4_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); } static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt, const int bd) { - (void)fwd_txfm_opt; - (void)bd; - av1_highbd_fht8x16(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { + int32_t 
*dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_8x16_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); } static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt, const int bd) { - (void)fwd_txfm_opt; - (void)bd; - av1_highbd_fht16x8(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_16x8_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); } static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt, const int bd) { - (void)fwd_txfm_opt; - (void)bd; - av1_highbd_fht16x32(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_16x32_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); } static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt, const int bd) { - (void)fwd_txfm_opt; - (void)bd; - av1_highbd_fht32x16(src_diff, coeff, diff_stride, tx_type); + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_32x16_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); } static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt, const int bd) { - (void)fwd_txfm_opt; + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const int tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; switch (tx_type) { case DCT_DCT: case ADST_DCT: case DCT_ADST: case ADST_ADST: - av1_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd); + // fallthrough intended + av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd); break; #if CONFIG_EXT_TX case FLIPADST_DCT: @@ -296,33 +312,37 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, case FLIPADST_FLIPADST: case ADST_FLIPADST: case FLIPADST_ADST: - av1_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd); + // fallthrough intended + av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd); break; + // use the c version for anything including identity for now case V_DCT: case H_DCT: case V_ADST: case H_ADST: case V_FLIPADST: case H_FLIPADST: - // Use C version since DST exists only in C - av1_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type); + case IDTX: + // fallthrough intended + av1_fwd_txfm2d_8x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd); break; - case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type); break; #endif // CONFIG_EXT_TX default: assert(0); } } static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt, const int bd) { - (void)fwd_txfm_opt; + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const int tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; switch (tx_type) { case DCT_DCT: case ADST_DCT: case DCT_ADST: case ADST_ADST: - av1_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd); + // fallthrough intended + av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd); break; #if CONFIG_EXT_TX case FLIPADST_DCT: @@ -330,63 +350,72 @@ 
static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, case FLIPADST_FLIPADST: case ADST_FLIPADST: case FLIPADST_ADST: - av1_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd); + // fallthrough intended + av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd); break; + // use the c version for anything including identity for now case V_DCT: case H_DCT: case V_ADST: case H_ADST: case V_FLIPADST: case H_FLIPADST: - // Use C version since DST exists only in C - av1_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type); + case IDTX: + // fallthrough intended + av1_fwd_txfm2d_16x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd); break; - case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type); break; #endif // CONFIG_EXT_TX default: assert(0); } } static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt, const int bd) { - (void)fwd_txfm_opt; + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const int tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; switch (tx_type) { case DCT_DCT: - av1_fwd_txfm2d_32x32(src_diff, coeff, diff_stride, tx_type, bd); - break; -#if CONFIG_EXT_TX case ADST_DCT: case DCT_ADST: case ADST_ADST: + // fallthrough intended + av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; +#if CONFIG_EXT_TX case FLIPADST_DCT: case DCT_FLIPADST: case FLIPADST_FLIPADST: case ADST_FLIPADST: case FLIPADST_ADST: + // fallthrough intended + av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd); + break; + // use the c version for anything including identity for now case V_DCT: case H_DCT: case V_ADST: case H_ADST: case V_FLIPADST: case H_FLIPADST: - av1_highbd_fht32x32_c(src_diff, coeff, diff_stride, tx_type); + case IDTX: + // fallthrough intended + av1_fwd_txfm2d_32x32_c(src_diff, dst_coeff, diff_stride, tx_type, bd); break; - case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type); break; #endif // CONFIG_EXT_TX - default: assert(0); break; + default: assert(0); } } #if CONFIG_TX64X64 static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, TX_TYPE tx_type, - FWD_TXFM_OPT fwd_txfm_opt, const int bd) { - (void)fwd_txfm_opt; - (void)bd; + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const int tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; switch (tx_type) { case DCT_DCT: - av1_highbd_fht64x64(src_diff, coeff, diff_stride, tx_type); + av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, tx_type, bd); break; #if CONFIG_EXT_TX case ADST_DCT: @@ -403,141 +432,119 @@ static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, case H_ADST: case V_FLIPADST: case H_FLIPADST: - av1_highbd_fht64x64(src_diff, coeff, diff_stride, tx_type); + // TODO(sarahparker) + // I've deleted the 64x64 implementations that existed in lieu + // of adst, flipadst and identity for simplicity but will bring back + // in a later change. This shouldn't impact performance since + // DCT_DCT is the only extended type currently allowed for 64x64, + // as dictated by get_ext_tx_set_type in blockd.h. 
+ av1_fwd_txfm2d_64x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); + break; + case IDTX: + av1_fwd_idtx_c(src_diff, dst_coeff, diff_stride, 64, tx_type); break; - case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); break; #endif // CONFIG_EXT_TX default: assert(0); break; } } #endif // CONFIG_TX64X64 -#endif // CONFIG_HIGHBITDEPTH void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, - FWD_TXFM_PARAM *fwd_txfm_param) { - const int fwd_txfm_opt = FWD_TXFM_OPT_NORMAL; - const TX_TYPE tx_type = fwd_txfm_param->tx_type; - const TX_SIZE tx_size = fwd_txfm_param->tx_size; - const int lossless = fwd_txfm_param->lossless; + TxfmParam *txfm_param) { + const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { #if CONFIG_TX64X64 case TX_64X64: - fwd_txfm_64x64(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param); break; #endif // CONFIG_TX64X64 case TX_32X32: - fwd_txfm_32x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param); break; case TX_16X16: - fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); - break; - case TX_8X8: - fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); - break; - case TX_4X8: - fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); - break; - case TX_8X4: - fwd_txfm_8x4(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param); break; + case TX_8X8: fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param); break; + case TX_4X8: fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param); break; + case TX_8X4: fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param); break; case TX_8X16: - fwd_txfm_8x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param); break; case TX_16X8: - fwd_txfm_16x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param); break; case TX_16X32: - fwd_txfm_16x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param); break; case TX_32X16: - fwd_txfm_32x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); - break; - case TX_4X4: - fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless); + fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param); break; + case TX_4X4: fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param); break; #if CONFIG_CHROMA_2X2 - case TX_2X2: - fwd_txfm_2x2(src_diff, coeff, diff_stride, tx_type, lossless); - break; + case TX_2X2: fwd_txfm_2x2(src_diff, coeff, diff_stride, txfm_param); break; #endif -#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) case TX_4X16: - fwd_txfm_4x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param); break; case TX_16X4: - fwd_txfm_16x4(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param); break; case TX_8X32: - fwd_txfm_8x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param); break; case TX_32X8: - fwd_txfm_32x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt); + fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param); break; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#endif default: 
assert(0); break; } } -#if CONFIG_HIGHBITDEPTH void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param) { - const int fwd_txfm_opt = FWD_TXFM_OPT_NORMAL; - const TX_TYPE tx_type = fwd_txfm_param->tx_type; - const TX_SIZE tx_size = fwd_txfm_param->tx_size; - const int lossless = fwd_txfm_param->lossless; - const int bd = fwd_txfm_param->bd; + int diff_stride, TxfmParam *txfm_param) { + const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { #if CONFIG_TX64X64 case TX_64X64: - highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, - bd); + highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param); break; #endif // CONFIG_TX64X64 case TX_32X32: - highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, - bd); + highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param); break; case TX_16X16: - highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, - bd); + highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param); break; case TX_8X8: - highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, - bd); + highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param); break; case TX_4X8: - highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, - bd); + highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param); break; case TX_8X4: - highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, - bd); + highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param); break; case TX_8X16: - highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, - bd); + highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param); break; case TX_16X8: - highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, - bd); + highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param); break; case TX_16X32: - highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, - bd); + highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param); break; case TX_32X16: - highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt, - bd); + highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param); break; case TX_4X4: - highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless, bd); + highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param); break; #if CONFIG_CHROMA_2X2 case TX_2X2: - highbd_fwd_txfm_2x2(src_diff, coeff, diff_stride, tx_type, lossless, bd); + highbd_fwd_txfm_2x2(src_diff, coeff, diff_stride, txfm_param); break; #endif default: assert(0); break; } } -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h index e6fd17275..b25ffb8d8 100644 --- a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h +++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h @@ -14,28 +14,15 @@ #include "./aom_config.h" -typedef enum FWD_TXFM_OPT { FWD_TXFM_OPT_NORMAL } FWD_TXFM_OPT; - -typedef struct FWD_TXFM_PARAM { - TX_TYPE tx_type; - TX_SIZE tx_size; - int lossless; -#if CONFIG_HIGHBITDEPTH - int bd; -#endif // CONFIG_HIGHBITDEPTH -} FWD_TXFM_PARAM; - #ifdef __cplusplus extern "C" { #endif void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, - FWD_TXFM_PARAM *fwd_txfm_param); + TxfmParam *txfm_param); -#if CONFIG_HIGHBITDEPTH void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, - int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param); -#endif // 
CONFIG_HIGHBITDEPTH + int diff_stride, TxfmParam *txfm_param); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c index 52080ca0d..4efadff1b 100644 --- a/third_party/aom/av1/encoder/mcomp.c +++ b/third_party/aom/av1/encoder/mcomp.c @@ -228,49 +228,45 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { #define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c) -static INLINE const uint8_t *upre(const uint8_t *buf, int stride, int r, - int c) { - return &buf[(r)*stride + (c)]; -} - /* checks if (r, c) has better score than previous best */ #if CONFIG_EXT_INTER -#define CHECK_BETTER1(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - MV this_mv = { r, c }; \ - thismse = upsampled_pref_error( \ - xd, vfp, src_address, src_stride, upre(y, y_stride, r, c), y_stride, \ - second_pred, mask, mask_stride, invert_mask, w, h, &sse); \ - v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ - v += thismse; \ - if (v < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + MV this_mv = { r, c }; \ + thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, \ + pre(y, y_stride, r, c), y_stride, sp(c), \ + sp(r), second_pred, mask, mask_stride, \ + invert_mask, w, h, &sse); \ + v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ + v += thismse; \ + if (v < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ } #else -#define CHECK_BETTER1(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - MV this_mv = { r, c }; \ - thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, \ - upre(y, y_stride, r, c), y_stride, \ - second_pred, w, h, &sse); \ - v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ - v += thismse; \ - if (v < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + MV this_mv = { r, c }; \ + thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, \ + pre(y, y_stride, r, c), y_stride, sp(c), \ + sp(r), second_pred, w, h, &sse); \ + v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \ + v += thismse; \ + if (v < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ } #endif // CONFIG_EXT_INTER @@ -700,16 +696,14 @@ static const MV search_step_table[12] = { }; /* clang-format on */ -static int upsampled_pref_error(const MACROBLOCKD *xd, - const aom_variance_fn_ptr_t *vfp, - const uint8_t *const src, const int src_stride, - const uint8_t *const y, int y_stride, - const uint8_t *second_pred, +static int upsampled_pref_error( + const MACROBLOCKD *xd, const aom_variance_fn_ptr_t *vfp, + const uint8_t *const src, const int src_stride, const uint8_t *const y, + int y_stride, int subpel_x_q3, int subpel_y_q3, const uint8_t *second_pred, #if CONFIG_EXT_INTER - const uint8_t *mask, int mask_stride, - int invert_mask, + const uint8_t *mask, int mask_stride, int invert_mask, #endif - int w, int h, unsigned int *sse) { + int w, int h, unsigned int *sse) { unsigned int besterr; #if 
CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -717,15 +711,17 @@ static int upsampled_pref_error(const MACROBLOCKD *xd, if (second_pred != NULL) { #if CONFIG_EXT_INTER if (mask) - aom_highbd_comp_mask_upsampled_pred(pred16, second_pred, w, h, y, - y_stride, mask, mask_stride, - invert_mask); + aom_highbd_comp_mask_upsampled_pred( + pred16, second_pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride, + mask, mask_stride, invert_mask, xd->bd); else #endif - aom_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y, - y_stride); + aom_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, + subpel_x_q3, subpel_y_q3, y, + y_stride, xd->bd); } else { - aom_highbd_upsampled_pred(pred16, w, h, y, y_stride); + aom_highbd_upsampled_pred(pred16, w, h, subpel_x_q3, subpel_y_q3, y, + y_stride, xd->bd); } besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse); @@ -738,13 +734,15 @@ static int upsampled_pref_error(const MACROBLOCKD *xd, if (second_pred != NULL) { #if CONFIG_EXT_INTER if (mask) - aom_comp_mask_upsampled_pred(pred, second_pred, w, h, y, y_stride, mask, + aom_comp_mask_upsampled_pred(pred, second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask); else #endif - aom_comp_avg_upsampled_pred(pred, second_pred, w, h, y, y_stride); + aom_comp_avg_upsampled_pred(pred, second_pred, w, h, subpel_x_q3, + subpel_y_q3, y, y_stride); } else { - aom_upsampled_pred(pred, w, h, y, y_stride); + aom_upsampled_pred(pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride); } besterr = vfp->vf(pred, w, src, src_stride, sse); @@ -764,12 +762,12 @@ static unsigned int upsampled_setup_center_error( #endif int w, int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) { - unsigned int besterr = upsampled_pref_error(xd, vfp, src, src_stride, - y + offset, y_stride, second_pred, + unsigned int besterr = upsampled_pref_error( + xd, vfp, src, src_stride, y + offset, y_stride, 0, 0, second_pred, #if CONFIG_EXT_INTER - mask, mask_stride, invert_mask, + mask, mask_stride, invert_mask, #endif - w, h, sse1); + w, h, sse1); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); return besterr; @@ -824,7 +822,7 @@ int av1_find_best_sub_pixel_tree( #if CONFIG_EXT_INTER mask, mask_stride, invert_mask, #endif - w, h, (offset * 8), mvjcost, mvcost, sse1, distortion); + w, h, offset, mvjcost, mvcost, sse1, distortion); else besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, src_address, @@ -845,17 +843,15 @@ int av1_find_best_sub_pixel_tree( MV this_mv = { tr, tc }; if (use_upsampled_ref) { - const uint8_t *const pre_address = y + tr * y_stride + tc; - thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, - pre_address, y_stride, second_pred, + pre(y, y_stride, tr, tc), y_stride, + sp(tc), sp(tr), second_pred, #if CONFIG_EXT_INTER mask, mask_stride, invert_mask, #endif w, h, &sse); } else { - const uint8_t *const pre_address = - y + (tr >> 3) * y_stride + (tc >> 3); + const uint8_t *const pre_address = pre(y, y_stride, tr, tc); if (second_pred == NULL) thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, src_stride, &sse); @@ -894,16 +890,15 @@ int av1_find_best_sub_pixel_tree( MV this_mv = { tr, tc }; if (use_upsampled_ref) { - const uint8_t *const pre_address = y + tr * y_stride + tc; - thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, - pre_address, y_stride, second_pred, + pre(y, y_stride, tr, tc), y_stride, + sp(tc), 
sp(tr), second_pred, #if CONFIG_EXT_INTER mask, mask_stride, invert_mask, #endif w, h, &sse); } else { - const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); + const uint8_t *const pre_address = pre(y, y_stride, tr, tc); if (second_pred == NULL) thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, @@ -992,9 +987,16 @@ unsigned int av1_compute_motion_cost(const AV1_COMP *cpi, MACROBLOCK *const x, } // Refine MV in a small range +#if WARPED_MOTION_SORT_SAMPLES +unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int *pts0, int *pts_inref0, int *pts_mv0, + int total_samples) { +#else unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, int mi_col, int *pts, int *pts_inref) { +#endif // WARPED_MOTION_SORT_SAMPLES const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *mi = xd->mi[0]; @@ -1007,6 +1009,9 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, int16_t *tr = &mbmi->mv[0].as_mv.row; int16_t *tc = &mbmi->mv[0].as_mv.col; WarpedMotionParams best_wm_params = mbmi->wm_params[0]; +#if WARPED_MOTION_SORT_SAMPLES + int best_num_proj_ref = mbmi->num_proj_ref[0]; +#endif // WARPED_MOTION_SORT_SAMPLES unsigned int bestmse; int minc, maxc, minr, maxr; const int start = cm->allow_high_precision_mv ? 0 : 4; @@ -1033,6 +1038,16 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, if (*tc >= minc && *tc <= maxc && *tr >= minr && *tr <= maxr) { MV this_mv = { *tr, *tc }; +#if WARPED_MOTION_SORT_SAMPLES + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + + memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); + memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); + if (total_samples > 1) + mbmi->num_proj_ref[0] = + sortSamples(pts_mv0, &this_mv, pts, pts_inref, total_samples); +#endif // WARPED_MOTION_SORT_SAMPLES + if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, *tr, *tc, &mbmi->wm_params[0], mi_row, mi_col)) { thismse = @@ -1041,6 +1056,9 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, if (thismse < bestmse) { best_idx = idx; best_wm_params = mbmi->wm_params[0]; +#if WARPED_MOTION_SORT_SAMPLES + best_num_proj_ref = mbmi->num_proj_ref[0]; +#endif // WARPED_MOTION_SORT_SAMPLES bestmse = thismse; } } @@ -1058,7 +1076,9 @@ unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x, *tr = br; *tc = bc; mbmi->wm_params[0] = best_wm_params; - +#if WARPED_MOTION_SORT_SAMPLES + mbmi->num_proj_ref[0] = best_num_proj_ref; +#endif // WARPED_MOTION_SORT_SAMPLES return bestmse; } #endif // CONFIG_WARPED_MOTION @@ -2653,19 +2673,20 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, #define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c) #undef CHECK_BETTER1 -#define CHECK_BETTER1(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - thismse = upsampled_obmc_pref_error( \ - xd, mask, vfp, z, upre(y, y_stride, r, c), y_stride, w, h, &sse); \ - if ((v = MVC(r, c) + thismse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + thismse = \ + upsampled_obmc_pref_error(xd, mask, vfp, z, pre(y, y_stride, r, c), \ + y_stride, sp(c), sp(r), w, h, &sse); \ + if ((v = 
MVC(r, c) + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ } static unsigned int setup_obmc_center_error( @@ -2684,12 +2705,14 @@ static int upsampled_obmc_pref_error(const MACROBLOCKD *xd, const int32_t *mask, const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc, const uint8_t *const y, int y_stride, - int w, int h, unsigned int *sse) { + int subpel_x_q3, int subpel_y_q3, int w, + int h, unsigned int *sse) { unsigned int besterr; #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); - aom_highbd_upsampled_pred(pred16, w, h, y, y_stride); + aom_highbd_upsampled_pred(pred16, w, h, subpel_x_q3, subpel_y_q3, y, + y_stride, xd->bd); besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, mask, sse); } else { @@ -2698,7 +2721,7 @@ static int upsampled_obmc_pref_error(const MACROBLOCKD *xd, const int32_t *mask, DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); (void)xd; #endif // CONFIG_HIGHBITDEPTH - aom_upsampled_pred(pred, w, h, y, y_stride); + aom_upsampled_pred(pred, w, h, subpel_x_q3, subpel_y_q3, y, y_stride); besterr = vfp->ovf(pred, w, wsrc, mask, sse); #if CONFIG_HIGHBITDEPTH @@ -2714,18 +2737,17 @@ static unsigned int upsampled_setup_obmc_center_error( int h, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion) { unsigned int besterr = upsampled_obmc_pref_error( - xd, mask, vfp, wsrc, y + offset, y_stride, w, h, sse1); + xd, mask, vfp, wsrc, y + offset, y_stride, 0, 0, w, h, sse1); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); return besterr; } int av1_find_best_obmc_sub_pixel_tree_up( - const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, MV *bestmv, - const MV *ref_mv, int allow_hp, int error_per_bit, - const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, - int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, - int is_second, int use_upsampled_ref) { + MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion, + unsigned int *sse1, int is_second, int use_upsampled_ref) { const int32_t *wsrc = x->wsrc_buf; const int32_t *mask = x->mask_buf; const int *const z = wsrc; @@ -2756,21 +2778,11 @@ int av1_find_best_obmc_sub_pixel_tree_up( int y_stride; const uint8_t *y; - const struct buf_2d backup_pred = pd->pre[is_second]; int minc, maxc, minr, maxr; av1_set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, ref_mv); - if (use_upsampled_ref) { - int ref = xd->mi[0]->mbmi.ref_frame[is_second]; - const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref); - setup_pred_plane(&pd->pre[is_second], mbmi->sb_type, - upsampled_ref->y_buffer, upsampled_ref->y_crop_width, - upsampled_ref->y_crop_height, upsampled_ref->y_stride, - (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x, - pd->subsampling_y); - } y = pd->pre[is_second].buf; y_stride = pd->pre[is_second].stride; offset = bestmv->row * y_stride + bestmv->col; @@ -2784,7 +2796,7 @@ int av1_find_best_obmc_sub_pixel_tree_up( if (use_upsampled_ref) besterr = upsampled_setup_obmc_center_error( xd, mask, bestmv, ref_mv, error_per_bit, vfp, z, y, y_stride, w, h, - (offset * 8), mvjcost, mvcost, sse1, distortion); + offset, mvjcost, mvcost, sse1, distortion); else besterr = 
setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp, z, y, y_stride, offset, mvjcost, mvcost, @@ -2797,15 +2809,13 @@ int av1_find_best_obmc_sub_pixel_tree_up( tc = bc + search_step[idx].col; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { MV this_mv = { tr, tc }; + const uint8_t *const pre_address = pre(y, y_stride, tr, tc); if (use_upsampled_ref) { - const uint8_t *const pre_address = y + tr * y_stride + tc; - - thismse = upsampled_obmc_pref_error( - xd, mask, vfp, src_address, pre_address, y_stride, w, h, &sse); + thismse = + upsampled_obmc_pref_error(xd, mask, vfp, src_address, pre_address, + y_stride, sp(tc), sp(tr), w, h, &sse); } else { - const uint8_t *const pre_address = - y + (tr >> 3) * y_stride + (tc >> 3); thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr), src_address, mask, &sse); } @@ -2833,15 +2843,12 @@ int av1_find_best_obmc_sub_pixel_tree_up( MV this_mv = { tr, tc }; if (use_upsampled_ref) { - const uint8_t *const pre_address = y + tr * y_stride + tc; - thismse = upsampled_obmc_pref_error(xd, mask, vfp, src_address, - pre_address, y_stride, w, h, &sse); + pre(y, y_stride, tr, tc), y_stride, + sp(tc), sp(tr), w, h, &sse); } else { - const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); - - thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr), src_address, - mask, &sse); + thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), + src_address, mask, &sse); } cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, @@ -2889,10 +2896,6 @@ int av1_find_best_obmc_sub_pixel_tree_up( bestmv->row = br; bestmv->col = bc; - if (use_upsampled_ref) { - pd->pre[is_second] = backup_pred; - } - return besterr; } diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h index 7e8b4b29d..733e415ce 100644 --- a/third_party/aom/av1/encoder/mcomp.h +++ b/third_party/aom/av1/encoder/mcomp.h @@ -143,11 +143,10 @@ int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x, const aom_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, MV *dst_mv, int is_second); int av1_find_best_obmc_sub_pixel_tree_up( - const struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, - MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, - const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, - int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, - int is_second, int use_upsampled_ref); + MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion, + unsigned int *sse1, int is_second, int use_upsampled_ref); #endif // CONFIG_MOTION_VAR #ifdef __cplusplus } // extern "C" @@ -157,10 +156,18 @@ int av1_find_best_obmc_sub_pixel_tree_up( unsigned int av1_compute_motion_cost(const struct AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, int mi_col, const MV *this_mv); +#if WARPED_MOTION_SORT_SAMPLES +unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi, + MACROBLOCK *const x, BLOCK_SIZE bsize, + int mi_row, int mi_col, int *pts0, + int *pts_inref0, int *pts_mv0, + int total_samples); +#else unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, int mi_col, int *pts, int *pts_inref); +#endif // WARPED_MOTION_SORT_SAMPLES #endif // CONFIG_WARPED_MOTION #endif // AV1_ENCODER_MCOMP_H_ diff --git a/third_party/aom/av1/encoder/palette.c 
b/third_party/aom/av1/encoder/palette.c index 235964dde..bac06cd17 100644 --- a/third_party/aom/av1/encoder/palette.c +++ b/third_party/aom/av1/encoder/palette.c @@ -145,27 +145,6 @@ int av1_remove_duplicates(float *centroids, int num_centroids) { return num_unique; } -int av1_count_colors(const uint8_t *src, int stride, int rows, int cols) { - int n = 0, r, c, i, val_count[256]; - uint8_t val; - memset(val_count, 0, sizeof(val_count)); - - for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; ++c) { - val = src[r * stride + c]; - ++val_count[val]; - } - } - - for (i = 0; i < 256; ++i) { - if (val_count[i]) { - ++n; - } - } - - return n; -} - #if CONFIG_PALETTE_DELTA_ENCODING static int delta_encode_cost(const int *colors, int num, int bit_depth, int min_val) { @@ -291,30 +270,3 @@ int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, return 2 * bit_depth * n * av1_cost_bit(128, 0); #endif // CONFIG_PALETTE_DELTA_ENCODING } - -#if CONFIG_HIGHBITDEPTH -int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, - int bit_depth) { - int n = 0, r, c, i; - uint16_t val; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - int val_count[1 << 12]; - - assert(bit_depth <= 12); - memset(val_count, 0, (1 << 12) * sizeof(val_count[0])); - for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; ++c) { - val = src[r * stride + c]; - ++val_count[val]; - } - } - - for (i = 0; i < (1 << bit_depth); ++i) { - if (val_count[i]) { - ++n; - } - } - - return n; -} -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h index f5a3c1bdd..8afe5a782 100644 --- a/third_party/aom/av1/encoder/palette.h +++ b/third_party/aom/av1/encoder/palette.h @@ -36,14 +36,6 @@ void av1_k_means(const float *data, float *centroids, uint8_t *indices, int n, // method. int av1_remove_duplicates(float *centroids, int num_centroids); -// Returns the number of colors in 'src'. -int av1_count_colors(const uint8_t *src, int stride, int rows, int cols); -#if CONFIG_HIGHBITDEPTH -// Same as av1_count_colors(), but for high-bitdepth mode. -int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, - int bit_depth); -#endif // CONFIG_HIGHBITDEPTH - #if CONFIG_PALETTE_DELTA_ENCODING // Given a color cache and a set of base colors, find if each cache color is // present in the base colors, record the binary results in "cache_color_found". diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c index da64fb48d..e4ec38826 100644 --- a/third_party/aom/av1/encoder/pickcdef.c +++ b/third_party/aom/av1/encoder/pickcdef.c @@ -19,13 +19,19 @@ #include "av1/common/reconinter.h" #include "av1/encoder/encoder.h" +#define REDUCED_STRENGTHS 8 +#define REDUCED_TOTAL_STRENGTHS (REDUCED_STRENGTHS * CLPF_STRENGTHS) #define TOTAL_STRENGTHS (DERING_STRENGTHS * CLPF_STRENGTHS) +static int priconv[REDUCED_STRENGTHS] = { 0, 1, 2, 3, 4, 7, 12, 25 }; + /* Search for the best strength to add as an option, knowing we already selected nb_strengths options. */ static uint64_t search_one(int *lev, int nb_strengths, - uint64_t mse[][TOTAL_STRENGTHS], int sb_count) { + uint64_t mse[][TOTAL_STRENGTHS], int sb_count, + int fast) { uint64_t tot_mse[TOTAL_STRENGTHS]; + const int total_strengths = fast ? 
REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS; int i, j; uint64_t best_tot_mse = (uint64_t)1 << 63; int best_id = 0; @@ -40,13 +46,13 @@ static uint64_t search_one(int *lev, int nb_strengths, } } /* Find best mse when adding each possible new option. */ - for (j = 0; j < TOTAL_STRENGTHS; j++) { + for (j = 0; j < total_strengths; j++) { uint64_t best = best_mse; if (mse[i][j] < best) best = mse[i][j]; tot_mse[j] += best; } } - for (j = 0; j < TOTAL_STRENGTHS; j++) { + for (j = 0; j < total_strengths; j++) { if (tot_mse[j] < best_tot_mse) { best_tot_mse = tot_mse[j]; best_id = j; @@ -59,9 +65,10 @@ static uint64_t search_one(int *lev, int nb_strengths, /* Search for the best luma+chroma strength to add as an option, knowing we already selected nb_strengths options. */ static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths, - uint64_t (**mse)[TOTAL_STRENGTHS], - int sb_count) { + uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count, + int fast) { uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS]; + const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS; int i, j; uint64_t best_tot_mse = (uint64_t)1 << 63; int best_id0 = 0; @@ -79,9 +86,9 @@ static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths, } } /* Find best mse when adding each possible new option. */ - for (j = 0; j < TOTAL_STRENGTHS; j++) { + for (j = 0; j < total_strengths; j++) { int k; - for (k = 0; k < TOTAL_STRENGTHS; k++) { + for (k = 0; k < total_strengths; k++) { uint64_t best = best_mse; uint64_t curr = mse[0][i][j]; curr += mse[1][i][k]; @@ -90,9 +97,9 @@ static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths, } } } - for (j = 0; j < TOTAL_STRENGTHS; j++) { + for (j = 0; j < total_strengths; j++) { int k; - for (k = 0; k < TOTAL_STRENGTHS; k++) { + for (k = 0; k < total_strengths; k++) { if (tot_mse[j][k] < best_tot_mse) { best_tot_mse = tot_mse[j][k]; best_id0 = j; @@ -108,20 +115,23 @@ static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths, /* Search for the set of strengths that minimizes mse. */ static uint64_t joint_strength_search(int *best_lev, int nb_strengths, uint64_t mse[][TOTAL_STRENGTHS], - int sb_count) { + int sb_count, int fast) { uint64_t best_tot_mse; int i; best_tot_mse = (uint64_t)1 << 63; /* Greedy search: add one strength options at a time. */ for (i = 0; i < nb_strengths; i++) { - best_tot_mse = search_one(best_lev, i, mse, sb_count); + best_tot_mse = search_one(best_lev, i, mse, sb_count, fast); } /* Trying to refine the greedy search by reconsidering each already-selected option. */ - for (i = 0; i < 4 * nb_strengths; i++) { - int j; - for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1]; - best_tot_mse = search_one(best_lev, nb_strengths - 1, mse, sb_count); + if (!fast) { + for (i = 0; i < 4 * nb_strengths; i++) { + int j; + for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1]; + best_tot_mse = + search_one(best_lev, nb_strengths - 1, mse, sb_count, fast); + } } return best_tot_mse; } @@ -130,13 +140,14 @@ static uint64_t joint_strength_search(int *best_lev, int nb_strengths, static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1, int nb_strengths, uint64_t (**mse)[TOTAL_STRENGTHS], - int sb_count) { + int sb_count, int fast) { uint64_t best_tot_mse; int i; best_tot_mse = (uint64_t)1 << 63; /* Greedy search: add one strength options at a time. 
*/ for (i = 0; i < nb_strengths; i++) { - best_tot_mse = search_one_dual(best_lev0, best_lev1, i, mse, sb_count); + best_tot_mse = + search_one_dual(best_lev0, best_lev1, i, mse, sb_count, fast); } /* Trying to refine the greedy search by reconsidering each already-selected option. */ @@ -146,8 +157,8 @@ static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1, best_lev0[j] = best_lev0[j + 1]; best_lev1[j] = best_lev1[j + 1]; } - best_tot_mse = - search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse, sb_count); + best_tot_mse = search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse, + sb_count, fast); } return best_tot_mse; } @@ -269,12 +280,12 @@ uint64_t compute_dering_dist(uint16_t *dst, int dstride, uint16_t *src, } void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, - AV1_COMMON *cm, MACROBLOCKD *xd) { + AV1_COMMON *cm, MACROBLOCKD *xd, int fast) { int r, c; int sbr, sbc; uint16_t *src[3]; uint16_t *ref_coeff[3]; - dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE]; + dering_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } }; int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } }; int stride[3]; @@ -289,8 +300,8 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, uint64_t best_tot_mse = (uint64_t)1 << 63; uint64_t tot_mse; int sb_count; - int nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE; - int nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE; + int nvsb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + int nhsb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; int *sb_index = aom_malloc(nvsb * nhsb * sizeof(*sb_index)); int *selected_strength = aom_malloc(nvsb * nhsb * sizeof(*sb_index)); uint64_t(*mse[2])[TOTAL_STRENGTHS]; @@ -302,6 +313,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, int quantizer; double lambda; int nplanes = 3; + const int total_strengths = fast ? 
REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS; DECLARE_ALIGNED(32, uint16_t, inbuf[OD_DERING_INBUF_SIZE]); uint16_t *in; DECLARE_ALIGNED(32, uint16_t, tmp_dst[MAX_SB_SQUARE]); @@ -375,22 +387,23 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, int nvb, nhb; int gi; int dirinit = 0; - nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc); - nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr); - cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride + - MAX_MIB_SIZE * sbc] + nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * sbc); + nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * sbr); + cm->mi_grid_visible[MI_SIZE_64X64 * sbr * cm->mi_stride + + MI_SIZE_64X64 * sbc] ->mbmi.cdef_strength = -1; - if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue; - dering_count = sb_compute_dering_list(cm, sbr * MAX_MIB_SIZE, - sbc * MAX_MIB_SIZE, dlist, 1); + if (sb_all_skip(cm, sbr * MI_SIZE_64X64, sbc * MI_SIZE_64X64)) continue; + dering_count = sb_compute_dering_list(cm, sbr * MI_SIZE_64X64, + sbc * MI_SIZE_64X64, dlist, 1); for (pli = 0; pli < nplanes; pli++) { for (i = 0; i < OD_DERING_INBUF_SIZE; i++) inbuf[i] = OD_DERING_VERY_LARGE; - for (gi = 0; gi < TOTAL_STRENGTHS; gi++) { + for (gi = 0; gi < total_strengths; gi++) { int threshold; uint64_t curr_mse; int clpf_strength; threshold = gi / CLPF_STRENGTHS; + if (fast) threshold = priconv[threshold]; if (pli > 0 && !chroma_dering) threshold = 0; /* We avoid filtering the pixels for which some of the pixels to average @@ -406,8 +419,8 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, if (clpf_strength == 0) copy_sb16_16(&in[(-yoff * OD_FILT_BSTRIDE - xoff)], OD_FILT_BSTRIDE, src[pli], - (sbr * MAX_MIB_SIZE << mi_high_l2[pli]) - yoff, - (sbc * MAX_MIB_SIZE << mi_wide_l2[pli]) - xoff, + (sbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff, + (sbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff, stride[pli], ysize, xsize); od_dering(clpf_strength ? NULL : (uint8_t *)in, OD_FILT_BSTRIDE, tmp_dst, in, xdec[pli], ydec[pli], dir, &dirinit, var, pli, @@ -416,8 +429,8 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, dering_damping, coeff_shift, clpf_strength != 0, 1); curr_mse = compute_dering_dist( ref_coeff[pli] + - (sbr * MAX_MIB_SIZE << mi_high_l2[pli]) * stride[pli] + - (sbc * MAX_MIB_SIZE << mi_wide_l2[pli]), + (sbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] + + (sbc * MI_SIZE_64X64 << mi_wide_l2[pli]), stride[pli], tmp_dst, dlist, dering_count, bsize[pli], coeff_shift, pli); if (pli < 2) @@ -425,7 +438,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, else mse[1][sb_count][gi] += curr_mse; sb_index[sb_count] = - MAX_MIB_SIZE * sbr * cm->mi_stride + MAX_MIB_SIZE * sbc; + MI_SIZE_64X64 * sbr * cm->mi_stride + MI_SIZE_64X64 * sbc; } } sb_count++; @@ -440,10 +453,10 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, nb_strengths = 1 << i; if (nplanes >= 3) tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths, - mse, sb_count); + mse, sb_count, fast); else - tot_mse = - joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count); + tot_mse = joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count, + fast); /* Count superblock signalling cost. */ tot_mse += (uint64_t)(sb_count * lambda * i); /* Count header signalling cost. 
*/ @@ -477,6 +490,17 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, selected_strength[i] = best_gi; cm->mi_grid_visible[sb_index[i]]->mbmi.cdef_strength = best_gi; } + + if (fast) { + for (int j = 0; j < nb_strengths; j++) { + cm->cdef_strengths[j] = + priconv[cm->cdef_strengths[j] / CLPF_STRENGTHS] * CLPF_STRENGTHS + + (cm->cdef_strengths[j] % CLPF_STRENGTHS); + cm->cdef_uv_strengths[j] = + priconv[cm->cdef_uv_strengths[j] / CLPF_STRENGTHS] * CLPF_STRENGTHS + + (cm->cdef_uv_strengths[j] % CLPF_STRENGTHS); + } + } cm->cdef_dering_damping = dering_damping; cm->cdef_clpf_damping = clpf_damping; aom_free(mse[0]); diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c index fc0ea485d..26fd55ef0 100644 --- a/third_party/aom/av1/encoder/picklpf.c +++ b/third_party/aom/av1/encoder/picklpf.c @@ -38,13 +38,23 @@ int av1_get_max_filter_level(const AV1_COMP *cpi) { static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, AV1_COMP *const cpi, int filt_level, - int partial_frame) { + int partial_frame +#if CONFIG_UV_LVL + , + int plane +#endif + ) { AV1_COMMON *const cm = &cpi->common; int64_t filt_err; #if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_CB4X4 +#if CONFIG_UV_LVL + av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, + plane, partial_frame); +#else av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1, partial_frame); +#endif // CONFIG_UV_LVL #else if (cpi->num_workers > 1) av1_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane, @@ -55,6 +65,40 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, 1, partial_frame); #endif +#if CONFIG_UV_LVL +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) { + if (plane == 0) + filt_err = aom_highbd_get_y_sse(sd, cm->frame_to_show); + else if (plane == 1) + filt_err = aom_highbd_get_u_sse(sd, cm->frame_to_show); + else + filt_err = aom_highbd_get_v_sse(sd, cm->frame_to_show); + } else { + if (plane == 0) + filt_err = aom_get_y_sse(sd, cm->frame_to_show); + else if (plane == 1) + filt_err = aom_get_u_sse(sd, cm->frame_to_show); + else + filt_err = aom_get_v_sse(sd, cm->frame_to_show); + } +#else + if (plane == 0) + filt_err = aom_get_y_sse(sd, cm->frame_to_show); + else if (plane == 1) + filt_err = aom_get_u_sse(sd, cm->frame_to_show); + else + filt_err = aom_get_v_sse(sd, cm->frame_to_show); +#endif // CONFIG_HIGHBITDEPTH + + // Re-instate the unfiltered frame + if (plane == 0) + aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); + else if (plane == 1) + aom_yv12_copy_u(&cpi->last_frame_uf, cm->frame_to_show); + else + aom_yv12_copy_v(&cpi->last_frame_uf, cm->frame_to_show); +#else #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) { filt_err = aom_highbd_get_y_sse(sd, cm->frame_to_show); @@ -67,12 +111,18 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, // Re-instate the unfiltered frame aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); +#endif // CONFIG_UV_LVL return filt_err; } int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, - int partial_frame, double *best_cost_ret) { + int partial_frame, double *best_cost_ret +#if CONFIG_UV_LVL + , + int plane +#endif + ) { const AV1_COMMON *const cm = &cpi->common; const struct loopfilter *const lf = &cm->lf; const int min_filter_level = 0; @@ -82,9 +132,20 @@ int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, int filt_best; MACROBLOCK *x = &cpi->td.mb; - // Start the search at the 
previous frame filter level unless it is now out of - // range. +// Start the search at the previous frame filter level unless it is now out of +// range. +#if CONFIG_UV_LVL + int lvl; + switch (plane) { + case 0: lvl = lf->filter_level; break; + case 1: lvl = lf->filter_level_u; break; + case 2: lvl = lf->filter_level_v; break; + default: lvl = lf->filter_level; break; + } + int filt_mid = clamp(lvl, min_filter_level, max_filter_level); +#else int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level); +#endif // CONFIG_UV_LVL int filter_step = filt_mid < 16 ? 4 : filt_mid / 4; // Sum squared error at each filter level int64_t ss_err[MAX_LOOP_FILTER + 1]; @@ -92,10 +153,23 @@ int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, // Set each entry to -1 memset(ss_err, 0xFF, sizeof(ss_err)); +#if CONFIG_UV_LVL + if (plane == 0) + aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); + else if (plane == 1) + aom_yv12_copy_u(cm->frame_to_show, &cpi->last_frame_uf); + else if (plane == 2) + aom_yv12_copy_v(cm->frame_to_show, &cpi->last_frame_uf); +#else // Make a copy of the unfiltered / processed recon buffer aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); +#endif // CONFIG_UV_LVL +#if CONFIG_UV_LVL + best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane); +#else best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame); +#endif // CONFIG_UV_LVL filt_best = filt_mid; ss_err[filt_mid] = best_err; @@ -115,7 +189,12 @@ int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, if (filt_direction <= 0 && filt_low != filt_mid) { // Get Low filter error score if (ss_err[filt_low] < 0) { +#if CONFIG_UV_LVL + ss_err[filt_low] = + try_filter_frame(sd, cpi, filt_low, partial_frame, plane); +#else ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame); +#endif // CONFIG_UV_LVL } // If value is close to the best so far then bias towards a lower loop // filter value. 
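[Editor's note] For context, av1_search_filter_level() is a coarse-to-fine search: it starts at the previous frame's level, probes one step above and below, halves the step, and caches each level's SSE so nothing is filtered twice. A minimal standalone sketch of that pattern follows; eval() stands in for try_filter_frame(), the bias terms and the CONFIG_UV_LVL plumbing are omitted, and the level range is assumed to fit the ss_err array. Illustration only, not the encoder's implementation.

    #include <stdint.h>
    #include <string.h>

    static int search_level_sketch(int64_t (*eval)(int lvl, void *ctx), void *ctx,
                                   int prev_lvl, int min_lvl, int max_lvl) {
      int64_t ss_err[256];                    /* assumes max_lvl < 256 */
      memset(ss_err, 0xFF, sizeof(ss_err));   /* -1 == not evaluated yet */
      int mid = prev_lvl < min_lvl ? min_lvl
                                   : (prev_lvl > max_lvl ? max_lvl : prev_lvl);
      int step = mid < 16 ? 4 : mid / 4;
      ss_err[mid] = eval(mid, ctx);
      int best = mid;
      int64_t best_err = ss_err[mid];
      while (step > 0) {
        const int lo = (mid - step > min_lvl) ? mid - step : min_lvl;
        const int hi = (mid + step < max_lvl) ? mid + step : max_lvl;
        if (lo != mid && ss_err[lo] < 0) ss_err[lo] = eval(lo, ctx);
        if (hi != mid && ss_err[hi] < 0) ss_err[hi] = eval(hi, ctx);
        if (lo != mid && ss_err[lo] < best_err) {
          best_err = ss_err[lo];
          best = lo;
        }
        if (hi != mid && ss_err[hi] < best_err) {
          best_err = ss_err[hi];
          best = hi;
        }
        if (best == mid) {
          step /= 2;      /* converge around the current centre */
        } else {
          mid = best;     /* re-centre on the new best and probe again */
        }
      }
      return best;
    }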
@@ -131,7 +210,12 @@ int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, // Now look at filt_high if (filt_direction >= 0 && filt_high != filt_mid) { if (ss_err[filt_high] < 0) { +#if CONFIG_UV_LVL + ss_err[filt_high] = + try_filter_frame(sd, cpi, filt_high, partial_frame, plane); +#else ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame); +#endif // CONFIG_UV_LVL } // If value is significantly better than previous best, bias added against // raising filter value @@ -154,8 +238,7 @@ int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, // Update best error best_err = ss_err[filt_best]; - if (best_cost_ret) - *best_cost_ret = RDCOST_DBL(x->rdmult, x->rddiv, 0, best_err); + if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err); return filt_best; } @@ -198,14 +281,16 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, if (cm->frame_type == KEY_FRAME) filt_guess -= 4; lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level); } else { +#if CONFIG_UV_LVL + lf->filter_level = av1_search_filter_level( + sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 0); + lf->filter_level_u = av1_search_filter_level( + sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 1); + lf->filter_level_v = av1_search_filter_level( + sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL, 2); +#else lf->filter_level = av1_search_filter_level( sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL); +#endif // CONFIG_UV_LVL } - -#if CONFIG_EXT_TILE - // TODO(any): 0 loopfilter level is only necessary if individual tile - // decoding is required. We need to communicate this requirement to this - // code and force loop filter level 0 only if required. - if (cm->tile_encoding_mode) lf->filter_level = 0; -#endif // CONFIG_EXT_TILE } diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h index 3c0a83462..bd248d114 100644 --- a/third_party/aom/av1/encoder/picklpf.h +++ b/third_party/aom/av1/encoder/picklpf.h @@ -21,8 +21,13 @@ extern "C" { struct yv12_buffer_config; struct AV1_COMP; int av1_get_max_filter_level(const AV1_COMP *cpi); +#if CONFIG_UV_LVL +int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, + int partial_frame, double *err, int plane); +#else int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, int partial_frame, double *err); +#endif void av1_pick_filter_level(const struct yv12_buffer_config *sd, struct AV1_COMP *cpi, LPF_PICK_METHOD method); #ifdef __cplusplus diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c index 4a446d24e..fec68377a 100644 --- a/third_party/aom/av1/encoder/pickrst.c +++ b/third_party/aom/av1/encoder/pickrst.c @@ -437,8 +437,8 @@ static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, int width, height, src_stride, dgd_stride; uint8_t *dgd_buffer, *src_buffer; if (plane == AOM_PLANE_Y) { - width = cm->width; - height = cm->height; + width = src->y_crop_width; + height = src->y_crop_height; src_buffer = src->y_buffer; src_stride = src->y_stride; dgd_buffer = dgd->y_buffer; @@ -478,7 +478,7 @@ static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, (1 << plane)); // #bits when a tile is not restored bits = av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 0); - cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err); best_tile_cost[tile_idx] = DBL_MAX; search_selfguided_restoration( 
dgd_buffer + v_start * dgd_stride + h_start, h_end - h_start, @@ -498,7 +498,7 @@ static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, &ref_sgrproj_info) << AV1_PROB_COST_SHIFT; bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 1); - cost_sgrproj = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + cost_sgrproj = RDCOST_DBL(x->rdmult, (bits >> 4), err); if (cost_sgrproj >= cost_norestore) { type[tile_idx] = RESTORE_NONE; } else { @@ -531,7 +531,7 @@ static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, } err = try_restoration_frame(src, cpi, rsi, (1 << plane), partial_frame, dst_frame); - cost_sgrproj = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + cost_sgrproj = RDCOST_DBL(x->rdmult, (bits >> 4), err); return cost_sgrproj; } @@ -985,8 +985,8 @@ static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, int width, height, src_stride, dgd_stride; uint8_t *dgd_buffer, *src_buffer; if (plane == AOM_PLANE_Y) { - width = cm->width; - height = cm->height; + width = src->y_crop_width; + height = src->y_crop_height; src_buffer = src->y_buffer; src_stride = src->y_stride; dgd_buffer = dgd->y_buffer; @@ -1039,7 +1039,7 @@ static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, (1 << plane)); // #bits when a tile is not restored bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0); - cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err); best_tile_cost[tile_idx] = DBL_MAX; av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width, @@ -1081,7 +1081,7 @@ static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, count_wiener_bits(&rsi[plane].wiener_info[tile_idx], &ref_wiener_info) << AV1_PROB_COST_SHIFT; bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1); - cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + cost_wiener = RDCOST_DBL(x->rdmult, (bits >> 4), err); if (cost_wiener >= cost_norestore) { type[tile_idx] = RESTORE_NONE; } else { @@ -1114,7 +1114,7 @@ static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, } err = try_restoration_frame(src, cpi, rsi, 1 << plane, partial_frame, dst_frame); - cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + cost_wiener = RDCOST_DBL(x->rdmult, (bits >> 4), err); return cost_wiener; } @@ -1133,8 +1133,8 @@ static double search_norestore(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, int h_start, h_end, v_start, v_end; int width, height; if (plane == AOM_PLANE_Y) { - width = cm->width; - height = cm->height; + width = src->y_crop_width; + height = src->y_crop_height; } else { width = src->uv_crop_width; height = src->uv_crop_height; @@ -1160,13 +1160,14 @@ static double search_norestore(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, // RD cost associated with no restoration err = sse_restoration_frame(cm, src, cm->frame_to_show, (1 << plane)); bits = frame_level_restore_bits[RESTORE_NONE] << AV1_PROB_COST_SHIFT; - cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err); + cost_norestore = RDCOST_DBL(x->rdmult, (bits >> 4), err); return cost_norestore; } static double search_switchable_restoration( - AV1_COMP *cpi, int partial_frame, int plane, RestorationInfo *rsi, - double *tile_cost[RESTORE_SWITCHABLE_TYPES]) { + const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, int partial_frame, int plane, + RestorationType *const restore_types[RESTORE_SWITCHABLE_TYPES], + double *const tile_cost[RESTORE_SWITCHABLE_TYPES], RestorationInfo *rsi) { 
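[Editor's note] All of the RDCOST_DBL() call sites in this file drop the old rddiv argument because the divisor is now the fixed RDDIV_BITS shift baked into the macro itself (see the rd.h hunk further down). A small worked example with assumed inputs, using AV1_PROB_COST_SHIFT = 9 and RDDIV_BITS = 7:

    /* Hypothetical numbers: rdmult = 256, rate = 100 (already >> 4 by the
     * caller), distortion (sse) = 1000.
     *   rate term = 100 * 256 / (1 << 9) = 50.0
     *   dist term = 1000 * (1 << 7)      = 128000.0
     *   RDCOST_DBL(256, 100, 1000)       = 128050.0                        */
    const double rd_cost = (100.0 * 256) / (1 << 9) + 1000.0 * (1 << 7);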
AV1_COMMON *const cm = &cpi->common; MACROBLOCK *x = &cpi->td.mb; double cost_switchable = 0; @@ -1174,11 +1175,11 @@ static double search_switchable_restoration( RestorationType r; int width, height; if (plane == AOM_PLANE_Y) { - width = cm->width; - height = cm->height; + width = src->y_crop_width; + height = src->y_crop_height; } else { - width = ROUND_POWER_OF_TWO(cm->width, cm->subsampling_x); - height = ROUND_POWER_OF_TWO(cm->height, cm->subsampling_y); + width = src->uv_crop_width; + height = src->uv_crop_height; } const int ntiles = av1_get_rest_ntiles( width, height, cm->rst_info[plane].restoration_tilesize, NULL, NULL, NULL, @@ -1192,16 +1193,17 @@ static double search_switchable_restoration( rsi->frame_restoration_type = RESTORE_SWITCHABLE; bits = frame_level_restore_bits[rsi->frame_restoration_type] << AV1_PROB_COST_SHIFT; - cost_switchable = RDCOST_DBL(x->rdmult, x->rddiv, bits >> 4, 0); + cost_switchable = RDCOST_DBL(x->rdmult, bits >> 4, 0); for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) { - double best_cost = RDCOST_DBL( - x->rdmult, x->rddiv, (cpi->switchable_restore_cost[RESTORE_NONE] >> 4), - tile_cost[RESTORE_NONE][tile_idx]); + double best_cost = + RDCOST_DBL(x->rdmult, (cpi->switchable_restore_cost[RESTORE_NONE] >> 4), + tile_cost[RESTORE_NONE][tile_idx]); rsi->restoration_type[tile_idx] = RESTORE_NONE; for (r = 1; r < RESTORE_SWITCHABLE_TYPES; r++) { if (force_restore_type != 0) if (r != force_restore_type) continue; int tilebits = 0; + if (restore_types[r][tile_idx] != r) continue; if (r == RESTORE_WIENER) tilebits += count_wiener_bits(&rsi->wiener_info[tile_idx], &ref_wiener_info); @@ -1210,8 +1212,8 @@ static double search_switchable_restoration( count_sgrproj_bits(&rsi->sgrproj_info[tile_idx], &ref_sgrproj_info); tilebits <<= AV1_PROB_COST_SHIFT; tilebits += cpi->switchable_restore_cost[r]; - double cost = RDCOST_DBL(x->rdmult, x->rddiv, tilebits >> 4, - tile_cost[r][tile_idx]); + double cost = + RDCOST_DBL(x->rdmult, tilebits >> 4, tile_cost[r][tile_idx]); if (cost < best_cost) { rsi->restoration_type[tile_idx] = r; @@ -1243,14 +1245,17 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, RestorationType *restore_types[RESTORE_SWITCHABLE_TYPES]; double best_cost_restore; RestorationType r, best_restore; - - const int ntiles_y = av1_get_rest_ntiles(cm->width, cm->height, - cm->rst_info[0].restoration_tilesize, - NULL, NULL, NULL, NULL); + const int ywidth = src->y_crop_width; + const int yheight = src->y_crop_height; + const int uvwidth = src->uv_crop_width; + const int uvheight = src->uv_crop_height; + + const int ntiles_y = + av1_get_rest_ntiles(ywidth, yheight, cm->rst_info[0].restoration_tilesize, + NULL, NULL, NULL, NULL); const int ntiles_uv = av1_get_rest_ntiles( - ROUND_POWER_OF_TWO(cm->width, cm->subsampling_x), - ROUND_POWER_OF_TWO(cm->height, cm->subsampling_y), - cm->rst_info[1].restoration_tilesize, NULL, NULL, NULL, NULL); + uvwidth, uvheight, cm->rst_info[1].restoration_tilesize, NULL, NULL, NULL, + NULL); // Assume ntiles_uv is never larger that ntiles_y and so the same arrays work. 
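[Editor's note] Per tile, the switchable search above amounts to taking the minimum rate-distortion cost over the candidate restoration types that actually survived their own searches (hence the new restore_types[r][tile_idx] != r early-out). A hedged standalone sketch of that selection; the type list and the cost inputs here are assumptions for illustration, not the encoder's data structures.

    /* Hypothetical per-tile pick mirroring search_switchable_restoration(). */
    enum { R_NONE = 0, R_WIENER, R_SGRPROJ, R_TYPES };

    static int pick_tile_type_sketch(const double dist_cost[R_TYPES],
                                     const int signalling_bits[R_TYPES],
                                     double rdmult) {
      int best = R_NONE;
      double best_cost = dist_cost[R_NONE] + rdmult * signalling_bits[R_NONE];
      for (int r = R_NONE + 1; r < R_TYPES; ++r) {
        const double cost = dist_cost[r] + rdmult * signalling_bits[r];
        if (cost < best_cost) {
          best_cost = cost;
          best = r;
        }
      }
      return best;
    }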
for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++) { @@ -1270,9 +1275,9 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi, tile_cost[r], &cpi->trial_frame_rst); } if (plane == AOM_PLANE_Y) - cost_restore[RESTORE_SWITCHABLE] = - search_switchable_restoration(cpi, method == LPF_PICK_FROM_SUBIMAGE, - plane, &cm->rst_info[plane], tile_cost); + cost_restore[RESTORE_SWITCHABLE] = search_switchable_restoration( + src, cpi, method == LPF_PICK_FROM_SUBIMAGE, plane, restore_types, + tile_cost, &cm->rst_info[plane]); else cost_restore[RESTORE_SWITCHABLE] = DBL_MAX; best_cost_restore = DBL_MAX; diff --git a/third_party/aom/av1/encoder/ransac.c b/third_party/aom/av1/encoder/ransac.c index bbd2d179c..c6e3675be 100644 --- a/third_party/aom/av1/encoder/ransac.c +++ b/third_party/aom/av1/encoder/ransac.c @@ -139,6 +139,8 @@ static void normalize_homography(double *pts, int n, double *T) { double msqe = 0; double scale; int i; + + assert(n > 0); for (i = 0; i < n; ++i, p += 2) { mean[0] += p[0]; mean[1] += p[1]; @@ -821,13 +823,15 @@ static int ransac(const int *matched_points, int npoints, // Recompute the motions using only the inliers. for (i = 0; i < num_desired_motions; ++i) { - copy_points_at_indices(points1, corners1, motions[i].inlier_indices, - motions[i].num_inliers); - copy_points_at_indices(points2, corners2, motions[i].inlier_indices, - motions[i].num_inliers); - - find_transformation(motions[i].num_inliers, points1, points2, - params_by_motion + (MAX_PARAMDIM - 1) * i); + if (motions[i].num_inliers >= minpts) { + copy_points_at_indices(points1, corners1, motions[i].inlier_indices, + motions[i].num_inliers); + copy_points_at_indices(points2, corners2, motions[i].inlier_indices, + motions[i].num_inliers); + + find_transformation(motions[i].num_inliers, points1, points2, + params_by_motion + (MAX_PARAMDIM - 1) * i); + } num_inliers_by_motion[i] = motions[i].num_inliers; } diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c index 4552c674e..b546fdffa 100644 --- a/third_party/aom/av1/encoder/ratectrl.c +++ b/third_party/aom/av1/encoder/ratectrl.c @@ -94,8 +94,8 @@ static int kf_high = 5000; static int kf_low = 400; double av1_resize_rate_factor(const AV1_COMP *cpi) { - return (double)(cpi->resize_scale_den * cpi->resize_scale_den) / - (cpi->resize_scale_num * cpi->resize_scale_num); + return (double)(cpi->oxcf.width * cpi->oxcf.height) / + (cpi->common.width * cpi->common.height); } // Functions to compute the active minq lookup table entries based on a @@ -1081,7 +1081,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int *bottom_index, } // Modify active_best_quality for downscaled normal frames. - if (!av1_resize_unscaled(cpi) && !frame_is_kf_gf_arf(cpi)) { + if (!av1_frame_unscaled(cm) && !frame_is_kf_gf_arf(cpi)) { int qdelta = av1_compute_qdelta_by_rate( rc, cm->frame_type, active_best_quality, 2.0, cm->bit_depth); active_best_quality = @@ -1164,7 +1164,7 @@ void av1_rc_set_frame_target(AV1_COMP *cpi, int target) { rc->this_frame_target = target; // Modify frame size target when down-scaled. 
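[Editor's note] The new av1_resize_rate_factor() is simply the ratio of the configured source area to the coded frame area, and av1_rc_set_frame_target() scales the per-frame target by it whenever the frame is coded at a reduced size. A small worked example with assumed dimensions:

    /* Hypothetical: a 1920x1080 source coded at 960x540.                    */
    const double factor = (double)(1920 * 1080) / (960 * 540);   /* == 4.0   */
    const int scaled_target = (int)(10000 * factor);             /* == 40000 */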
- if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && !av1_resize_unscaled(cpi)) + if (!av1_frame_unscaled(cm)) rc->this_frame_target = (int)(rc->this_frame_target * av1_resize_rate_factor(cpi)); @@ -1663,3 +1663,64 @@ void av1_set_target_rate(AV1_COMP *cpi) { vbr_rate_correction(cpi, &target_rate); av1_rc_set_frame_target(cpi, target_rate); } + +static unsigned int lcg_rand16(unsigned int *state) { + *state = (unsigned int)(*state * 1103515245ULL + 12345); + return *state / 65536 % 32768; +} + +uint8_t av1_calculate_next_resize_scale(const AV1_COMP *cpi) { + static unsigned int seed = 56789; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + if (oxcf->pass == 1) return SCALE_DENOMINATOR; + uint8_t new_num = SCALE_DENOMINATOR; + + switch (oxcf->resize_mode) { + case RESIZE_NONE: new_num = SCALE_DENOMINATOR; break; + case RESIZE_FIXED: + if (cpi->common.frame_type == KEY_FRAME) + new_num = oxcf->resize_kf_scale_numerator; + else + new_num = oxcf->resize_scale_numerator; + break; + case RESIZE_DYNAMIC: + // RESIZE_DYNAMIC: Just random for now. + new_num = lcg_rand16(&seed) % 4 + 13; + break; + default: assert(0); + } + return new_num; +} + +#if CONFIG_FRAME_SUPERRES +// TODO(afergs): Rename av1_rc_update_superres_scale(...)? +uint8_t av1_calculate_next_superres_scale(const AV1_COMP *cpi, int width, + int height) { + static unsigned int seed = 34567; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + if (oxcf->pass == 1) return SCALE_DENOMINATOR; + uint8_t new_num = SCALE_DENOMINATOR; + + switch (oxcf->superres_mode) { + case SUPERRES_NONE: new_num = SCALE_DENOMINATOR; break; + case SUPERRES_FIXED: + if (cpi->common.frame_type == KEY_FRAME) + new_num = oxcf->superres_kf_scale_numerator; + else + new_num = oxcf->superres_scale_numerator; + break; + case SUPERRES_DYNAMIC: + // SUPERRES_DYNAMIC: Just random for now. + new_num = lcg_rand16(&seed) % 9 + 8; + break; + default: assert(0); + } + + // Make sure overall reduction is no more than 1/2 of the source size. + av1_calculate_scaled_size(&width, &height, new_num); + if (width * 2 < oxcf->width || height * 2 < oxcf->height) + new_num = SCALE_DENOMINATOR; + + return new_num; +} +#endif // CONFIG_FRAME_SUPERRES diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h index 61bb0c224..4ebdfadd6 100644 --- a/third_party/aom/av1/encoder/ratectrl.h +++ b/third_party/aom/av1/encoder/ratectrl.h @@ -256,6 +256,11 @@ void av1_set_target_rate(struct AV1_COMP *cpi); int av1_resize_one_pass_cbr(struct AV1_COMP *cpi); +uint8_t av1_calculate_next_resize_scale(const struct AV1_COMP *cpi); +#if CONFIG_FRAME_SUPERRES +uint8_t av1_calculate_next_superres_scale(const struct AV1_COMP *cpi, int width, + int height); +#endif // CONFIG_FRAME_SUPERRES #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c index 94c3bb96d..da3b6e209 100644 --- a/third_party/aom/av1/encoder/rd.c +++ b/third_party/aom/av1/encoder/rd.c @@ -50,14 +50,15 @@ // certain modes are assumed to be based on 8x8 blocks. // This table is used to correct for block size. // The factors here are << 2 (2 = x0.5, 32 = x8 etc). 
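[Editor's note] lcg_rand16() above is the textbook ANSI-C linear congruential generator, and in the dynamic modes it only feeds the choice of a scale numerator: 13..16 for RESIZE_DYNAMIC and 8..16 for SUPERRES_DYNAMIC (the latter additionally reverting to SCALE_DENOMINATOR if the scaled frame would drop below half the source size). A tiny self-contained sketch of the draw, assuming SCALE_DENOMINATOR is 16 and reusing the seed value from the patch; illustration only.

    #include <assert.h>

    static unsigned int lcg_rand16_sketch(unsigned int *state) {
      *state = (unsigned int)(*state * 1103515245ULL + 12345);
      return *state / 65536 % 32768;          /* bits 16..30 of the state */
    }

    int main(void) {
      unsigned int seed = 56789;
      const unsigned int resize_num = lcg_rand16_sketch(&seed) % 4 + 13;
      const unsigned int superres_num = lcg_rand16_sketch(&seed) % 9 + 8;
      assert(resize_num >= 13 && resize_num <= 16);
      assert(superres_num >= 8 && superres_num <= 16);
      return 0;
    }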
-static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = { -#if CONFIG_CB4X4 +static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES_ALL] = { +#if CONFIG_CHROMA_2X2 || CONFIG_CHROMA_SUB8X8 2, 2, 2, #endif - 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, + 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, #if CONFIG_EXT_PARTITION - 48, 48, 64 + 48, 48, 64, #endif // CONFIG_EXT_PARTITION + 4, 4, 8, 8 }; static void fill_mode_costs(AV1_COMP *cpi) { @@ -66,16 +67,16 @@ static void fill_mode_costs(AV1_COMP *cpi) { for (i = 0; i < INTRA_MODES; ++i) for (j = 0; j < INTRA_MODES; ++j) - av1_cost_tokens(cpi->y_mode_costs[i][j], av1_kf_y_mode_prob[i][j], - av1_intra_mode_tree); + av1_cost_tokens_from_cdf(cpi->y_mode_costs[i][j], av1_kf_y_mode_cdf[i][j], + av1_intra_mode_inv); for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) - av1_cost_tokens(cpi->mbmode_cost[i], fc->y_mode_prob[i], - av1_intra_mode_tree); + av1_cost_tokens_from_cdf(cpi->mbmode_cost[i], fc->y_mode_cdf[i], + av1_intra_mode_inv); for (i = 0; i < INTRA_MODES; ++i) - av1_cost_tokens(cpi->intra_uv_mode_cost[i], fc->uv_mode_prob[i], - av1_intra_mode_tree); + av1_cost_tokens_from_cdf(cpi->intra_uv_mode_cost[i], fc->uv_mode_cdf[i], + av1_intra_mode_inv); for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) av1_cost_tokens(cpi->switchable_interp_costs[i], @@ -83,20 +84,18 @@ static void fill_mode_costs(AV1_COMP *cpi) { #if CONFIG_PALETTE for (i = 0; i < PALETTE_BLOCK_SIZES; ++i) { - av1_cost_tokens(cpi->palette_y_size_cost[i], - av1_default_palette_y_size_prob[i], av1_palette_size_tree); - av1_cost_tokens(cpi->palette_uv_size_cost[i], - av1_default_palette_uv_size_prob[i], av1_palette_size_tree); + av1_cost_tokens_from_cdf(cpi->palette_y_size_cost[i], + fc->palette_y_size_cdf[i], NULL); + av1_cost_tokens_from_cdf(cpi->palette_uv_size_cost[i], + fc->palette_uv_size_cdf[i], NULL); } for (i = 0; i < PALETTE_SIZES; ++i) { for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) { - av1_cost_tokens(cpi->palette_y_color_cost[i][j], - av1_default_palette_y_color_index_prob[i][j], - av1_palette_color_index_tree[i]); - av1_cost_tokens(cpi->palette_uv_color_cost[i][j], - av1_default_palette_uv_color_index_prob[i][j], - av1_palette_color_index_tree[i]); + av1_cost_tokens_from_cdf(cpi->palette_y_color_cost[i][j], + fc->palette_y_color_index_cdf[i][j], NULL); + av1_cost_tokens_from_cdf(cpi->palette_uv_color_cost[i][j], + fc->palette_uv_color_index_cdf[i][j], NULL); } } #endif // CONFIG_PALETTE @@ -147,8 +146,9 @@ static void fill_mode_costs(AV1_COMP *cpi) { av1_switchable_restore_tree); #endif // CONFIG_LOOP_RESTORATION #if CONFIG_GLOBAL_MOTION - av1_cost_tokens(cpi->gmtype_cost, fc->global_motion_types_prob, - av1_global_motion_types_tree); + for (i = 0; i < TRANS_TYPES; ++i) + cpi->gmtype_cost[i] = (1 + (i > 0 ? GLOBAL_TYPE_BITS : 0)) + << AV1_PROB_COST_SHIFT; #endif // CONFIG_GLOBAL_MOTION } @@ -301,7 +301,7 @@ static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) { 0, MAXQ); const int q = compute_rd_thresh_factor(qindex, cm->bit_depth); - for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { + for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { // Threshold here seems unnecessarily harsh but fine given actual // range of values used for cpi->sf.thresh_mult[]. const int t = q * rd_thresh_block_size_factor[bsize]; @@ -350,7 +350,6 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) { aom_clear_system_state(); - rd->RDDIV = RDDIV_BITS; // In bits (to multiply D by 128). 
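[Editor's note] fill_mode_costs() now derives several tables with av1_cost_tokens_from_cdf() instead of the old tree/probability pairs. Conceptually each entry is the symbol's negative log-probability expressed in 1/512-bit units (AV1_PROB_COST_SHIFT is 9). A hedged sketch of that conversion, assuming a plain non-inverted CDF in units of 1/32768; this is illustrative and is not the library routine.

    #include <math.h>
    #include <stdint.h>

    /* Hypothetical: turn a CDF into per-symbol costs, cost(i) ~ -log2(p_i)
     * scaled by 1 << 9, the same unit av1_cost_bit() uses.                 */
    static void costs_from_cdf_sketch(int *costs, const uint16_t *cdf,
                                      int nsymbs) {
      int prev = 0;
      for (int i = 0; i < nsymbs; ++i) {
        const int p = cdf[i] - prev;          /* symbol probability, 1/32768 */
        prev = cdf[i];
        costs[i] = (int)(0.5 - log2(p / 32768.0) * (1 << 9));
      }
    }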
rd->RDMULT = av1_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q); set_error_per_bit(x, rd->RDMULT); @@ -367,6 +366,16 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) { x->mvcost = x->mv_cost_stack[0]; x->nmvjointcost = x->nmv_vec_cost[0]; +#if CONFIG_INTRABC + if (frame_is_intra_only(cm) && cm->allow_screen_content_tools && + cpi->oxcf.pass != 1) { + av1_build_nmv_cost_table( + x->nmv_vec_cost[0], + cm->allow_high_precision_mv ? x->nmvcost_hp[0] : x->nmvcost[0], + &cm->fc->ndvc, MV_SUBPEL_NONE); + } +#endif + if (cpi->oxcf.pass != 1) { av1_fill_token_costs(x->token_costs, cm->fc->coef_probs); @@ -434,6 +443,12 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) { av1_cost_tokens((int *)cpi->inter_compound_mode_cost[i], cm->fc->inter_compound_mode_probs[i], av1_inter_compound_mode_tree); +#if CONFIG_COMPOUND_SINGLEREF + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) + av1_cost_tokens((int *)cpi->inter_singleref_comp_mode_cost[i], + cm->fc->inter_singleref_comp_mode_probs[i], + av1_inter_singleref_comp_mode_tree); +#endif // CONFIG_COMPOUND_SINGLEREF #if CONFIG_INTERINTRA for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) av1_cost_tokens((int *)cpi->interintra_mode_cost[i], @@ -442,16 +457,22 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) { #endif // CONFIG_INTERINTRA #endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - for (i = BLOCK_8X8; i < BLOCK_SIZES; i++) { + for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { av1_cost_tokens((int *)cpi->motion_mode_cost[i], cm->fc->motion_mode_prob[i], av1_motion_mode_tree); } #if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION - for (i = BLOCK_8X8; i < BLOCK_SIZES; i++) { + for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { cpi->motion_mode_cost1[i][0] = av1_cost_bit(cm->fc->obmc_prob[i], 0); cpi->motion_mode_cost1[i][1] = av1_cost_bit(cm->fc->obmc_prob[i], 1); } #endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION +#if CONFIG_MOTION_VAR && CONFIG_NCOBMC_ADAPT_WEIGHT + for (i = ADAPT_OVERLAP_BLOCK_8X8; i < ADAPT_OVERLAP_BLOCKS; ++i) { + av1_cost_tokens((int *)cpi->ncobmc_mode_cost[i], + cm->fc->ncobmc_mode_prob[i], av1_ncobmc_mode_tree); + } +#endif #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION } } @@ -648,7 +669,7 @@ static void get_entropy_contexts_plane( for (i = 0; i < num_4x4_h; i += 8) t_left[i] = !!*(const uint64_t *)&left[i]; break; -#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) case TX_4X16: for (i = 0; i < num_4x4_w; i += 2) t_above[i] = !!*(const uint16_t *)&above[i]; @@ -675,7 +696,7 @@ static void get_entropy_contexts_plane( for (i = 0; i < num_4x4_h; i += 4) t_left[i] = !!*(const uint32_t *)&left[i]; break; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#endif default: assert(0 && "Invalid transform size."); break; } @@ -749,7 +770,7 @@ static void get_entropy_contexts_plane( for (i = 0; i < num_4x4_h; i += 4) t_left[i] = !!*(const uint32_t *)&left[i]; break; -#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) case TX_4X16: memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); for (i = 0; i < num_4x4_h; i += 4) @@ -772,7 +793,7 @@ static void get_entropy_contexts_plane( for (i = 0; i < num_4x4_h; i += 2) t_left[i] = !!*(const uint16_t *)&left[i]; break; -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#endif default: assert(0 && "Invalid transform size."); break; } } @@ -781,7 +802,7 @@ void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, 
const struct macroblockd_plane *pd, ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE], ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) { -#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 +#if CONFIG_CHROMA_SUB8X8 const BLOCK_SIZE plane_bsize = AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); #else @@ -983,6 +1004,54 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { #if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SINGLEREF + rd->thresh_mult[THR_SR_NEAREST_NEARMV] += 1200; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_SR_NEAREST_NEARL2] += 1200; + rd->thresh_mult[THR_SR_NEAREST_NEARL3] += 1200; + rd->thresh_mult[THR_SR_NEAREST_NEARB] += 1200; +#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_SR_NEAREST_NEARA] += 1200; + rd->thresh_mult[THR_SR_NEAREST_NEARG] += 1200; + + /* + rd->thresh_mult[THR_SR_NEAREST_NEWMV] += 1200; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_SR_NEAREST_NEWL2] += 1200; + rd->thresh_mult[THR_SR_NEAREST_NEWL3] += 1200; + rd->thresh_mult[THR_SR_NEAREST_NEWB] += 1200; +#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_SR_NEAREST_NEWA] += 1200; + rd->thresh_mult[THR_SR_NEAREST_NEWG] += 1200;*/ + + rd->thresh_mult[THR_SR_NEAR_NEWMV] += 1500; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_SR_NEAR_NEWL2] += 1500; + rd->thresh_mult[THR_SR_NEAR_NEWL3] += 1500; + rd->thresh_mult[THR_SR_NEAR_NEWB] += 1500; +#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_SR_NEAR_NEWA] += 1500; + rd->thresh_mult[THR_SR_NEAR_NEWG] += 1500; + + rd->thresh_mult[THR_SR_ZERO_NEWMV] += 2000; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_SR_ZERO_NEWL2] += 2000; + rd->thresh_mult[THR_SR_ZERO_NEWL3] += 2000; + rd->thresh_mult[THR_SR_ZERO_NEWB] += 2000; +#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_SR_ZERO_NEWA] += 2000; + rd->thresh_mult[THR_SR_ZERO_NEWG] += 2000; + + rd->thresh_mult[THR_SR_NEW_NEWMV] += 1700; +#if CONFIG_EXT_REFS + rd->thresh_mult[THR_SR_NEW_NEWL2] += 1700; + rd->thresh_mult[THR_SR_NEW_NEWL3] += 1700; + rd->thresh_mult[THR_SR_NEW_NEWB] += 1700; +#endif // CONFIG_EXT_REFS + rd->thresh_mult[THR_SR_NEW_NEWA] += 1700; + rd->thresh_mult[THR_SR_NEW_NEWG] += 1700; +#endif // CONFIG_COMPOUND_SINGLEREF + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000; #if CONFIG_EXT_REFS rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000; @@ -994,6 +1063,13 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] += 1000; + +#if CONFIG_EXT_COMP_REFS + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] += 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] += 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] += 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] += 1000; +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS #else // CONFIG_EXT_INTER @@ -1009,6 +1085,12 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEARESTL2B] += 1000; rd->thresh_mult[THR_COMP_NEARESTL3B] += 1000; rd->thresh_mult[THR_COMP_NEARESTGB] += 1000; +#if CONFIG_EXT_COMP_REFS + rd->thresh_mult[THR_COMP_NEARESTLL2] += 1000; + rd->thresh_mult[THR_COMP_NEARESTLL3] += 1000; + rd->thresh_mult[THR_COMP_NEARESTLG] += 1000; + rd->thresh_mult[THR_COMP_NEARESTBA] += 1000; +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS #endif // CONFIG_EXT_INTER @@ -1081,6 +1163,40 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEW_NEARGB] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWGB] += 2000; rd->thresh_mult[THR_COMP_ZERO_ZEROGB] += 2500; + +#if CONFIG_EXT_COMP_REFS + 
rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEARLL2] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEWLL2] += 2000; + rd->thresh_mult[THR_COMP_ZERO_ZEROLL2] += 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARLL3] += 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] += 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWLL3] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEARLL3] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEWLL3] += 2000; + rd->thresh_mult[THR_COMP_ZERO_ZEROLL3] += 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTLG] += 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEARLG] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEWLG] += 2000; + rd->thresh_mult[THR_COMP_ZERO_ZEROLG] += 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARBA] += 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWBA] += 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTBA] += 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWBA] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEARBA] += 1700; + rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2000; + rd->thresh_mult[THR_COMP_ZERO_ZEROBA] += 2500; +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS #else // CONFIG_EXT_INTER @@ -1105,6 +1221,17 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEWL3B] += 2000; rd->thresh_mult[THR_COMP_NEARGB] += 1500; rd->thresh_mult[THR_COMP_NEWGB] += 2000; + +#if CONFIG_EXT_COMP_REFS + rd->thresh_mult[THR_COMP_NEARLL2] += 1500; + rd->thresh_mult[THR_COMP_NEWLL2] += 2000; + rd->thresh_mult[THR_COMP_NEARLL3] += 1500; + rd->thresh_mult[THR_COMP_NEWLL3] += 2000; + rd->thresh_mult[THR_COMP_NEARLG] += 1500; + rd->thresh_mult[THR_COMP_NEWLG] += 2000; + rd->thresh_mult[THR_COMP_NEARBA] += 1500; + rd->thresh_mult[THR_COMP_NEWBA] += 2000; +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS rd->thresh_mult[THR_COMP_ZEROLA] += 2500; @@ -1119,6 +1246,13 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_ZEROL2B] += 2500; rd->thresh_mult[THR_COMP_ZEROL3B] += 2500; rd->thresh_mult[THR_COMP_ZEROGB] += 2500; + +#if CONFIG_EXT_COMP_REFS + rd->thresh_mult[THR_COMP_ZEROLL2] += 2500; + rd->thresh_mult[THR_COMP_ZEROLL3] += 2500; + rd->thresh_mult[THR_COMP_ZEROLG] += 2500; + rd->thresh_mult[THR_COMP_ZEROBA] += 2500; +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS #endif // CONFIG_EXT_INTER diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h index 5c3eee493..ea5115b41 100644 --- a/third_party/aom/av1/encoder/rd.h +++ b/third_party/aom/av1/encoder/rd.h @@ -30,12 +30,13 @@ extern "C" { #define RDDIV_BITS 7 #define RD_EPB_SHIFT 6 -#define RDCOST(RM, DM, R, D) \ - (ROUND_POWER_OF_TWO(((int64_t)R) * (RM), AV1_PROB_COST_SHIFT) + (D << DM)) +#define RDCOST(RM, R, D) \ + (ROUND_POWER_OF_TWO(((int64_t)R) * (RM), AV1_PROB_COST_SHIFT) + \ + (D << RDDIV_BITS)) -#define RDCOST_DBL(RM, DM, R, D) \ +#define RDCOST_DBL(RM, R, D) \ (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \ - ((double)(D) * (1 << (DM)))) + ((double)(D) * (1 << RDDIV_BITS))) #define QIDX_SKIP_THRESH 115 @@ -96,6 +97,54 @@ typedef enum { #if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SINGLEREF + THR_SR_NEAREST_NEARMV, +#if CONFIG_EXT_REFS + THR_SR_NEAREST_NEARL2, + THR_SR_NEAREST_NEARL3, 
+ THR_SR_NEAREST_NEARB, +#endif // CONFIG_EXT_REFS + THR_SR_NEAREST_NEARG, + THR_SR_NEAREST_NEARA, + + /* + THR_SR_NEAREST_NEWMV, +#if CONFIG_EXT_REFS + THR_SR_NEAREST_NEWL2, + THR_SR_NEAREST_NEWL3, + THR_SR_NEAREST_NEWB, +#endif // CONFIG_EXT_REFS + THR_SR_NEAREST_NEWG, + THR_SR_NEAREST_NEWA,*/ + + THR_SR_NEAR_NEWMV, +#if CONFIG_EXT_REFS + THR_SR_NEAR_NEWL2, + THR_SR_NEAR_NEWL3, + THR_SR_NEAR_NEWB, +#endif // CONFIG_EXT_REFS + THR_SR_NEAR_NEWG, + THR_SR_NEAR_NEWA, + + THR_SR_ZERO_NEWMV, +#if CONFIG_EXT_REFS + THR_SR_ZERO_NEWL2, + THR_SR_ZERO_NEWL3, + THR_SR_ZERO_NEWB, +#endif // CONFIG_EXT_REFS + THR_SR_ZERO_NEWG, + THR_SR_ZERO_NEWA, + + THR_SR_NEW_NEWMV, +#if CONFIG_EXT_REFS + THR_SR_NEW_NEWL2, + THR_SR_NEW_NEWL3, + THR_SR_NEW_NEWB, +#endif // CONFIG_EXT_REFS + THR_SR_NEW_NEWG, + THR_SR_NEW_NEWA, +#endif // CONFIG_COMPOUND_SINGLEREF + THR_COMP_NEAREST_NEARESTLA, #if CONFIG_EXT_REFS THR_COMP_NEAREST_NEARESTL2A, @@ -107,6 +156,12 @@ typedef enum { THR_COMP_NEAREST_NEARESTL2B, THR_COMP_NEAREST_NEARESTL3B, THR_COMP_NEAREST_NEARESTGB, +#if CONFIG_EXT_COMP_REFS + THR_COMP_NEAREST_NEARESTLL2, + THR_COMP_NEAREST_NEARESTLL3, + THR_COMP_NEAREST_NEARESTLG, + THR_COMP_NEAREST_NEARESTBA, +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS #else // CONFIG_EXT_INTER @@ -122,6 +177,12 @@ typedef enum { THR_COMP_NEARESTL2B, THR_COMP_NEARESTL3B, THR_COMP_NEARESTGB, +#if CONFIG_EXT_COMP_REFS + THR_COMP_NEARESTLL2, + THR_COMP_NEARESTLL3, + THR_COMP_NEARESTLG, + THR_COMP_NEARESTBA, +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS #endif // CONFIG_EXT_INTER @@ -138,8 +199,6 @@ typedef enum { #if CONFIG_EXT_INTER - THR_COMP_NEAR_NEARESTLA, - THR_COMP_NEAREST_NEARLA, THR_COMP_NEAR_NEARLA, THR_COMP_NEW_NEARESTLA, THR_COMP_NEAREST_NEWLA, @@ -149,8 +208,6 @@ typedef enum { THR_COMP_ZERO_ZEROLA, #if CONFIG_EXT_REFS - THR_COMP_NEAR_NEARESTL2A, - THR_COMP_NEAREST_NEARL2A, THR_COMP_NEAR_NEARL2A, THR_COMP_NEW_NEARESTL2A, THR_COMP_NEAREST_NEWL2A, @@ -159,8 +216,6 @@ typedef enum { THR_COMP_NEW_NEWL2A, THR_COMP_ZERO_ZEROL2A, - THR_COMP_NEAR_NEARESTL3A, - THR_COMP_NEAREST_NEARL3A, THR_COMP_NEAR_NEARL3A, THR_COMP_NEW_NEARESTL3A, THR_COMP_NEAREST_NEWL3A, @@ -170,8 +225,6 @@ typedef enum { THR_COMP_ZERO_ZEROL3A, #endif // CONFIG_EXT_REFS - THR_COMP_NEAR_NEARESTGA, - THR_COMP_NEAREST_NEARGA, THR_COMP_NEAR_NEARGA, THR_COMP_NEW_NEARESTGA, THR_COMP_NEAREST_NEWGA, @@ -181,8 +234,6 @@ typedef enum { THR_COMP_ZERO_ZEROGA, #if CONFIG_EXT_REFS - THR_COMP_NEAR_NEARESTLB, - THR_COMP_NEAREST_NEARLB, THR_COMP_NEAR_NEARLB, THR_COMP_NEW_NEARESTLB, THR_COMP_NEAREST_NEWLB, @@ -191,8 +242,6 @@ typedef enum { THR_COMP_NEW_NEWLB, THR_COMP_ZERO_ZEROLB, - THR_COMP_NEAR_NEARESTL2B, - THR_COMP_NEAREST_NEARL2B, THR_COMP_NEAR_NEARL2B, THR_COMP_NEW_NEARESTL2B, THR_COMP_NEAREST_NEWL2B, @@ -201,8 +250,6 @@ typedef enum { THR_COMP_NEW_NEWL2B, THR_COMP_ZERO_ZEROL2B, - THR_COMP_NEAR_NEARESTL3B, - THR_COMP_NEAREST_NEARL3B, THR_COMP_NEAR_NEARL3B, THR_COMP_NEW_NEARESTL3B, THR_COMP_NEAREST_NEWL3B, @@ -211,8 +258,6 @@ typedef enum { THR_COMP_NEW_NEWL3B, THR_COMP_ZERO_ZEROL3B, - THR_COMP_NEAR_NEARESTGB, - THR_COMP_NEAREST_NEARGB, THR_COMP_NEAR_NEARGB, THR_COMP_NEW_NEARESTGB, THR_COMP_NEAREST_NEWGB, @@ -220,6 +265,40 @@ typedef enum { THR_COMP_NEAR_NEWGB, THR_COMP_NEW_NEWGB, THR_COMP_ZERO_ZEROGB, + +#if CONFIG_EXT_COMP_REFS + THR_COMP_NEAR_NEARLL2, + THR_COMP_NEW_NEARESTLL2, + THR_COMP_NEAREST_NEWLL2, + THR_COMP_NEW_NEARLL2, + THR_COMP_NEAR_NEWLL2, + THR_COMP_NEW_NEWLL2, + THR_COMP_ZERO_ZEROLL2, + + THR_COMP_NEAR_NEARLL3, + THR_COMP_NEW_NEARESTLL3, 
+ THR_COMP_NEAREST_NEWLL3, + THR_COMP_NEW_NEARLL3, + THR_COMP_NEAR_NEWLL3, + THR_COMP_NEW_NEWLL3, + THR_COMP_ZERO_ZEROLL3, + + THR_COMP_NEAR_NEARLG, + THR_COMP_NEW_NEARESTLG, + THR_COMP_NEAREST_NEWLG, + THR_COMP_NEW_NEARLG, + THR_COMP_NEAR_NEWLG, + THR_COMP_NEW_NEWLG, + THR_COMP_ZERO_ZEROLG, + + THR_COMP_NEAR_NEARBA, + THR_COMP_NEW_NEARESTBA, + THR_COMP_NEAREST_NEWBA, + THR_COMP_NEW_NEARBA, + THR_COMP_NEAR_NEWBA, + THR_COMP_NEW_NEWBA, + THR_COMP_ZERO_ZEROBA, +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS #else // CONFIG_EXT_INTER @@ -244,6 +323,17 @@ typedef enum { THR_COMP_NEWL3B, THR_COMP_NEARGB, THR_COMP_NEWGB, + +#if CONFIG_EXT_COMP_REFS + THR_COMP_NEARLL2, + THR_COMP_NEWLL2, + THR_COMP_NEARLL3, + THR_COMP_NEWLL3, + THR_COMP_NEARLG, + THR_COMP_NEWLG, + THR_COMP_NEARBA, + THR_COMP_NEWBA, +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS THR_COMP_ZEROLA, @@ -258,6 +348,13 @@ typedef enum { THR_COMP_ZEROL2B, THR_COMP_ZEROL3B, THR_COMP_ZEROGB, + +#if CONFIG_EXT_COMP_REFS + THR_COMP_ZEROLL2, + THR_COMP_ZEROLL3, + THR_COMP_ZEROLG, + THR_COMP_ZEROBA, +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS #endif // CONFIG_EXT_INTER @@ -344,12 +441,11 @@ typedef struct RD_OPT { int thresh_mult[MAX_MODES]; int thresh_mult_sub8x8[MAX_REFS]; - int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES]; + int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES]; int64_t prediction_type_threshes[TOTAL_REFS_PER_FRAME][REFERENCE_MODES]; int RDMULT; - int RDDIV; } RD_OPT; static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { @@ -361,7 +457,9 @@ static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { rd_stats->rdcost = 0; rd_stats->sse = 0; rd_stats->skip = 1; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + rd_stats->zero_rate = 0; + rd_stats->ref_rdcost = INT64_MAX; +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 rd_stats->dist_y = 0; #endif #if CONFIG_RD_DEBUG @@ -388,7 +486,9 @@ static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) { rd_stats->rdcost = INT64_MAX; rd_stats->sse = INT64_MAX; rd_stats->skip = 0; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + rd_stats->zero_rate = 0; + rd_stats->ref_rdcost = INT64_MAX; +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 rd_stats->dist_y = INT64_MAX; #endif #if CONFIG_RD_DEBUG @@ -415,7 +515,7 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, rd_stats_dst->dist += rd_stats_src->dist; rd_stats_dst->sse += rd_stats_src->sse; rd_stats_dst->skip &= rd_stats_src->skip; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 rd_stats_dst->dist_y += rd_stats_src->dist_y; #endif #if CONFIG_RD_DEBUG diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c index 2a537a06a..43b00b83b 100644 --- a/third_party/aom/av1/encoder/rdopt.c +++ b/third_party/aom/av1/encoder/rdopt.c @@ -63,7 +63,7 @@ #endif // CONFIG_PVQ #if CONFIG_PVQ || CONFIG_DAALA_DIST #include "av1/common/pvq.h" -#endif // CONFIG_PVQ || CONFIG_DAALA_DIST +#endif // CONFIG_PVQ || CONFIG_DIST_8X8 #if CONFIG_DUAL_FILTER #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) #if USE_EXTRA_FILTER @@ -113,8 +113,14 @@ static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = { #endif // CONFIG_EXT_REFS #if CONFIG_EXT_REFS +#if CONFIG_EXT_COMP_REFS +#define SECOND_REF_FRAME_MASK \ + ((1 << ALTREF_FRAME) | (1 << BWDREF_FRAME) | (1 << GOLDEN_FRAME) | \ + (1 << LAST2_FRAME) | 0x01) // NOLINT +#else // !CONFIG_EXT_COMP_REFS #define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | (1 << BWDREF_FRAME) | 0x01) -#else +#endif // CONFIG_EXT_COMP_REFS +#else 
// !CONFIG_EXT_REFS #define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01) #endif // CONFIG_EXT_REFS @@ -126,6 +132,11 @@ static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = { #define FILTER_FAST_SEARCH 1 #endif // CONFIG_EXT_INTRA +// Setting this to 1 will disable trellis optimization within the +// transform search. Trellis optimization will still be applied +// in the final encode. +#define DISABLE_TRELLISQ_SEARCH 0 + const double ADST_FLIP_SVM[8] = { -6.6623, -2.8062, -3.2531, 3.1671, // vert -7.7051, -3.2234, -3.6193, 3.4533 }; // horz @@ -191,6 +202,56 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { // TODO(zoeliu): May need to reconsider the order on the modes to check #if CONFIG_EXT_INTER + +#if CONFIG_COMPOUND_SINGLEREF + // Single ref comp mode + { SR_NEAREST_NEARMV, { LAST_FRAME, NONE_FRAME } }, +#if CONFIG_EXT_REFS + { SR_NEAREST_NEARMV, { LAST2_FRAME, NONE_FRAME } }, + { SR_NEAREST_NEARMV, { LAST3_FRAME, NONE_FRAME } }, + { SR_NEAREST_NEARMV, { BWDREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_EXT_REFS + { SR_NEAREST_NEARMV, { GOLDEN_FRAME, NONE_FRAME } }, + { SR_NEAREST_NEARMV, { ALTREF_FRAME, NONE_FRAME } }, + + /* + { SR_NEAREST_NEWMV, { LAST_FRAME, NONE_FRAME } }, +#if CONFIG_EXT_REFS + { SR_NEAREST_NEWMV, { LAST2_FRAME, NONE_FRAME } }, + { SR_NEAREST_NEWMV, { LAST3_FRAME, NONE_FRAME } }, + { SR_NEAREST_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_EXT_REFS + { SR_NEAREST_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, + { SR_NEAREST_NEWMV, { ALTREF_FRAME, NONE_FRAME } },*/ + + { SR_NEAR_NEWMV, { LAST_FRAME, NONE_FRAME } }, +#if CONFIG_EXT_REFS + { SR_NEAR_NEWMV, { LAST2_FRAME, NONE_FRAME } }, + { SR_NEAR_NEWMV, { LAST3_FRAME, NONE_FRAME } }, + { SR_NEAR_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_EXT_REFS + { SR_NEAR_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, + { SR_NEAR_NEWMV, { ALTREF_FRAME, NONE_FRAME } }, + + { SR_ZERO_NEWMV, { LAST_FRAME, NONE_FRAME } }, +#if CONFIG_EXT_REFS + { SR_ZERO_NEWMV, { LAST2_FRAME, NONE_FRAME } }, + { SR_ZERO_NEWMV, { LAST3_FRAME, NONE_FRAME } }, + { SR_ZERO_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_EXT_REFS + { SR_ZERO_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, + { SR_ZERO_NEWMV, { ALTREF_FRAME, NONE_FRAME } }, + + { SR_NEW_NEWMV, { LAST_FRAME, NONE_FRAME } }, +#if CONFIG_EXT_REFS + { SR_NEW_NEWMV, { LAST2_FRAME, NONE_FRAME } }, + { SR_NEW_NEWMV, { LAST3_FRAME, NONE_FRAME } }, + { SR_NEW_NEWMV, { BWDREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_EXT_REFS + { SR_NEW_NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, + { SR_NEW_NEWMV, { ALTREF_FRAME, NONE_FRAME } }, +#endif // CONFIG_COMPOUND_SINGLEREF + { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, #if CONFIG_EXT_REFS { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, @@ -202,6 +263,13 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + +#if CONFIG_EXT_COMP_REFS + { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, + { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, + { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS #else // CONFIG_EXT_INTER @@ -217,6 +285,13 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + 
+#if CONFIG_EXT_COMP_REFS + { NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, + { NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, + { NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS #endif // CONFIG_EXT_INTER @@ -297,9 +372,43 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { ZERO_ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + +#if CONFIG_EXT_COMP_REFS + { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { ZERO_ZEROMV, { LAST_FRAME, LAST2_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { ZERO_ZEROMV, { LAST_FRAME, LAST3_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { ZERO_ZEROMV, { LAST_FRAME, GOLDEN_FRAME } }, + + { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { ZERO_ZEROMV, { BWDREF_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS -#else // CONFIG_EXT_INTER +#else // !CONFIG_EXT_INTER { NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEWMV, { LAST_FRAME, ALTREF_FRAME } }, @@ -321,6 +430,17 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + +#if CONFIG_EXT_COMP_REFS + { NEARMV, { LAST_FRAME, LAST2_FRAME } }, + { NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { NEARMV, { LAST_FRAME, LAST3_FRAME } }, + { NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS { ZEROMV, { LAST_FRAME, ALTREF_FRAME } }, @@ -335,6 +455,13 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { { ZEROMV, { LAST2_FRAME, BWDREF_FRAME } }, { ZEROMV, { LAST3_FRAME, BWDREF_FRAME } }, { ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + +#if CONFIG_EXT_COMP_REFS + { ZEROMV, { LAST_FRAME, LAST2_FRAME } }, + { ZEROMV, { LAST_FRAME, LAST3_FRAME } }, + { ZEROMV, { LAST_FRAME, GOLDEN_FRAME } }, + { ZEROMV, { BWDREF_FRAME, ALTREF_FRAME } }, +#endif // CONFIG_EXT_COMP_REFS #endif // CONFIG_EXT_REFS #endif // CONFIG_EXT_INTER @@ -385,6 +512,35 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = { #endif // CONFIG_EXT_INTER }; +static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = { + DC_PRED, H_PRED, V_PRED, +#if CONFIG_ALT_INTRA + 
SMOOTH_PRED, +#endif // CONFIG_ALT_INTRA + TM_PRED, +#if CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV + SMOOTH_V_PRED, SMOOTH_H_PRED, +#endif // CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV + D135_PRED, D207_PRED, D153_PRED, D63_PRED, D117_PRED, D45_PRED, +}; + +#if CONFIG_CFL +static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = { + UV_DC_PRED, UV_H_PRED, UV_V_PRED, +#if CONFIG_ALT_INTRA + UV_SMOOTH_PRED, +#endif // CONFIG_ALT_INTRA + UV_TM_PRED, +#if CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV + UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED, +#endif // CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV + UV_D135_PRED, UV_D207_PRED, UV_D153_PRED, + UV_D63_PRED, UV_D117_PRED, UV_D45_PRED, +}; +#else +#define uv_rd_search_mode_order intra_rd_search_mode_order +#endif // CONFIG_CFL + #if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE static INLINE int write_uniform_cost(int n, int v) { const int l = get_unsigned_bits(n); @@ -404,7 +560,7 @@ static INLINE int write_uniform_cost(int n, int v) { #define FAST_EXT_TX_EDST_MARGIN 0.3 #if CONFIG_DAALA_DIST -static int od_compute_var_4x4(od_coeff *x, int stride) { +static int od_compute_var_4x4(uint16_t *x, int stride) { int sum; int s2; int i; @@ -420,7 +576,7 @@ static int od_compute_var_4x4(od_coeff *x, int stride) { s2 += t * t; } } - // TODO(yushin) : Check wheter any changes are required for high bit depth. + return (s2 - (sum * sum >> 4)) >> 4; } @@ -431,8 +587,8 @@ static int od_compute_var_4x4(od_coeff *x, int stride) { #define OD_DIST_LP_MID (5) #define OD_DIST_LP_NORM (OD_DIST_LP_MID + 2) -static double od_compute_dist_8x8(int qm, int use_activity_masking, od_coeff *x, - od_coeff *y, od_coeff *e_lp, int stride) { +static double od_compute_dist_8x8(int use_activity_masking, uint16_t *x, + uint16_t *y, od_coeff *e_lp, int stride) { double sum; int min_var; double mean_var; @@ -444,8 +600,7 @@ static double od_compute_dist_8x8(int qm, int use_activity_masking, od_coeff *x, double vardist; vardist = 0; - OD_ASSERT(qm != OD_FLAT_QM); - (void)qm; + #if 1 min_var = INT_MAX; mean_var = 0; @@ -490,22 +645,61 @@ static double od_compute_dist_8x8(int qm, int use_activity_masking, od_coeff *x, } // Note : Inputs x and y are in a pixel domain -static double od_compute_dist(int qm, int activity_masking, od_coeff *x, - od_coeff *y, int bsize_w, int bsize_h, - int qindex) { +static double od_compute_dist_common(int activity_masking, uint16_t *x, + uint16_t *y, int bsize_w, int bsize_h, + int qindex, od_coeff *tmp, + od_coeff *e_lp) { + int i, j; + double sum = 0; + const int mid = OD_DIST_LP_MID; + + for (j = 0; j < bsize_w; j++) { + e_lp[j] = mid * tmp[j] + 2 * tmp[bsize_w + j]; + e_lp[(bsize_h - 1) * bsize_w + j] = mid * tmp[(bsize_h - 1) * bsize_w + j] + + 2 * tmp[(bsize_h - 2) * bsize_w + j]; + } + for (i = 1; i < bsize_h - 1; i++) { + for (j = 0; j < bsize_w; j++) { + e_lp[i * bsize_w + j] = mid * tmp[i * bsize_w + j] + + tmp[(i - 1) * bsize_w + j] + + tmp[(i + 1) * bsize_w + j]; + } + } + for (i = 0; i < bsize_h; i += 8) { + for (j = 0; j < bsize_w; j += 8) { + sum += od_compute_dist_8x8(activity_masking, &x[i * bsize_w + j], + &y[i * bsize_w + j], &e_lp[i * bsize_w + j], + bsize_w); + } + } + /* Scale according to linear regression against SSE, for 8x8 blocks. */ + if (activity_masking) { + sum *= 2.2 + (1.7 - 2.2) * (qindex - 99) / (210 - 99) + + (qindex < 99 ? 2.5 * (qindex - 99) / 99 * (qindex - 99) / 99 : 0); + } else { + sum *= qindex >= 128 + ? 1.4 + (0.9 - 1.4) * (qindex - 128) / (209 - 128) + : qindex <= 43 ? 
1.5 + (2.0 - 1.5) * (qindex - 43) / (16 - 43) + : 1.5 + (1.4 - 1.5) * (qindex - 43) / (128 - 43); + } + + return sum; +} + +static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w, + int bsize_h, int qindex) { int i; double sum; sum = 0; assert(bsize_w >= 8 && bsize_h >= 8); - if (qm == OD_FLAT_QM) { - for (i = 0; i < bsize_w * bsize_h; i++) { - double tmp; - tmp = x[i] - y[i]; - sum += tmp * tmp; - } - } else { +#if CONFIG_PVQ + int activity_masking = 1; +#else + int activity_masking = 0; +#endif + { int j; DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]); DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]); @@ -525,63 +719,242 @@ static double od_compute_dist(int qm, int activity_masking, od_coeff *x, e[i * bsize_w + j - 1] + e[i * bsize_w + j + 1]; } } - for (j = 0; j < bsize_w; j++) { - e_lp[j] = mid * tmp[j] + 2 * tmp[bsize_w + j]; - e_lp[(bsize_h - 1) * bsize_w + j] = - mid * tmp[(bsize_h - 1) * bsize_w + j] + - 2 * tmp[(bsize_h - 2) * bsize_w + j]; - } - for (i = 1; i < bsize_h - 1; i++) { + sum = od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h, + qindex, tmp, e_lp); + } + return sum; +} + +static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w, + int bsize_h, int qindex) { + int i; + double sum; + sum = 0; + + assert(bsize_w >= 8 && bsize_h >= 8); + +#if CONFIG_PVQ + int activity_masking = 1; +#else + int activity_masking = 0; +#endif + { + int j; + DECLARE_ALIGNED(16, uint16_t, y[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]); + int mid = OD_DIST_LP_MID; + for (i = 0; i < bsize_h; i++) { for (j = 0; j < bsize_w; j++) { - e_lp[i * bsize_w + j] = mid * tmp[i * bsize_w + j] + - tmp[(i - 1) * bsize_w + j] + - tmp[(i + 1) * bsize_w + j]; + y[i * bsize_w + j] = x[i * bsize_w + j] - e[i * bsize_w + j]; } } - for (i = 0; i < bsize_h; i += 8) { - for (j = 0; j < bsize_w; j += 8) { - sum += od_compute_dist_8x8(qm, activity_masking, &x[i * bsize_w + j], - &y[i * bsize_w + j], &e_lp[i * bsize_w + j], - bsize_w); + for (i = 0; i < bsize_h; i++) { + tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1]; + tmp[i * bsize_w + bsize_w - 1] = + mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2]; + for (j = 1; j < bsize_w - 1; j++) { + tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + + e[i * bsize_w + j - 1] + e[i * bsize_w + j + 1]; } } - /* Scale according to linear regression against SSE, for 8x8 blocks. */ - if (activity_masking) { - sum *= 2.2 + (1.7 - 2.2) * (qindex - 99) / (210 - 99) + - (qindex < 99 ? 2.5 * (qindex - 99) / 99 * (qindex - 99) / 99 : 0); - } else { - sum *= qindex >= 128 - ? 1.4 + (0.9 - 1.4) * (qindex - 128) / (209 - 128) - : qindex <= 43 - ? 
1.5 + (2.0 - 1.5) * (qindex - 43) / (16 - 43) - : 1.5 + (1.4 - 1.5) * (qindex - 43) / (128 - 43); - } + sum = od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h, + qindex, tmp, e_lp); } return sum; } +#endif // CONFIG_DAALA_DIST -int64_t av1_daala_dist(const uint8_t *src, int src_stride, const uint8_t *dst, - int dst_stride, int bsw, int bsh, int qm, - int use_activity_masking, int qindex) { +#if CONFIG_DIST_8X8 +#define NEW_FUTURE_DIST 0 +int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCKD *xd, + const uint8_t *src, int src_stride, const uint8_t *dst, + int dst_stride, const BLOCK_SIZE tx_bsize, int bsw, + int bsh, int visible_w, int visible_h, int qindex) { + int64_t d = 0; + +#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST int i, j; - int64_t d; - DECLARE_ALIGNED(16, od_coeff, orig[MAX_TX_SQUARE]); - DECLARE_ALIGNED(16, od_coeff, rec[MAX_TX_SQUARE]); - assert(qm == OD_HVS_QM); + DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, uint16_t, rec[MAX_TX_SQUARE]); + (void)cpi; + (void)tx_bsize; +#endif // CONFIG_DAALA_DIST || NEW_FUTURE_DIST + +#if !CONFIG_HIGHBITDEPTH + (void)xd; +#endif + +#if !CONFIG_DAALA_DIST + (void)qindex; +#endif + +#if !CONFIG_DAALA_DIST || !NEW_FUTURE_DIST + (void)xd; + (void)bsw, (void)bsh; + (void)visible_w, (void)visible_h; +#endif + +#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) + orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; + + if ((bsw == visible_w) && (bsh == visible_h)) { + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) + rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i]; + } else { + for (j = 0; j < visible_h; j++) + for (i = 0; i < visible_w; i++) + rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i]; - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i]; + if (visible_w < bsw) { + for (j = 0; j < bsh; j++) + for (i = visible_w; i < bsw; i++) + rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; + } - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i]; + if (visible_h < bsh) { + for (j = visible_h; j < bsh; j++) + for (i = 0; i < bsw; i++) + rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; + } + } + } else { +#endif + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i]; + + if ((bsw == visible_w) && (bsh == visible_h)) { + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i]; + } else { + for (j = 0; j < visible_h; j++) + for (i = 0; i < visible_w; i++) + rec[j * bsw + i] = dst[j * dst_stride + i]; + + if (visible_w < bsw) { + for (j = 0; j < bsh; j++) + for (i = visible_w; i < bsw; i++) + rec[j * bsw + i] = src[j * src_stride + i]; + } + + if (visible_h < bsh) { + for (j = visible_h; j < bsh; j++) + for (i = 0; i < bsw; i++) rec[j * bsw + i] = src[j * src_stride + i]; + } + } +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_DAALA_DIST || NEW_FUTURE_DIST + +#if CONFIG_DAALA_DIST + d = (int64_t)od_compute_dist(orig, rec, bsw, bsh, qindex); +#elif NEW_FUTURE_DIST + // Call new 8x8-wise distortion function here, for example + for (i = 0; i < bsh; i += 8) { + for (j = 0; j < bsw; j += 8) { + d += + av1_compute_dist_8x8(&orig[i * bsw + j], &rec[i * bsw + j], bsw, bsh); + } + } +#else + // Otherwise, MSE 
by default + unsigned sse; + // TODO(Any): Use even faster function which does not calculate variance + cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + d = sse; +#endif // CONFIG_DAALA_DIST - d = (int64_t)od_compute_dist(qm, use_activity_masking, orig, rec, bsw, bsh, - qindex); return d; } + +static int64_t av1_dist_8x8_diff(const MACROBLOCKD *xd, const uint8_t *src, + int src_stride, const int16_t *diff, + int diff_stride, int bsw, int bsh, + int visible_w, int visible_h, int qindex) { + int64_t d = 0; + +#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST + int i, j; + + DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]); + DECLARE_ALIGNED(16, int16_t, diff16[MAX_TX_SQUARE]); +#endif // CONFIG_DAALA_DIST || NEW_FUTURE_DIST + +#if !CONFIG_HIGHBITDEPTH + (void)xd; +#endif + +#if !CONFIG_DAALA_DIST + (void)qindex; +#endif + +#if !CONFIG_DAALA_DIST || !NEW_FUTURE_DIST + (void)xd; + (void)src, (void)src_stride; + (void)bsw, (void)bsh; + (void)visible_w, (void)visible_h; +#endif + +#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) + orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; + } else { +#endif + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i]; +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + + if ((bsw == visible_w) && (bsh == visible_h)) { + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) diff16[j * bsw + i] = diff[j * diff_stride + i]; + } else { + for (j = 0; j < visible_h; j++) + for (i = 0; i < visible_w; i++) + diff16[j * bsw + i] = diff[j * diff_stride + i]; + + if (visible_w < bsw) { + for (j = 0; j < bsh; j++) + for (i = visible_w; i < bsw; i++) diff16[j * bsw + i] = 0; + } + + if (visible_h < bsh) { + for (j = visible_h; j < bsh; j++) + for (i = 0; i < bsw; i++) diff16[j * bsw + i] = 0; + } + } +#endif // CONFIG_DAALA_DIST || NEW_FUTURE_DIST + +#if CONFIG_DAALA_DIST + d = (int64_t)od_compute_dist_diff(orig, diff16, bsw, bsh, qindex); +#elif NEW_FUTURE_DIST + // Call new 8x8-wise distortion function (with diff inpu) here, for example + for (i = 0; i < bsh; i += 8) { + for (j = 0; j < bsw; j += 8) { + d += av1_compute_dist_8x8_diff(&orig[i * bsw + j], &diff16[i * bsw + j], + bsw, bsh); + } + } +#else + // Otherwise, MSE by default + d = aom_sum_squares_2d_i16(diff, diff_stride, bsw, bsh); #endif // CONFIG_DAALA_DIST + return d; +} +#endif // CONFIG_DIST_8X8 + static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize, const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, @@ -892,11 +1265,11 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, for (plane = plane_from; plane <= plane_to; ++plane) { struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; -#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 +#if CONFIG_CHROMA_SUB8X8 const BLOCK_SIZE bs = AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); #else const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); -#endif // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 +#endif // CONFIG_CHROMA_SUB8X8 unsigned int sse; int rate; @@ -1068,7 +1441,7 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, #if !CONFIG_VAR_TX && !CONFIG_SUPERTX // Check for consistency of tx_size with mode info - assert(tx_size == get_tx_size(plane, xd)); + assert(tx_size == av1_get_tx_size(plane, xd)); #endif // 
!CONFIG_VAR_TX && !CONFIG_SUPERTX (void)cm; @@ -1144,10 +1517,12 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane, #endif // !CONFIG_LV_MAP int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, - int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order, - const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l, - int use_fast_coef_costing) { + int blk_row, int blk_col, int block, TX_SIZE tx_size, + const SCAN_ORDER *scan_order, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l, int use_fast_coef_costing) { #if !CONFIG_LV_MAP + (void)blk_row; + (void)blk_col; const AV1_COMMON *const cm = &cpi->common; return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l, use_fast_coef_costing); @@ -1158,13 +1533,11 @@ int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; const struct macroblockd_plane *pd = &xd->plane[plane]; const BLOCK_SIZE bsize = mbmi->sb_type; -#if CONFIG_CB4X4 -#if CONFIG_CHROMA_2X2 - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); -#else +#if CONFIG_CHROMA_SUB8X8 const BLOCK_SIZE plane_bsize = AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); -#endif // CONFIG_CHROMA_2X2 +#elif CONFIG_CB4X4 + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); #else // CONFIG_CB4X4 const BLOCK_SIZE plane_bsize = get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd); @@ -1172,7 +1545,8 @@ int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); - return av1_cost_coeffs_txb(cpi, x, plane, block, &txb_ctx); + return av1_cost_coeffs_txb(cpi, x, plane, blk_row, blk_col, block, tx_size, + &txb_ctx); #endif // !CONFIG_LV_MAP } #endif // !CONFIG_PVQ || CONFIG_VAR_TX @@ -1182,9 +1556,9 @@ static void get_txb_dimensions(const MACROBLOCKD *xd, int plane, BLOCK_SIZE plane_bsize, int blk_row, int blk_col, BLOCK_SIZE tx_bsize, int *width, int *height, int *visible_width, int *visible_height) { -#if !(CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT) +#if !(CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)) assert(tx_bsize <= plane_bsize); -#endif // !(CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT) +#endif int txb_height = block_size_high[tx_bsize]; int txb_width = block_size_wide[tx_bsize]; const int block_height = block_size_high[plane_bsize]; @@ -1208,19 +1582,31 @@ static void get_txb_dimensions(const MACROBLOCKD *xd, int plane, clamp(block_rows - (blk_row << tx_unit_size), 0, txb_height); } -// Compute the pixel domain sum square error on all visible 4x4s in the +// Compute the pixel domain distortion from src and dst on all visible 4x4s in +// the // transform block. 
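A minimal standalone sketch (illustrative only, not from the aom sources) of the visible-area idea used by the pixel_dist()/pixel_diff_dist() helpers below: a transform block overhanging the right or bottom frame edge only accumulates error over the pixels that are actually visible, so the loop bounds are clamped first. The helper name visible_sse and its flat parameter list are assumptions made for the sketch.

#include <stdint.h>

/* Sum of squared errors over the visible visible_w x visible_h region of a
 * transform block; pixels clipped off by the frame edge contribute nothing. */
static uint64_t visible_sse(const uint8_t *src, int src_stride,
                            const uint8_t *dst, int dst_stride,
                            int visible_w, int visible_h) {
  uint64_t sse = 0;
  for (int r = 0; r < visible_h; ++r) {
    for (int c = 0; c < visible_w; ++c) {
      const int d = src[r * src_stride + c] - dst[r * dst_stride + c];
      sse += (uint64_t)(d * d);
    }
  }
  return sse;
}

In the patch itself the clamped dimensions come from get_txb_dimensions(), and an 8x8-or-larger luma block is routed to av1_dist_8x8() instead of the plain SSE path.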
-static unsigned pixel_sse(const AV1_COMP *const cpi, const MACROBLOCKD *xd, - int plane, const uint8_t *src, const int src_stride, - const uint8_t *dst, const int dst_stride, int blk_row, - int blk_col, const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize) { +static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, + int plane, const uint8_t *src, const int src_stride, + const uint8_t *dst, const int dst_stride, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { int txb_rows, txb_cols, visible_rows, visible_cols; + const MACROBLOCKD *xd = &x->e_mbd; + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, &txb_cols, &txb_rows, &visible_cols, &visible_rows); assert(visible_rows > 0); assert(visible_cols > 0); -#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT + +#if CONFIG_DIST_8X8 + if (plane == 0 && txb_cols >= 8 && txb_rows >= 8) + return av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride, tx_bsize, + txb_cols, txb_rows, visible_cols, visible_rows, + x->qindex); +#endif // CONFIG_DIST_8X8 + +#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) if ((txb_rows == visible_rows && txb_cols == visible_cols) && tx_bsize < BLOCK_SIZES) { #else @@ -1242,36 +1628,86 @@ static unsigned pixel_sse(const AV1_COMP *const cpi, const MACROBLOCKD *xd, return sse; } -// Compute the squares sum squares on all visible 4x4s in the transform block. -static int64_t sum_squares_visible(const MACROBLOCKD *xd, int plane, - const int16_t *diff, const int diff_stride, - int blk_row, int blk_col, - const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize) { +// Compute the pixel domain distortion from diff on all visible 4x4s in the +// transform block. +static int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, + const int16_t *diff, const int diff_stride, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { int visible_rows, visible_cols; + const MACROBLOCKD *xd = &x->e_mbd; +#if CONFIG_DIST_8X8 + int txb_height = block_size_high[tx_bsize]; + int txb_width = block_size_wide[tx_bsize]; + const int src_stride = x->plane[plane].src.stride; + const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0]; + const uint8_t *src = &x->plane[plane].src.buf[src_idx]; +#endif + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, NULL, &visible_cols, &visible_rows); - return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows); + +#if CONFIG_DIST_8X8 + if (plane == 0 && txb_width >= 8 && txb_height >= 8) + return av1_dist_8x8_diff(xd, src, src_stride, diff, diff_stride, txb_width, + txb_height, visible_cols, visible_rows, x->qindex); + else +#endif + return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, + visible_rows); +} + +#if CONFIG_PALETTE || CONFIG_INTRABC +int av1_count_colors(const uint8_t *src, int stride, int rows, int cols) { + int val_count[256]; + memset(val_count, 0, sizeof(val_count)); + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + ++val_count[src[r * stride + c]]; + } + } + int n = 0; + for (int i = 0; i < 256; ++i) { + if (val_count[i]) ++n; + } + return n; } +#if CONFIG_HIGHBITDEPTH +int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, + int bit_depth) { + assert(bit_depth <= 12); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + int val_count[1 << 12]; + memset(val_count, 0, (1 << 12) * sizeof(val_count[0])); + for (int r = 0; r < rows; ++r) { + for 
(int c = 0; c < cols; ++c) { + ++val_count[src[r * stride + c]]; + } + } + int n = 0; + for (int i = 0; i < (1 << bit_depth); ++i) { + if (val_count[i]) ++n; + } + return n; +} +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_PALETTE || CONFIG_INTRABC + void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse, OUTPUT_STATUS output_status) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; -#if CONFIG_DAALA_DIST - int qm = OD_HVS_QM; - int use_activity_masking = 0; -#if CONFIG_PVQ - use_activity_masking = x->daala_enc.use_activity_masking; -#endif // CONFIG_PVQ +#if CONFIG_DIST_8X8 struct macroblockd_plane *const pd = &xd->plane[plane]; -#else // CONFIG_DAALA_DIST +#else // CONFIG_DIST_8X8 const struct macroblockd_plane *const pd = &xd->plane[plane]; -#endif // CONFIG_DAALA_DIST +#endif // CONFIG_DIST_8X8 - if (cpi->sf.use_transform_domain_distortion && !CONFIG_DAALA_DIST) { + if (cpi->sf.use_transform_domain_distortion && !CONFIG_DIST_8X8) { // Transform domain distortion computation is more efficient as it does // not involve an inverse transform, but it is less accurate. const int buffer_length = tx_size_2d[tx_size]; @@ -1292,19 +1728,21 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, &this_sse) >> shift; #endif // CONFIG_HIGHBITDEPTH -#elif CONFIG_HIGHBITDEPTH - const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8; - *out_dist = - av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, bd) >> - shift; -#else - *out_dist = - av1_block_error(coeff, dqcoeff, buffer_length, &this_sse) >> shift; +#else // !CONFIG_PVQ +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, + &this_sse, xd->bd) >> + shift; + else +#endif + *out_dist = + av1_block_error(coeff, dqcoeff, buffer_length, &this_sse) >> shift; #endif // CONFIG_PVQ *out_sse = this_sse >> shift; } else { const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; -#if !CONFIG_PVQ || CONFIG_DAALA_DIST +#if !CONFIG_PVQ || CONFIG_DIST_8X8 const int bsw = block_size_wide[tx_bsize]; const int bsh = block_size_high[tx_bsize]; #endif @@ -1323,34 +1761,13 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, assert(cpi != NULL); assert(tx_size_wide_log2[0] == tx_size_high_log2[0]); -#if CONFIG_DAALA_DIST - if (plane == 0 && bsw >= 8 && bsh >= 8) { - if (output_status == OUTPUT_HAS_DECODED_PIXELS) { - const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - const int16_t *pred = &pd->pred[pred_idx]; - int i, j; - DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]); - - for (j = 0; j < bsh; j++) - for (i = 0; i < bsw; i++) - pred8[j * bsw + i] = pred[j * pred_stride + i]; - *out_sse = av1_daala_dist(src, src_stride, pred8, bsw, bsw, bsh, qm, - use_activity_masking, x->qindex); - } else { - *out_sse = av1_daala_dist(src, src_stride, dst, dst_stride, bsw, bsh, - qm, use_activity_masking, x->qindex); - } - } else -#endif // CONFIG_DAALA_DIST { const int diff_stride = block_size_wide[plane_bsize]; const int diff_idx = (blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]; const int16_t *diff = &p->src_diff[diff_idx]; - *out_sse = sum_squares_visible(xd, plane, diff, diff_stride, blk_row, - blk_col, plane_bsize, tx_bsize); + *out_sse = 
pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col, + plane_bsize, tx_bsize); #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) *out_sse = ROUND_POWER_OF_TWO(*out_sse, (xd->bd - 8) * 2); @@ -1360,15 +1777,8 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, if (eob) { if (output_status == OUTPUT_HAS_DECODED_PIXELS) { -#if CONFIG_DAALA_DIST - if (plane == 0 && bsw >= 8 && bsh >= 8) - *out_dist = av1_daala_dist(src, src_stride, dst, dst_stride, bsw, bsh, - qm, use_activity_masking, x->qindex); - else -#endif // CONFIG_DAALA_DIST - *out_dist = - pixel_sse(cpi, xd, plane, src, src_stride, dst, dst_stride, - blk_row, blk_col, plane_bsize, tx_bsize); + *out_dist = pixel_dist(cpi, x, plane, src, src_stride, dst, dst_stride, + blk_row, blk_col, plane_bsize, tx_bsize); } else { #if CONFIG_HIGHBITDEPTH uint8_t *recon; @@ -1399,37 +1809,44 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, #endif // !CONFIG_PVQ const PLANE_TYPE plane_type = get_plane_type(plane); - TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); - - av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, recon, - MAX_TX_SIZE, eob); - -#if CONFIG_DAALA_DIST - if (plane == 0 && bsw >= 8 && bsh >= 8) { - *out_dist = av1_daala_dist(src, src_stride, recon, MAX_TX_SIZE, bsw, - bsh, qm, use_activity_masking, x->qindex); - } else { - if (plane == 0) { - // Save decoded pixels for inter block in pd->pred to avoid - // block_8x8_rd_txfm_daala_dist() need to produce them - // by calling av1_inverse_transform_block() again. - const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - int16_t *pred = &pd->pred[pred_idx]; - int i, j; + TX_TYPE tx_type = + av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); + av1_inverse_transform_block(xd, dqcoeff, +#if CONFIG_LGT + xd->mi[0]->mbmi.mode, +#endif + tx_type, tx_size, recon, MAX_TX_SIZE, eob); + +#if CONFIG_DIST_8X8 + if (plane == 0 && (bsw < 8 || bsh < 8)) { + // Save decoded pixels for inter block in pd->pred to avoid + // block_8x8_rd_txfm_daala_dist() need to produce them + // by calling av1_inverse_transform_block() again. 
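An aside on the ROUND_POWER_OF_TWO(*out_sse, (xd->bd - 8) * 2) step above (the sketch below is illustrative, not aom code): at bit depth bd each residual sample can be roughly 2^(bd-8) times larger than its 8-bit equivalent, so its square grows by 2^(2*(bd-8)); shifting the SSE back down keeps high-bit-depth distortion on the scale the rate-distortion multiplier was tuned for. A simplified local helper stands in for the libaom macro:

#include <stdint.h>

/* Round-to-nearest right shift, standing in for ROUND_POWER_OF_TWO. */
static uint64_t round_shift(uint64_t value, int n) {
  return (value + ((uint64_t)1 << (n - 1))) >> n;
}

/* Bring a high-bit-depth SSE back to the 8-bit scale. */
static uint64_t normalize_hbd_sse(uint64_t sse, int bit_depth) {
  const int shift = (bit_depth - 8) * 2; /* 0, 4 or 8 for 8/10/12-bit input */
  return shift > 0 ? round_shift(sse, shift) : sse;
}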
+ const int pred_stride = block_size_wide[plane_bsize]; + const int pred_idx = (blk_row * pred_stride + blk_col) + << tx_size_wide_log2[0]; + int16_t *pred = &pd->pred[pred_idx]; + int i, j; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < bsh; j++) + for (i = 0; i < bsw; i++) + pred[j * pred_stride + i] = + CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i]; + } else { +#endif for (j = 0; j < bsh; j++) for (i = 0; i < bsw; i++) pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i]; +#if CONFIG_HIGHBITDEPTH } -#endif // CONFIG_DAALA_DIST - *out_dist = - pixel_sse(cpi, xd, plane, src, src_stride, recon, MAX_TX_SIZE, - blk_row, blk_col, plane_bsize, tx_bsize); -#if CONFIG_DAALA_DIST +#endif // CONFIG_HIGHBITDEPTH } -#endif // CONFIG_DAALA_DIST +#endif // CONFIG_DIST_8X8 + *out_dist = + pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, + blk_row, blk_col, plane_bsize, tx_bsize); } *out_dist *= 16; } else { @@ -1453,33 +1870,25 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, int64_t rd1, rd2, rd; RD_STATS this_rd_stats; - assert(tx_size == get_tx_size(plane, xd)); +#if !CONFIG_SUPERTX && !CONFIG_VAR_TX + assert(tx_size == av1_get_tx_size(plane, xd)); +#endif // !CONFIG_SUPERTX av1_init_rd_stats(&this_rd_stats); if (args->exit_early) return; if (!is_inter_block(mbmi)) { -#if CONFIG_CFL - -#if CONFIG_EC_ADAPT - FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; -#else - FRAME_CONTEXT *const ec_ctx = cm->fc; -#endif // CONFIG_EC_ADAPT - - av1_predict_intra_block_encoder_facade(x, ec_ctx, plane, block, blk_col, - blk_row, tx_size, plane_bsize); -#else av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size); -#endif #if CONFIG_DPCM_INTRA const int block_raster_idx = av1_block_index_to_raster_order(tx_size, block); - const PREDICTION_MODE mode = - (plane == 0) ? get_y_mode(xd->mi[0], block_raster_idx) : mbmi->uv_mode; - TX_TYPE tx_type = get_tx_type((plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV, - xd, block, tx_size); + const PREDICTION_MODE mode = (plane == AOM_PLANE_Y) + ? get_y_mode(xd->mi[0], block_raster_idx) + : get_uv_mode(mbmi->uv_mode); + TX_TYPE tx_type = + av1_get_tx_type((plane == AOM_PLANE_Y) ? 
PLANE_TYPE_Y : PLANE_TYPE_UV, + xd, blk_row, blk_col, block, tx_size); if (av1_use_dpcm_intra(plane, mode, tx_type, mbmi)) { int8_t skip; av1_encode_block_intra_dpcm(cm, x, mode, plane, block, blk_row, blk_col, @@ -1496,9 +1905,36 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, #if !CONFIG_TXK_SEL // full forward transform and quantization const int coeff_ctx = combine_entropy_contexts(*a, *l); +#if DISABLE_TRELLISQ_SEARCH + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + coeff_ctx, AV1_XFORM_QUANT_B); +#else av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - av1_optimize_b(cm, x, plane, block, plane_bsize, tx_size, a, l); + + const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; + tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); + const int buffer_length = tx_size_2d[tx_size]; + int64_t tmp_dist; + int64_t tmp; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + tmp_dist = + av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd) >> + shift; + else +#endif + tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp) >> shift; + + if (RDCOST(x->rdmult, 0, tmp_dist) + args->this_rd < args->best_rd) { + av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, + a, l); + } else { + args->exit_early = 1; + return; + } +#endif // DISABLE_TRELLISQ_SEARCH if (!is_inter_block(mbmi)) { struct macroblock_plane *const p = &x->plane[plane]; @@ -1518,25 +1954,27 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, const int dst_stride = pd->dst.stride; uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; - cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size); + // TODO (ltrudeau) Store sub-8x8 inter blocks when bottom right block is + // intra predicted. 
+ cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size, plane_bsize); } #endif #if CONFIG_DPCM_INTRA CALCULATE_RD : {} #endif // CONFIG_DPCM_INTRA - rd = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.dist); + rd = RDCOST(x->rdmult, 0, this_rd_stats.dist); if (args->this_rd + rd > args->best_rd) { args->exit_early = 1; return; } #if !CONFIG_PVQ const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); - const SCAN_ORDER *scan_order = - get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + const TX_TYPE tx_type = + av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); + const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, mbmi); this_rd_stats.rate = - av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order, a, l, - args->use_fast_coef_costing); + av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, tx_size, + scan_order, a, l, args->use_fast_coef_costing); #else // !CONFIG_PVQ this_rd_stats.rate = x->rate; #endif // !CONFIG_PVQ @@ -1554,22 +1992,12 @@ CALCULATE_RD : {} av1_set_txb_context(x, plane, block, tx_size, a, l); #endif // !CONFIG_PVQ - rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist); - rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse); + rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse); // TODO(jingning): temporarily enabled only for luma component rd = AOMMIN(rd1, rd2); -#if CONFIG_DAALA_DIST - if (plane == 0 && plane_bsize >= BLOCK_8X8 && - (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) { - this_rd_stats.dist = 0; - this_rd_stats.sse = 0; - rd = 0; - x->rate_4x4[block] = this_rd_stats.rate; - } -#endif // CONFIG_DAALA_DIST - #if !CONFIG_PVQ this_rd_stats.skip &= !x->plane[plane].eobs[block]; #else @@ -1579,113 +2007,93 @@ CALCULATE_RD : {} args->this_rd += rd; - if (args->this_rd > args->best_rd) { - args->exit_early = 1; - return; +#if CONFIG_DIST_8X8 + if (!(plane == 0 && plane_bsize >= BLOCK_8X8 && + (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))) { +#endif + if (args->this_rd > args->best_rd) { + args->exit_early = 1; + return; + } +#if CONFIG_DIST_8X8 } +#endif } -#if CONFIG_DAALA_DIST -static void block_8x8_rd_txfm_daala_dist(int plane, int block, int blk_row, - int blk_col, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - struct rdcost_block_args *args = arg; - MACROBLOCK *const x = args->x; +#if CONFIG_DIST_8X8 +static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, + struct rdcost_block_args *args) { MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[0]; + const struct macroblock_plane *const p = &x->plane[0]; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - int64_t rd, rd1, rd2; - RD_STATS this_rd_stats; - int qm = OD_HVS_QM; - int use_activity_masking = 0; - - (void)tx_size; - - assert(plane == 0); - assert(plane_bsize >= BLOCK_8X8); -#if CONFIG_PVQ - use_activity_masking = x->daala_enc.use_activity_masking; -#endif // CONFIG_PVQ - av1_init_rd_stats(&this_rd_stats); + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + const uint8_t *src = &p->src.buf[0]; + const uint8_t *dst = &pd->dst.buf[0]; + const int16_t *pred = &pd->pred[0]; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; - if (args->exit_early) return; + int i, j; + int64_t rd, rd1, rd2; + unsigned int tmp1, tmp2; + int qindex = 
x->qindex; - { - const struct macroblock_plane *const p = &x->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; + assert((bw & 0x07) == 0); + assert((bh & 0x07) == 0); - const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; - const int diff_stride = block_size_wide[plane_bsize]; +#if CONFIG_HIGHBITDEPTH + uint8_t *pred8; + DECLARE_ALIGNED(16, uint16_t, pred16[MAX_TX_SQUARE]); - const uint8_t *src = - &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; - const uint8_t *dst = - &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + pred8 = CONVERT_TO_BYTEPTR(pred16); + else + pred8 = (uint8_t *)pred16; +#else + DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]); +#endif // CONFIG_HIGHBITDEPTH - unsigned int tmp1, tmp2; - int qindex = x->qindex; - const int pred_stride = block_size_wide[plane_bsize]; - const int pred_idx = (blk_row * pred_stride + blk_col) - << tx_size_wide_log2[0]; - int16_t *pred = &pd->pred[pred_idx]; - int i, j; - const int tx_blk_size = 8; - - DECLARE_ALIGNED(16, uint8_t, pred8[8 * 8]); - - for (j = 0; j < tx_blk_size; j++) - for (i = 0; i < tx_blk_size; i++) - pred8[j * tx_blk_size + i] = pred[j * diff_stride + i]; - - tmp1 = av1_daala_dist(src, src_stride, pred8, tx_blk_size, 8, 8, qm, - use_activity_masking, qindex); - tmp2 = av1_daala_dist(src, src_stride, dst, dst_stride, 8, 8, qm, - use_activity_masking, qindex); - - if (!is_inter_block(mbmi)) { - this_rd_stats.sse = (int64_t)tmp1 * 16; - this_rd_stats.dist = (int64_t)tmp2 * 16; - } else { - // For inter mode, the decoded pixels are provided in pd->pred, - // while the predicted pixels are in dst. - this_rd_stats.sse = (int64_t)tmp2 * 16; - this_rd_stats.dist = (int64_t)tmp1 * 16; - } +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < bh; j++) + for (i = 0; i < bw; i++) + CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i]; + } else { +#endif + for (j = 0; j < bh; j++) + for (i = 0; i < bw; i++) pred8[j * bw + i] = pred[j * bw + i]; +#if CONFIG_HIGHBITDEPTH } +#endif // CONFIG_HIGHBITDEPTH - rd = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.dist); - if (args->this_rd + rd > args->best_rd) { - args->exit_early = 1; - return; + tmp1 = av1_dist_8x8(cpi, xd, src, src_stride, pred8, bw, bsize, bw, bh, bw, + bh, qindex); + tmp2 = av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride, bsize, bw, bh, + bw, bh, qindex); + + if (!is_inter_block(mbmi)) { + args->rd_stats.sse = (int64_t)tmp1 * 16; + args->rd_stats.dist = (int64_t)tmp2 * 16; + } else { + // For inter mode, the decoded pixels are provided in pd->pred, + // while the predicted pixels are in dst. + args->rd_stats.sse = (int64_t)tmp2 * 16; + args->rd_stats.dist = (int64_t)tmp1 * 16; } - { - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - const uint8_t txw_unit = tx_size_wide_unit[tx_size]; - const uint8_t txh_unit = tx_size_high_unit[tx_size]; - const int step = txw_unit * txh_unit; - int offset_h = tx_size_high_unit[TX_4X4]; - // The rate of the current 8x8 block is the sum of four 4x4 blocks in it. 
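The rd1/rd2/AOMMIN pattern in this and the surrounding hunks compares the cost of coding the residual against the cost of skipping it (zero rate, distortion equal to the prediction SSE). Below is a hedged sketch with a plain Lagrangian cost in place of libaom's fixed-point RDCOST macro; the names rd_cost and best_of_code_or_skip are invented for the illustration.

#include <stdint.h>

/* Simplified Lagrangian cost: lambda * rate + distortion. */
static int64_t rd_cost(int64_t lambda, int rate, int64_t dist) {
  return lambda * rate + dist;
}

/* Pick the cheaper of coding the block or skipping it entirely. */
static int64_t best_of_code_or_skip(int64_t lambda, int rate, int64_t dist,
                                    int64_t sse) {
  const int64_t rd_coded = rd_cost(lambda, rate, dist); /* send coefficients */
  const int64_t rd_skip = rd_cost(lambda, 0, sse);      /* keep prediction */
  return rd_coded < rd_skip ? rd_coded : rd_skip;
}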
- this_rd_stats.rate = - x->rate_4x4[block - max_blocks_wide * offset_h - step] + - x->rate_4x4[block - max_blocks_wide * offset_h] + - x->rate_4x4[block - step] + x->rate_4x4[block]; - } - rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist); - rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse); + rd1 = RDCOST(x->rdmult, args->rd_stats.rate, args->rd_stats.dist); + rd2 = RDCOST(x->rdmult, 0, args->rd_stats.sse); rd = AOMMIN(rd1, rd2); - args->rd_stats.dist += this_rd_stats.dist; - args->rd_stats.sse += this_rd_stats.sse; - - args->this_rd += rd; + args->rd_stats.rdcost = rd; + args->this_rd = rd; - if (args->this_rd > args->best_rd) { - args->exit_early = 1; - return; - } + if (args->this_rd > args->best_rd) args->exit_early = 1; } -#endif // CONFIG_DAALA_DIST +#endif // CONFIG_DIST_8X8 static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, RD_STATS *rd_stats, int64_t ref_best_rd, int plane, @@ -1705,15 +2113,13 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left); -#if CONFIG_DAALA_DIST - if (plane == 0 && bsize >= BLOCK_8X8 && + av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm, + &args); +#if CONFIG_DIST_8X8 + if (!args.exit_early && plane == 0 && bsize >= BLOCK_8X8 && (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) - av1_foreach_8x8_transformed_block_in_yplane( - xd, bsize, block_rd_txfm, block_8x8_rd_txfm_daala_dist, &args); - else -#endif // CONFIG_DAALA_DIST - av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm, - &args); + dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args); +#endif if (args.exit_early) { av1_invalid_rd_stats(rd_stats); @@ -1768,8 +2174,14 @@ static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x, const MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const int tx_select = - cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8; + const int tx_select = cm->tx_mode == TX_MODE_SELECT && +#if CONFIG_EXT_PARTITION_TYPES + // Currently these block shapes can only use 4x4 + // transforms + mbmi->sb_type != BLOCK_4X16 && + mbmi->sb_type != BLOCK_16X4 && +#endif + mbmi->sb_type >= BLOCK_8X8; if (tx_select) { const int is_inter = is_inter_block(mbmi); @@ -1779,11 +2191,11 @@ static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x, const int depth = tx_size_to_depth(coded_tx_size); const int tx_size_ctx = get_tx_size_context(xd); int r_tx_size = cpi->tx_size_cost[tx_size_cat][tx_size_ctx][depth]; -#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX) if (is_quarter_tx_allowed(xd, mbmi, is_inter) && tx_size != coded_tx_size) r_tx_size += av1_cost_bit(cm->fc->quarter_tx_size_prob, tx_size == quarter_txsize_lookup[bsize]); -#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT +#endif return r_tx_size; } else { return 0; @@ -1796,6 +2208,10 @@ int av1_tx_type_cost(const AV1_COMP *cpi, const MACROBLOCKD *xd, TX_TYPE tx_type) { if (plane > 0) return 0; +#if CONFIG_VAR_TX + tx_size = get_min_tx_size(tx_size); +#endif + const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; const int is_inter = is_inter_block(mbmi); #if CONFIG_EXT_TX @@ -1844,6 +2260,9 @@ static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, const int r_tx_size = tx_size_cost(cpi, x, bs, tx_size); +#if CONFIG_PVQ + assert(tx_size >= TX_4X4); +#endif // CONFIG_PVQ assert(skip_prob > 
0); #if CONFIG_EXT_TX && CONFIG_RECT_TX assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs))); @@ -1864,21 +2283,20 @@ static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, if (rd_stats->skip) { if (is_inter) { - rd = RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse); + rd = RDCOST(x->rdmult, s1, rd_stats->sse); } else { - rd = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size * tx_select, - rd_stats->sse); + rd = RDCOST(x->rdmult, s1 + r_tx_size * tx_select, rd_stats->sse); } } else { - rd = RDCOST(x->rdmult, x->rddiv, - rd_stats->rate + s0 + r_tx_size * tx_select, rd_stats->dist); + rd = RDCOST(x->rdmult, rd_stats->rate + s0 + r_tx_size * tx_select, + rd_stats->dist); } if (tx_select) rd_stats->rate += r_tx_size; if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !(rd_stats->skip)) - rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse)); + rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse)); return rd; } @@ -1895,6 +2313,11 @@ static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs, // transforms should be considered for pruning prune = prune_tx_types(cpi, bs, x, xd, -1); +#if CONFIG_MRC_TX + // MRC_DCT only implemented for TX_32X32 so only include this tx in + // the search for TX_32X32 + if (tx_type == MRC_DCT && tx_size != TX_32X32) return 1; +#endif // CONFIG_MRC_TX if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) return 1; if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, tx_size)) return 1; @@ -1929,7 +2352,8 @@ static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs, return 0; } -#if CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT) +#if CONFIG_EXT_INTER && \ + (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA) static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, MACROBLOCK *x, int *r, int64_t *d, int *s, int64_t *sse, int64_t ref_best_rd) { @@ -2020,14 +2444,13 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, av1_tx_type_cost(cpi, xd, bs, plane, mbmi->tx_size, tx_type); if (this_rd_stats.skip) - this_rd = RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse); + this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse); else - this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + s0, - this_rd_stats.dist); + this_rd = + RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist); if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] && !this_rd_stats.skip) - this_rd = - AOMMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse)); + this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse)); if (this_rd < best_rd) { best_rd = this_rd; @@ -2068,13 +2491,12 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, continue; } if (this_rd_stats.skip) - this_rd = RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse); + this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse); else - this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + s0, - this_rd_stats.dist); - if (is_inter && !xd->lossless[mbmi->segment_id] && !this_rd_stats.skip) this_rd = - AOMMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse)); + RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist); + if (is_inter && !xd->lossless[mbmi->segment_id] && !this_rd_stats.skip) + this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse)); if (this_rd < best_rd) { best_rd = this_rd; @@ -2129,7 +2551,6 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, TX_TYPE best_tx_type = 
DCT_DCT; #if CONFIG_TXK_SEL TX_TYPE best_txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; - const int num_blk = bsize_to_num_blk(bs); #endif // CONFIG_TXK_SEL const int tx_select = cm->tx_mode == TX_MODE_SELECT; const int is_inter = is_inter_block(mbmi); @@ -2171,8 +2592,7 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, rect_tx_size); if (rd < best_rd) { #if CONFIG_TXK_SEL - memcpy(best_txk_type, mbmi->txk_type, - sizeof(best_txk_type[0]) * num_blk); + memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type[0]) * 256); #endif best_tx_type = tx_type; best_tx_size = rect_tx_size; @@ -2278,8 +2698,7 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, last_rd = rd; if (rd < best_rd) { #if CONFIG_TXK_SEL - memcpy(best_txk_type, mbmi->txk_type, - sizeof(best_txk_type[0]) * num_blk); + memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type[0]) * 256); #endif best_tx_type = tx_type; best_tx_size = n; @@ -2295,7 +2714,7 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, mbmi->tx_size = best_tx_size; mbmi->tx_type = best_tx_type; #if CONFIG_TXK_SEL - memcpy(mbmi->txk_type, best_txk_type, sizeof(best_txk_type[0]) * num_blk); + memcpy(mbmi->txk_type, best_txk_type, sizeof(best_txk_type[0]) * 256); #endif #if CONFIG_VAR_TX @@ -2366,21 +2785,7 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, int block = 0; for (row = 0; row < max_blocks_high; row += stepr) { for (col = 0; col < max_blocks_wide; col += stepc) { -#if CONFIG_CFL - const struct macroblockd_plane *const pd = &xd->plane[0]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); - -#if CONFIG_EC_ADAPT - FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; -#else - FRAME_CONTEXT *const ec_ctx = cpi->common.fc; -#endif // CONFIG_EC_ADAPT - - av1_predict_intra_block_encoder_facade(x, ec_ctx, 0, block, col, row, - tx_size, plane_bsize); -#else av1_predict_intra_block_facade(xd, 0, block, col, row, tx_size); -#endif block += step; } } @@ -2388,7 +2793,8 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse); #if CONFIG_EXT_INTRA - if (av1_is_directional_mode(mbmi->mode, bsize)) { + if (av1_is_directional_mode(mbmi->mode, bsize) && + av1_use_angle_delta(bsize)) { mode_cost += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, MAX_ANGLE_DELTA + mbmi->angle_delta[0]); } @@ -2405,8 +2811,8 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, } } #endif // CONFIG_FILTER_INTRA - this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + mode_cost, - this_rd_stats.dist); + this_rd = + RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist); return this_rd; } @@ -2620,7 +3026,7 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); if (tokenonly_rd_stats.rate == INT_MAX) continue; this_rate = tokenonly_rd_stats.rate + palette_mode_cost; - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (!xd->lossless[mbmi->segment_id] && mbmi->sb_type >= BLOCK_8X8) { tokenonly_rd_stats.rate -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); } @@ -2773,15 +3179,17 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( src_stride, dst, dst_stride, xd->bd); #endif if (is_lossless) { - TX_TYPE tx_type = 
get_tx_type(PLANE_TYPE_Y, xd, block, tx_size); - const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0); + TX_TYPE tx_type = + av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size); + const SCAN_ORDER *scan_order = + get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); #if !CONFIG_PVQ av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - ratey += av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, - tempa + idx, templ + idy, + ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, + scan_order, tempa + idx, templ + idy, cpi->sf.use_fast_coef_costing); skip = (p->eobs[block] == 0); can_skip &= skip; @@ -2806,28 +3214,38 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( templ[idy] = !skip; can_skip &= skip; #endif - if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) + if (RDCOST(x->rdmult, ratey, distortion) >= best_rd) goto next_highbd; #if CONFIG_PVQ if (!skip) #endif av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), +#if CONFIG_LGT + mode, +#endif DCT_DCT, tx_size, dst, dst_stride, p->eobs[block]); } else { int64_t dist; unsigned int tmp; - TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size); - const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0); + TX_TYPE tx_type = + av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size); + const SCAN_ORDER *scan_order = + get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); #if !CONFIG_PVQ +#if DISABLE_TRELLISQ_SEARCH + av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, + tx_size, coeff_ctx, AV1_XFORM_QUANT_B); +#else av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); - av1_optimize_b(cm, x, 0, block, BLOCK_8X8, tx_size, tempa + idx, - templ + idy); - ratey += av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, - tempa + idx, templ + idy, + av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size, + tempa + idx, templ + idy); +#endif // DISABLE_TRELLISQ_SEARCH + ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, + scan_order, tempa + idx, templ + idy, cpi->sf.use_fast_coef_costing); skip = (p->eobs[block] == 0); can_skip &= skip; @@ -2855,19 +3273,22 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( if (!skip) #endif av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), +#if CONFIG_LGT + mode, +#endif tx_type, tx_size, dst, dst_stride, p->eobs[block]); cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp); dist = (int64_t)tmp << 4; distortion += dist; - if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) + if (RDCOST(x->rdmult, ratey, distortion) >= best_rd) goto next_highbd; } } } rate += ratey; - this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + this_rd = RDCOST(x->rdmult, rate, distortion); if (this_rd < best_rd) { *bestrate = rate; @@ -2966,14 +3387,24 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( aom_subtract_block(tx_height, tx_width, src_diff, 8, src, src_stride, dst, dst_stride); #endif // !CONFIG_PVQ - - TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size); - const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0); + TX_TYPE tx_type = + av1_get_tx_type(PLANE_TYPE_Y, xd, 0, 0, block, tx_size); + const SCAN_ORDER *scan_order = + get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); const int coeff_ctx = 
combine_entropy_contexts(tempa[idx], templ[idy]); #if CONFIG_CB4X4 block = 4 * block; #endif // CONFIG_CB4X4 -#if !CONFIG_PVQ +#if !CONFIG_PVQ +#if DISABLE_TRELLISQ_SEARCH + av1_xform_quant(cm, x, 0, block, +#if CONFIG_CB4X4 + 2 * (row + idy), 2 * (col + idx), +#else + row + idy, col + idx, +#endif // CONFIG_CB4X4 + BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_B); +#else const AV1_XFORM_QUANT xform_quant = is_lossless ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP; av1_xform_quant(cm, x, 0, block, @@ -2984,12 +3415,12 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( #endif // CONFIG_CB4X4 BLOCK_8X8, tx_size, coeff_ctx, xform_quant); - av1_optimize_b(cm, x, 0, block, BLOCK_8X8, tx_size, tempa + idx, + av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size, tempa + idx, templ + idy); - - ratey += - av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, tempa + idx, - templ + idy, cpi->sf.use_fast_coef_costing); +#endif // DISABLE_TRELLISQ_SEARCH + ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, scan_order, + tempa + idx, templ + idy, + cpi->sf.use_fast_coef_costing); skip = (p->eobs[block] == 0); can_skip &= skip; tempa[idx] = !skip; @@ -3028,6 +3459,9 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( if (!skip) #endif // CONFIG_PVQ av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), +#if CONFIG_LGT + mode, +#endif tx_type, tx_size, dst, dst_stride, p->eobs[block]); unsigned int tmp; @@ -3036,14 +3470,16 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( distortion += dist; } - if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) - goto next; + if (RDCOST(x->rdmult, ratey, distortion) >= best_rd) goto next; if (is_lossless) { // Calculate inverse txfm *after* RD cost. #if CONFIG_PVQ if (!skip) #endif // CONFIG_PVQ av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), +#if CONFIG_LGT + mode, +#endif DCT_DCT, tx_size, dst, dst_stride, p->eobs[block]); } @@ -3051,7 +3487,7 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( } rate += ratey; - this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + this_rd = RDCOST(x->rdmult, rate, distortion); if (this_rd < best_rd) { *bestrate = rate; @@ -3153,9 +3589,9 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, cpi, mb, idy, idx, &best_mode, bmode_costs, xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r, &ry, &d, bsize, tx_size, y_skip, best_rd - total_rd); -#if !CONFIG_DAALA_DIST +#if !CONFIG_DIST_8X8 if (this_rd >= best_rd - total_rd) return INT64_MAX; -#endif // !CONFIG_DAALA_DIST +#endif // !CONFIG_DIST_8X8 total_rd += this_rd; cost += r; total_distortion += d; @@ -3172,7 +3608,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, } mbmi->mode = mic->bmi[3].as_mode; -#if CONFIG_DAALA_DIST +#if CONFIG_DIST_8X8 { const struct macroblock_plane *p = &mb->plane[0]; const struct macroblockd_plane *pd = &xd->plane[0]; @@ -3180,18 +3616,16 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, const int dst_stride = pd->dst.stride; uint8_t *src = p->src.buf; uint8_t *dst = pd->dst.buf; - int use_activity_masking = 0; - int qm = OD_HVS_QM; #if CONFIG_PVQ use_activity_masking = mb->daala_enc.use_activity_masking; #endif // CONFIG_PVQ // Daala-defined distortion computed for the block of 8x8 pixels - total_distortion = av1_daala_dist(src, src_stride, dst, dst_stride, 8, 8, - qm, use_activity_masking, mb->qindex) + total_distortion = av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride, + BLOCK_8X8, 8, 
8, 8, 8, mb->qindex) << 4; } -#endif // CONFIG_DAALA_DIST +#endif // CONFIG_DIST_8X8 // Add in the cost of the transform type if (!is_lossless) { int rate_tx_type = 0; @@ -3218,7 +3652,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, *rate_y = tot_rate_y; *distortion = total_distortion; - return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion); + return RDCOST(mb->rdmult, cost, total_distortion); } #if CONFIG_FILTER_INTRA @@ -3261,7 +3695,7 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, this_rate = tokenonly_rd_stats.rate + av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 1) + write_uniform_cost(FILTER_INTRA_MODES, mode) + mode_cost; - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; @@ -3321,7 +3755,7 @@ static int64_t calc_rd_given_intra_angle( this_rate = tokenonly_rd_stats.rate + mode_cost + write_uniform_cost(2 * max_angle_delta + 1, mbmi->angle_delta[0] + max_angle_delta); - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; @@ -3496,8 +3930,8 @@ static void angle_estimation(const uint8_t *src, int src_stride, int rows, uint8_t *directional_mode_skip_mask) { memset(directional_mode_skip_mask, 0, INTRA_MODES * sizeof(*directional_mode_skip_mask)); - // Sub-8x8 blocks do not use extra directions. - if (bsize < BLOCK_8X8) return; + // Check if angle_delta is used + if (!av1_use_angle_delta(bsize)) return; uint64_t hist[DIRECTIONAL_MODES]; memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0])); src += src_stride; @@ -3551,8 +3985,8 @@ static void highbd_angle_estimation(const uint8_t *src8, int src_stride, uint8_t *directional_mode_skip_mask) { memset(directional_mode_skip_mask, 0, INTRA_MODES * sizeof(*directional_mode_skip_mask)); - // Sub-8x8 blocks do not use extra directions. 
- if (bsize < BLOCK_8X8) return; + // Check if angle_delta is used + if (!av1_use_angle_delta(bsize)) return; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint64_t hist[DIRECTIONAL_MODES]; memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0])); @@ -3608,7 +4042,6 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize, int64_t best_rd) { - uint8_t mode_idx; MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mic = xd->mi[0]; MB_MODE_INFO *const mbmi = &mic->mbmi; @@ -3683,7 +4116,7 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, x->use_default_intra_tx_type = 0; /* Y Search for intra prediction mode */ - for (mode_idx = DC_PRED; mode_idx <= FINAL_MODE_SEARCH; ++mode_idx) { + for (int mode_idx = DC_PRED; mode_idx <= FINAL_MODE_SEARCH; ++mode_idx) { RD_STATS this_rd_stats; int this_rate, this_rate_tokenonly, s; int64_t this_distortion, this_rd, this_model_rd; @@ -3692,7 +4125,8 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, mbmi->mode = best_mbmi.mode; x->use_default_intra_tx_type = 0; } else { - mbmi->mode = mode_idx; + assert(mode_idx < INTRA_MODES); + mbmi->mode = intra_rd_search_mode_order[mode_idx]; } #if CONFIG_PVQ od_encode_rollback(&x->daala_enc, &pre_buf); @@ -3708,7 +4142,7 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, #if CONFIG_EXT_INTRA is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize); if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; - if (is_directional_mode) { + if (is_directional_mode && av1_use_angle_delta(bsize)) { this_rd_stats.rate = INT_MAX; rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize, bmode_costs[mbmi->mode], best_rd, &best_model_rd); @@ -3754,11 +4188,13 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, this_rate += cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter]; #endif // CONFIG_INTRA_INTERP - this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[0]); + if (av1_use_angle_delta(bsize)) { + this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, + MAX_ANGLE_DELTA + mbmi->angle_delta[0]); + } } #endif // CONFIG_EXT_INTRA - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); + this_rd = RDCOST(x->rdmult, this_rate, this_distortion); #if CONFIG_FILTER_INTRA if (best_rd == INT64_MAX || this_rd - best_rd < (best_rd >> 4)) { filter_intra_mode_skip_mask ^= (1 << mbmi->mode); @@ -3785,16 +4221,6 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, od_encode_rollback(&x->daala_enc, &post_buf); #endif // CONFIG_PVQ -#if CONFIG_CFL - // Perform one extra txfm_rd_in_plane() call, this time with the best value so - // we can store reconstructed luma values - RD_STATS this_rd_stats; - x->cfl_store_y = 1; - txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, 0, bsize, - mic->mbmi.tx_size, cpi->sf.use_fast_coef_costing); - x->cfl_store_y = 0; -#endif - #if CONFIG_PALETTE if (try_palette) { rd_pick_palette_intra_sby(cpi, x, bsize, palette_y_mode_ctx, @@ -3826,7 +4252,7 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t ref_best_rd) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]); + const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]); 
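A condensed sketch of the control flow super_block_uvrd() implements in this hunk: accumulate per-plane chroma RD stats and stop as soon as even the optimistic skip cost (zero rate, distortion equal to sse) already exceeds the best RD found so far. The rd_stats_t type, lambda model and function name below are simplified stand-ins, not libaom API.

#include <stdint.h>

typedef struct { int rate; int64_t dist; int64_t sse; } rd_stats_t;

/* Returns 1 while the accumulated chroma cost can still beat ref_best_rd,
 * 0 once it cannot (mirroring the is_cost_valid flag in the real code). */
static int accumulate_uv_rd(const rd_stats_t *per_plane, int num_planes,
                            int64_t lambda, int64_t ref_best_rd,
                            rd_stats_t *total) {
  total->rate = 0;
  total->dist = 0;
  total->sse = 0;
  for (int p = 0; p < num_planes; ++p) {
    total->rate += per_plane[p].rate;
    total->dist += per_plane[p].dist;
    total->sse += per_plane[p].sse;
    if (lambda * total->rate + total->dist > ref_best_rd &&
        total->sse > ref_best_rd)
      return 0;
  }
  return 1;
}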
int plane; int is_cost_valid = 1; av1_init_rd_stats(rd_stats); @@ -3857,9 +4283,8 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, break; } av1_merge_rd_stats(rd_stats, &pn_rd_stats); - if (RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist) > - ref_best_rd && - RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse) > ref_best_rd) { + if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) > ref_best_rd && + RDCOST(x->rdmult, 0, rd_stats->sse) > ref_best_rd) { is_cost_valid = 0; break; } @@ -3875,13 +4300,6 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, } #if CONFIG_VAR_TX -// FIXME crop these calls -static uint64_t sum_squares_2d(const int16_t *diff, int diff_stride, - TX_SIZE tx_size) { - return aom_sum_squares_2d_i16(diff, diff_stride, tx_size_wide[tx_size], - tx_size_high[tx_size]); -} - void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, int blk_row, int blk_col, int plane, int block, int plane_bsize, const ENTROPY_CONTEXT *a, @@ -3890,18 +4308,23 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, MACROBLOCKD *xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; + +#if CONFIG_TXK_SEL + av1_search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, + tx_size, a, l, 0, rd_stats); + return; +#endif + int64_t tmp; tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); PLANE_TYPE plane_type = get_plane_type(plane); - TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size); + TX_TYPE tx_type = + av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); const SCAN_ORDER *const scan_order = - get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi)); + get_scan(cm, tx_size, tx_type, &xd->mi[0]->mbmi); BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size]; int bh = block_size_high[txm_bsize]; int bw = block_size_wide[txm_bsize]; - int txb_h = tx_size_high_unit[tx_size]; - int txb_w = tx_size_wide_unit[tx_size]; - int src_stride = p->src.stride; uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; @@ -3914,30 +4337,15 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, #else DECLARE_ALIGNED(16, uint8_t, rec_buffer[MAX_TX_SQUARE]); #endif // CONFIG_HIGHBITDEPTH - int max_blocks_high = block_size_high[plane_bsize]; - int max_blocks_wide = block_size_wide[plane_bsize]; - const int diff_stride = max_blocks_wide; + const int diff_stride = block_size_wide[plane_bsize]; const int16_t *diff = &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]]; int txb_coeff_cost; assert(tx_size < TX_SIZES_ALL); - if (xd->mb_to_bottom_edge < 0) - max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y); - if (xd->mb_to_right_edge < 0) - max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x); - - max_blocks_high >>= tx_size_wide_log2[0]; - max_blocks_wide >>= tx_size_wide_log2[0]; - int coeff_ctx = get_entropy_context(tx_size, a, l); - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - coeff_ctx, AV1_XFORM_QUANT_FP); - - av1_optimize_b(cm, x, plane, block, plane_bsize, tx_size, a, l); - // TODO(any): Use av1_dist_block to compute distortion #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -3954,21 +4362,35 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, 0, bw, bh); #endif // CONFIG_HIGHBITDEPTH - if (blk_row + txb_h > 
max_blocks_high || blk_col + txb_w > max_blocks_wide) { - int idx, idy; - int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row); - int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col); - tmp = 0; - for (idy = 0; idy < blocks_height; ++idy) { - for (idx = 0; idx < blocks_width; ++idx) { - const int16_t *d = - diff + ((idy * diff_stride + idx) << tx_size_wide_log2[0]); - tmp += sum_squares_2d(d, diff_stride, 0); - } - } - } else { - tmp = sum_squares_2d(diff, diff_stride, tx_size); +#if DISABLE_TRELLISQ_SEARCH + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + coeff_ctx, AV1_XFORM_QUANT_B); + +#else + av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + coeff_ctx, AV1_XFORM_QUANT_FP); + + const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + const int buffer_length = tx_size_2d[tx_size]; + int64_t tmp_dist; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + tmp_dist = + av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd) >> + shift; + else +#endif + tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp) >> shift; + + if (RDCOST(x->rdmult, 0, tmp_dist) < rd_stats->ref_rdcost) { + av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, + a, l); } +#endif // DISABLE_TRELLISQ_SEARCH + + tmp = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col, + plane_bsize, txm_bsize); #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) @@ -3977,36 +4399,48 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, rd_stats->sse += tmp * 16; const int eob = p->eobs[block]; +#if CONFIG_LGT + PREDICTION_MODE mode = get_prediction_mode(xd->mi[0], plane, tx_size, block); + av1_inverse_transform_block(xd, dqcoeff, mode, tx_type, tx_size, rec_buffer, + MAX_TX_SIZE, eob); +#else av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, rec_buffer, MAX_TX_SIZE, eob); +#endif if (eob > 0) { - if (txb_w + blk_col > max_blocks_wide || - txb_h + blk_row > max_blocks_high) { - int idx, idy; - unsigned int this_dist; - int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row); - int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col); - tmp = 0; - for (idy = 0; idy < blocks_height; ++idy) { - for (idx = 0; idx < blocks_width; ++idx) { - uint8_t *const s = - src + ((idy * src_stride + idx) << tx_size_wide_log2[0]); - uint8_t *const r = - rec_buffer + ((idy * MAX_TX_SIZE + idx) << tx_size_wide_log2[0]); - cpi->fn_ptr[0].vf(s, src_stride, r, MAX_TX_SIZE, &this_dist); - tmp += this_dist; - } +#if CONFIG_DIST_8X8 + if (plane == 0 && (bw < 8 && bh < 8)) { + // Save sub8x8 luma decoded pixels + // since 8x8 luma decoded pixels are not available for daala-dist + // after recursive split of BLOCK_8x8 is done. 
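The copy loops that follow exist because pd->pred is a 16-bit scratch buffer while the reconstruction may be stored either as 8-bit samples or as high-bit-depth samples reached through CONVERT_TO_SHORTPTR(). A minimal illustrative sketch of the same widening copy, with a plain is_hbd flag in place of the YV12_FLAG_HIGHBITDEPTH check (not aom code):

#include <stdint.h>

/* Widen a bw x bh reconstruction block into a 16-bit buffer so a later
 * 8x8 distortion pass can read it regardless of bit depth. */
static void save_recon_16bit(const void *recon, int recon_stride, int is_hbd,
                             int16_t *out, int out_stride, int bw, int bh) {
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) {
      out[r * out_stride + c] =
          is_hbd ? (int16_t)((const uint16_t *)recon)[r * recon_stride + c]
                 : (int16_t)((const uint8_t *)recon)[r * recon_stride + c];
    }
  }
}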
+ const int pred_stride = block_size_wide[plane_bsize]; + const int pred_idx = (blk_row * pred_stride + blk_col) + << tx_size_wide_log2[0]; + int16_t *decoded = &pd->pred[pred_idx]; + int i, j; + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (j = 0; j < bh; j++) + for (i = 0; i < bw; i++) + decoded[j * pred_stride + i] = + CONVERT_TO_SHORTPTR(rec_buffer)[j * MAX_TX_SIZE + i]; + } else { +#endif + for (j = 0; j < bh; j++) + for (i = 0; i < bw; i++) + decoded[j * pred_stride + i] = rec_buffer[j * MAX_TX_SIZE + i]; +#if CONFIG_HIGHBITDEPTH } - } else { - uint32_t this_dist; - cpi->fn_ptr[txm_bsize].vf(src, src_stride, rec_buffer, MAX_TX_SIZE, - &this_dist); - tmp = this_dist; +#endif // CONFIG_HIGHBITDEPTH } +#endif // CONFIG_DIST_8X8 + tmp = pixel_dist(cpi, x, plane, src, src_stride, rec_buffer, MAX_TX_SIZE, + blk_row, blk_col, plane_bsize, txm_bsize); } rd_stats->dist += tmp * 16; - txb_coeff_cost = - av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order, a, l, 0); + txb_coeff_cost = av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, + tx_size, scan_order, a, l, 0); rd_stats->rate += txb_coeff_cost; rd_stats->skip &= (eob == 0); @@ -4038,14 +4472,35 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int64_t this_rd = INT64_MAX; ENTROPY_CONTEXT *pta = ta + blk_col; ENTROPY_CONTEXT *ptl = tl + blk_row; - int coeff_ctx, i; + int i; int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, mbmi->sb_type, tx_size); int64_t sum_rd = INT64_MAX; int tmp_eob = 0; int zero_blk_rate; RD_STATS sum_rd_stats; - const int tx_size_ctx = txsize_sqr_map[tx_size]; +#if CONFIG_TXK_SEL + TX_TYPE best_tx_type = TX_TYPES; + int txk_idx = (blk_row << 4) + blk_col; +#endif +#if CONFIG_RECT_TX_EXT + TX_SIZE quarter_txsize = quarter_txsize_lookup[mbmi->sb_type]; + int check_qttx = is_quarter_tx_allowed(xd, mbmi, is_inter_block(mbmi)) && + tx_size == max_txsize_rect_lookup[mbmi->sb_type] && + quarter_txsize != tx_size; + int is_qttx_picked = 0; + int eobs_qttx[2] = { 0, 0 }; + int skip_qttx[2] = { 0, 0 }; + int block_offset_qttx = check_qttx + ? tx_size_wide_unit[quarter_txsize] * + tx_size_high_unit[quarter_txsize] + : 0; + int blk_row_offset, blk_col_offset; + int is_wide_qttx = + tx_size_wide_unit[quarter_txsize] > tx_size_high_unit[quarter_txsize]; + blk_row_offset = is_wide_qttx ? tx_size_high_unit[quarter_txsize] : 0; + blk_col_offset = is_wide_qttx ? 
0 : tx_size_wide_unit[quarter_txsize]; +#endif av1_init_rd_stats(&sum_rd_stats); @@ -4056,15 +4511,25 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, return; } - coeff_ctx = get_entropy_context(tx_size, pta, ptl); - av1_init_rd_stats(rd_stats); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; +#if CONFIG_LV_MAP + TX_SIZE txs_ctx = get_txsize_context(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, pta, ptl, &txb_ctx); + zero_blk_rate = + av1_cost_bit(xd->fc->txb_skip[txs_ctx][txb_ctx.txb_skip_ctx], 1); +#else + int tx_size_ctx = txsize_sqr_map[tx_size]; + int coeff_ctx = get_entropy_context(tx_size, pta, ptl); zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0] [coeff_ctx][EOB_TOKEN]; +#endif + rd_stats->ref_rdcost = ref_best_rd; + rd_stats->zero_rate = zero_blk_rate; if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) { inter_tx_size[0][0] = tx_size; @@ -4081,8 +4546,8 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, } } - if ((RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist) >= - RDCOST(x->rdmult, x->rddiv, zero_blk_rate, rd_stats->sse) || + if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || rd_stats->skip == 1) && !xd->lossless[mbmi->segment_id]) { #if CONFIG_RD_DEBUG @@ -4094,6 +4559,9 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, rd_stats->skip = 1; x->blk_skip[plane][blk_row * bw + blk_col] = 1; p->eobs[block] = 0; +#if CONFIG_TXK_SEL + mbmi->txk_type[txk_idx] = DCT_DCT; +#endif } else { x->blk_skip[plane][blk_row * bw + blk_col] = 0; rd_stats->skip = 0; @@ -4102,23 +4570,143 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) rd_stats->rate += av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0); - this_rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist); +#if CONFIG_RECT_TX_EXT + if (check_qttx) { + assert(blk_row == 0 && blk_col == 0); + rd_stats->rate += av1_cost_bit(cpi->common.fc->quarter_tx_size_prob, 0); + } +#endif + this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); +#if CONFIG_LV_MAP + tmp_eob = p->txb_entropy_ctx[block]; +#else tmp_eob = p->eobs[block]; +#endif + +#if CONFIG_TXK_SEL + best_tx_type = mbmi->txk_type[txk_idx]; +#endif + +#if CONFIG_RECT_TX_EXT + if (check_qttx) { + assert(blk_row == 0 && blk_col == 0 && block == 0 && plane == 0); + + RD_STATS rd_stats_tmp, rd_stats_qttx; + int64_t rd_qttx; + + av1_init_rd_stats(&rd_stats_qttx); + av1_init_rd_stats(&rd_stats_tmp); + + av1_tx_block_rd_b(cpi, x, quarter_txsize, 0, 0, plane, 0, plane_bsize, + pta, ptl, &rd_stats_qttx); + + tx_size_ctx = txsize_sqr_map[quarter_txsize]; + coeff_ctx = get_entropy_context(quarter_txsize, pta, ptl); + zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0] + [coeff_ctx][EOB_TOKEN]; + if ((RDCOST(x->rdmult, rd_stats_qttx.rate, rd_stats_qttx.dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats_qttx.sse) || + rd_stats_qttx.skip == 1) && + !xd->lossless[mbmi->segment_id]) { +#if CONFIG_RD_DEBUG + av1_update_txb_coeff_cost(&rd_stats_qttx, plane, quarter_txsize, 0, 0, + zero_blk_rate - rd_stats_qttx.rate); +#endif // CONFIG_RD_DEBUG + rd_stats_qttx.rate = zero_blk_rate; + rd_stats_qttx.dist = rd_stats_qttx.sse; + rd_stats_qttx.skip = 1; + x->blk_skip[plane][blk_row * bw + blk_col] = 1; + skip_qttx[0] = 1; + p->eobs[block] = 0; + } else { 
+ x->blk_skip[plane][blk_row * bw + blk_col] = 0; + skip_qttx[0] = 0; + rd_stats->skip = 0; + } + + // Second tx block + av1_tx_block_rd_b(cpi, x, quarter_txsize, blk_row_offset, blk_col_offset, + plane, block_offset_qttx, plane_bsize, pta, ptl, + &rd_stats_tmp); + + av1_set_txb_context(x, plane, 0, quarter_txsize, pta, ptl); + coeff_ctx = get_entropy_context(quarter_txsize, pta + blk_col_offset, + ptl + blk_row_offset); + zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0] + [coeff_ctx][EOB_TOKEN]; + if ((RDCOST(x->rdmult, rd_stats_tmp.rate, rd_stats_tmp.dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats_tmp.sse) || + rd_stats_tmp.skip == 1) && + !xd->lossless[mbmi->segment_id]) { +#if CONFIG_RD_DEBUG + av1_update_txb_coeff_cost(&rd_stats_tmp, plane, quarter_txsize, 0, 0, + zero_blk_rate - rd_stats_tmp.rate); +#endif // CONFIG_RD_DEBUG + rd_stats_tmp.rate = zero_blk_rate; + rd_stats_tmp.dist = rd_stats_tmp.sse; + rd_stats_tmp.skip = 1; + x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = 1; + skip_qttx[1] = 1; + p->eobs[block_offset_qttx] = 0; + } else { + x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = 0; + skip_qttx[1] = 0; + rd_stats_tmp.skip = 0; + } + + av1_merge_rd_stats(&rd_stats_qttx, &rd_stats_tmp); + + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) { + rd_stats_qttx.rate += + av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0); + } + rd_stats_qttx.rate += + av1_cost_bit(cpi->common.fc->quarter_tx_size_prob, 1); + rd_qttx = RDCOST(x->rdmult, rd_stats_qttx.rate, rd_stats_qttx.dist); +#if CONFIG_LV_MAP + eobs_qttx[0] = p->txb_entropy_ctx[0]; + eobs_qttx[1] = p->txb_entropy_ctx[block_offset_qttx]; +#else + eobs_qttx[0] = p->eobs[0]; + eobs_qttx[1] = p->eobs[block_offset_qttx]; +#endif + if (rd_qttx < this_rd) { + is_qttx_picked = 1; + this_rd = rd_qttx; + rd_stats->rate = rd_stats_qttx.rate; + rd_stats->dist = rd_stats_qttx.dist; + rd_stats->sse = rd_stats_qttx.sse; + rd_stats->skip = rd_stats_qttx.skip; + rd_stats->rdcost = rd_stats_qttx.rdcost; + } + av1_get_entropy_contexts(plane_bsize, 0, pd, ta, tl); + } +#endif } +#if CONFIG_MRC_TX + // If the tx type we are trying is MRC_DCT, we cannot partition the transform + // into anything smaller than TX_32X32 + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH && mbmi->tx_type != MRC_DCT) { +#else if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) { +#endif const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsl = tx_size_wide_unit[sub_txs]; int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; RD_STATS this_rd_stats; int this_cost_valid = 1; int64_t tmp_rd = 0; - +#if CONFIG_DIST_8X8 + int sub8x8_eob[4]; +#endif sum_rd_stats.rate = av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1); assert(tx_size < TX_SIZES_ALL); + ref_best_rd = AOMMIN(this_rd, ref_best_rd); + for (i = 0; i < 4 && this_cost_valid; ++i) { int offsetr = blk_row + (i >> 1) * bsl; int offsetc = blk_col + (i & 0x01) * bsl; @@ -4129,30 +4717,170 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, depth + 1, plane_bsize, ta, tl, tx_above, tx_left, &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid, rd_stats_stack); - +#if CONFIG_DIST_8X8 + if (plane == 0 && tx_size == TX_8X8) { + sub8x8_eob[i] = p->eobs[block]; + } +#endif // CONFIG_DIST_8X8 av1_merge_rd_stats(&sum_rd_stats, &this_rd_stats); - tmp_rd = - RDCOST(x->rdmult, x->rddiv, sum_rd_stats.rate, sum_rd_stats.dist); + tmp_rd = RDCOST(x->rdmult, sum_rd_stats.rate, sum_rd_stats.dist); +#if !CONFIG_DIST_8X8 if 
(this_rd < tmp_rd) break; +#endif block += sub_step; } +#if CONFIG_DIST_8X8 + if (this_cost_valid && plane == 0 && tx_size == TX_8X8) { + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + + const uint8_t *src = + &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]]; + const uint8_t *dst = + &pd->dst + .buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]]; + + int64_t dist_8x8; + int qindex = x->qindex; + const int pred_stride = block_size_wide[plane_bsize]; + const int pred_idx = (blk_row * pred_stride + blk_col) + << tx_size_wide_log2[0]; + int16_t *pred = &pd->pred[pred_idx]; + int j; + int row, col; + +#if CONFIG_HIGHBITDEPTH + uint8_t *pred8; + DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]); +#else + DECLARE_ALIGNED(16, uint8_t, pred8[8 * 8]); +#endif // CONFIG_HIGHBITDEPTH + + dist_8x8 = av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride, + BLOCK_8X8, 8, 8, 8, 8, qindex) * + 16; + sum_rd_stats.sse = dist_8x8; + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + pred8 = CONVERT_TO_BYTEPTR(pred8_16); + else + pred8 = (uint8_t *)pred8_16; +#endif + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (row = 0; row < 2; ++row) { + for (col = 0; col < 2; ++col) { + int idx = row * 2 + col; + int eob = sub8x8_eob[idx]; + + if (eob > 0) { + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + CONVERT_TO_SHORTPTR(pred8) + [(row * 4 + j) * 8 + 4 * col + i] = + pred[(row * 4 + j) * pred_stride + 4 * col + i]; + } else { + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + CONVERT_TO_SHORTPTR(pred8) + [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR( + dst)[(row * 4 + j) * dst_stride + 4 * col + i]; + } + } + } + } else { +#endif + for (row = 0; row < 2; ++row) { + for (col = 0; col < 2; ++col) { + int idx = row * 2 + col; + int eob = sub8x8_eob[idx]; + + if (eob > 0) { + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + pred8[(row * 4 + j) * 8 + 4 * col + i] = + pred[(row * 4 + j) * pred_stride + 4 * col + i]; + } else { + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + pred8[(row * 4 + j) * 8 + 4 * col + i] = + dst[(row * 4 + j) * dst_stride + 4 * col + i]; + } + } + } +#if CONFIG_HIGHBITDEPTH + } +#endif // CONFIG_HIGHBITDEPTH + dist_8x8 = av1_dist_8x8(cpi, xd, src, src_stride, pred8, 8, BLOCK_8X8, 8, + 8, 8, 8, qindex) * + 16; + sum_rd_stats.dist = dist_8x8; + tmp_rd = RDCOST(x->rdmult, sum_rd_stats.rate, sum_rd_stats.dist); + } +#endif // CONFIG_DIST_8X8 if (this_cost_valid) sum_rd = tmp_rd; } if (this_rd < sum_rd) { int idx, idy; - for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) pta[i] = !(tmp_eob == 0); - for (i = 0; i < tx_size_high_unit[tx_size]; ++i) ptl[i] = !(tmp_eob == 0); +#if CONFIG_RECT_TX_EXT + TX_SIZE tx_size_selected = is_qttx_picked ? 
quarter_txsize : tx_size; +#else + TX_SIZE tx_size_selected = tx_size; +#endif + +#if CONFIG_RECT_TX_EXT + if (is_qttx_picked) { + assert(blk_row == 0 && blk_col == 0 && plane == 0); +#if CONFIG_LV_MAP + p->txb_entropy_ctx[0] = eobs_qttx[0]; + p->txb_entropy_ctx[block_offset_qttx] = eobs_qttx[1]; +#else + p->eobs[0] = eobs_qttx[0]; + p->eobs[block_offset_qttx] = eobs_qttx[1]; +#endif + } else { +#endif +#if CONFIG_LV_MAP + p->txb_entropy_ctx[block] = tmp_eob; +#else + p->eobs[block] = tmp_eob; +#endif +#if CONFIG_RECT_TX_EXT + } +#endif + + av1_set_txb_context(x, plane, block, tx_size_selected, pta, ptl); +#if CONFIG_RECT_TX_EXT + if (is_qttx_picked) + av1_set_txb_context(x, plane, block_offset_qttx, tx_size_selected, + pta + blk_col_offset, ptl + blk_row_offset); +#endif + txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, tx_size); - inter_tx_size[0][0] = tx_size; + inter_tx_size[0][0] = tx_size_selected; for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy) for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx) - inter_tx_size[idy][idx] = tx_size; - mbmi->tx_size = tx_size; + inter_tx_size[idy][idx] = tx_size_selected; + mbmi->tx_size = tx_size_selected; +#if CONFIG_TXK_SEL + mbmi->txk_type[txk_idx] = best_tx_type; +#endif if (this_rd == INT64_MAX) *is_cost_valid = 0; - x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip; +#if CONFIG_RECT_TX_EXT + if (is_qttx_picked) { + x->blk_skip[plane][0] = skip_qttx[0]; + x->blk_skip[plane][blk_row_offset * bw + blk_col_offset] = skip_qttx[1]; + } else { +#endif + x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip; +#if CONFIG_RECT_TX_EXT + } +#endif } else { *rd_stats = sum_rd_stats; if (sum_rd == INT64_MAX) *is_cost_valid = 0; @@ -4201,17 +4929,16 @@ static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, tx_above, tx_left, &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid, rd_stats_stack); av1_merge_rd_stats(rd_stats, &pn_rd_stats); - this_rd += AOMMIN( - RDCOST(x->rdmult, x->rddiv, pn_rd_stats.rate, pn_rd_stats.dist), - RDCOST(x->rdmult, x->rddiv, 0, pn_rd_stats.sse)); + this_rd += AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist), + RDCOST(x->rdmult, 0, pn_rd_stats.sse)); block += step; ++block32; } } } - this_rd = AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist), - RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse)); + this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist), + RDCOST(x->rdmult, 0, rd_stats->sse)); if (this_rd > ref_best_rd) is_cost_valid = 0; if (!is_cost_valid) { @@ -4247,6 +4974,7 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x, mbmi->min_tx_size = AOMMIN( mbmi->min_tx_size, get_min_tx_size(mbmi->inter_tx_size[row][col])); +#if !CONFIG_TXK_SEL #if CONFIG_EXT_TX if (get_ext_tx_types(mbmi->min_tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 && @@ -4266,20 +4994,21 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x, [mbmi->tx_type]; } } -#else // CONFIG_EXT_TX +#else if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id]) rd_stats->rate += cpi->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type]; #endif // CONFIG_EXT_TX +#endif // CONFIG_TXK_SEL if (rd_stats->skip) - rd = RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse); + rd = RDCOST(x->rdmult, s1, rd_stats->sse); else - rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate + s0, rd_stats->dist); + rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist); if (is_inter && 
!xd->lossless[xd->mi[0]->mbmi.segment_id] && !(rd_stats->skip)) - rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse)); + rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse)); return rd; } @@ -4299,6 +5028,12 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE best_tx = max_txsize_lookup[bsize]; TX_SIZE best_min_tx_size = TX_SIZES_ALL; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; + TX_TYPE txk_start = DCT_DCT; +#if CONFIG_TXK_SEL + TX_TYPE txk_end = DCT_DCT + 1; +#else + TX_TYPE txk_end = TX_TYPES; +#endif const int n4 = bsize_to_num_blk(bsize); int idx, idy; int prune = 0; @@ -4326,9 +5061,14 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, for (idx = 0; idx < count32; ++idx) av1_invalid_rd_stats(&rd_stats_stack[idx]); - for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) { + for (tx_type = txk_start; tx_type < txk_end; ++tx_type) { RD_STATS this_rd_stats; av1_init_rd_stats(&this_rd_stats); +#if CONFIG_MRC_TX + // MRC_DCT only implemented for TX_32X32 so only include this tx in + // the search for TX_32X32 + if (tx_type == MRC_DCT && max_tx_size != TX_32X32) continue; +#endif // CONFIG_MRC_TX #if CONFIG_EXT_TX if (is_inter) { if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue; @@ -4384,7 +5124,6 @@ static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; const int tx_row = blk_row >> (1 - pd->subsampling_y); @@ -4402,16 +5141,11 @@ static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, : mbmi->inter_tx_size[tx_row][tx_col]; if (tx_size == plane_tx_size) { - int i; ENTROPY_CONTEXT *ta = above_ctx + blk_col; ENTROPY_CONTEXT *tl = left_ctx + blk_row; av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize, ta, tl, rd_stats); - - for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) - ta[i] = !(p->eobs[block] == 0); - for (i = 0; i < tx_size_high_unit[tx_size]; ++i) - tl[i] = !(p->eobs[block] == 0); + av1_set_txb_context(x, plane, block, tx_size, ta, tl); } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsl = tx_size_wide_unit[sub_txs]; @@ -4498,9 +5232,8 @@ static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, av1_merge_rd_stats(rd_stats, &pn_rd_stats); - this_rd = - AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist), - RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse)); + this_rd = AOMMIN(RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist), + RDCOST(x->rdmult, 0, rd_stats->sse)); if (this_rd > ref_best_rd) { is_cost_valid = 0; @@ -4543,7 +5276,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, &plane_block_height, &rows, &cols); if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return; - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; #if CONFIG_FILTER_INTRA mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; #endif // CONFIG_FILTER_INTRA @@ -4689,7 +5422,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, } } - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; *best_mbmi = *mbmi; @@ -4727,7 +5460,7 @@ static int 
rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, av1_zero(filter_intra_mode_info); mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 1; - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; #if CONFIG_PALETTE mbmi->palette_mode_info.palette_size[1] = 0; #endif // CONFIG_PALETTE @@ -4741,7 +5474,7 @@ static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 1) + cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] + write_uniform_cost(FILTER_INTRA_MODES, mode); - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; *rate = this_rate; @@ -4754,7 +5487,7 @@ static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, } if (filter_intra_selected_flag) { - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = filter_intra_mode_info.use_filter_intra_mode[1]; mbmi->filter_intra_mode_info.filter_intra_mode[1] = @@ -4782,7 +5515,7 @@ static int64_t pick_intra_angle_routine_sbuv( if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in)) return INT64_MAX; this_rate = tokenonly_rd_stats.rate + rate_overhead; - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; *best_angle_delta = mbmi->angle_delta[1]; @@ -4852,8 +5585,172 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, } #endif // CONFIG_EXT_INTRA +#if CONFIG_CFL +static int64_t cfl_alpha_dist(const uint8_t *y_pix, int y_stride, + const int y_averages_q3[MAX_NUM_TXB], + const uint8_t *src, int src_stride, int width, + int height, TX_SIZE tx_size, int dc_pred, + int alpha_q3, int64_t *dist_neg_out) { + int64_t dist = 0; + int diff; + + if (alpha_q3 == 0) { + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + diff = src[i] - dc_pred; + dist += diff * diff; + } + src += src_stride; + } + + if (dist_neg_out) *dist_neg_out = dist; + + return dist; + } + + int64_t dist_neg = 0; + const int tx_height = tx_size_high[tx_size]; + const int tx_width = tx_size_wide[tx_size]; + const int y_block_row_off = y_stride * tx_height; + const int src_block_row_off = src_stride * tx_height; + const uint8_t *t_y_pix; + const uint8_t *t_src; + int a = 0; + for (int b_j = 0; b_j < height; b_j += tx_height) { + const int h = b_j + tx_height; + for (int b_i = 0; b_i < width; b_i += tx_width) { + const int w = b_i + tx_width; + const int tx_avg_q3 = y_averages_q3[a++]; + t_y_pix = y_pix; + t_src = src; + for (int t_j = b_j; t_j < h; t_j++) { + for (int t_i = b_i; t_i < w; t_i++) { + const int uv = t_src[t_i]; + + const int scaled_luma = + get_scaled_luma_q0(alpha_q3, t_y_pix[t_i], tx_avg_q3); + + // TODO(ltrudeau) add support for HBD. + diff = uv - clamp(scaled_luma + dc_pred, 0, 255); + dist += diff * diff; + + // TODO(ltrudeau) add support for HBD. 
+ diff = uv - clamp(-scaled_luma + dc_pred, 0, 255); + dist_neg += diff * diff; + } + t_y_pix += y_stride; + t_src += src_stride; + } + } + y_pix += y_block_row_off; + src += src_block_row_off; + } + + if (dist_neg_out) *dist_neg_out = dist_neg; + + return dist; +} + +static inline void cfl_update_costs(CFL_CTX *cfl, FRAME_CONTEXT *ec_ctx) { + assert(ec_ctx->cfl_alpha_cdf[CFL_ALPHABET_SIZE - 1] == + AOM_ICDF(CDF_PROB_TOP)); + + aom_cdf_prob prev_cdf = 0; + + for (int c = 0; c < CFL_ALPHABET_SIZE; c++) { + const int sign_bit_cost = (cfl_alpha_codes[c][CFL_PRED_U] != 0) + + (cfl_alpha_codes[c][CFL_PRED_V] != 0); + + aom_cdf_prob prob = AOM_ICDF(ec_ctx->cfl_alpha_cdf[c]) - prev_cdf; + prev_cdf = AOM_ICDF(ec_ctx->cfl_alpha_cdf[c]); + + cfl->costs[c] = av1_cost_symbol(prob) + av1_cost_literal(sign_bit_cost); + } +} + +static int cfl_rd_pick_alpha(MACROBLOCK *const x, TX_SIZE tx_size) { + const struct macroblock_plane *const p_u = &x->plane[AOM_PLANE_U]; + const struct macroblock_plane *const p_v = &x->plane[AOM_PLANE_V]; + const uint8_t *const src_u = p_u->src.buf; + const uint8_t *const src_v = p_v->src.buf; + const int src_stride_u = p_u->src.stride; + const int src_stride_v = p_v->src.stride; + + MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + + CFL_CTX *const cfl = xd->cfl; + cfl_compute_parameters(xd, tx_size); + const int width = cfl->uv_width; + const int height = cfl->uv_height; + const int dc_pred_u = cfl->dc_pred[CFL_PRED_U]; + const int dc_pred_v = cfl->dc_pred[CFL_PRED_V]; + const int *y_averages_q3 = cfl->y_averages_q3; + const uint8_t *y_pix = cfl->y_down_pix; + + CFL_SIGN_TYPE *signs = mbmi->cfl_alpha_signs; + + cfl_update_costs(cfl, ec_ctx); + + int64_t sse[CFL_PRED_PLANES][CFL_MAGS_SIZE]; + sse[CFL_PRED_U][0] = + cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, + width, height, tx_size, dc_pred_u, 0, NULL); + sse[CFL_PRED_V][0] = + cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, + width, height, tx_size, dc_pred_v, 0, NULL); + + for (int m = 1; m < CFL_MAGS_SIZE; m += 2) { + assert(cfl_alpha_mags_q3[m + 1] == -cfl_alpha_mags_q3[m]); + sse[CFL_PRED_U][m] = cfl_alpha_dist( + y_pix, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, width, height, + tx_size, dc_pred_u, cfl_alpha_mags_q3[m], &sse[CFL_PRED_U][m + 1]); + sse[CFL_PRED_V][m] = cfl_alpha_dist( + y_pix, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, width, height, + tx_size, dc_pred_v, cfl_alpha_mags_q3[m], &sse[CFL_PRED_V][m + 1]); + } + + int64_t dist; + int64_t cost; + int64_t best_cost; + + // Compute least squares parameter of the entire block + // IMPORTANT: We assume that the first code is 0,0 + int ind = 0; + signs[CFL_PRED_U] = CFL_SIGN_POS; + signs[CFL_PRED_V] = CFL_SIGN_POS; + + dist = sse[CFL_PRED_U][0] + sse[CFL_PRED_V][0]; + dist *= 16; + best_cost = RDCOST(x->rdmult, cfl->costs[0], dist); + + for (int c = 1; c < CFL_ALPHABET_SIZE; c++) { + const int idx_u = cfl_alpha_codes[c][CFL_PRED_U]; + const int idx_v = cfl_alpha_codes[c][CFL_PRED_V]; + for (CFL_SIGN_TYPE sign_u = idx_u == 0; sign_u < CFL_SIGNS; sign_u++) { + for (CFL_SIGN_TYPE sign_v = idx_v == 0; sign_v < CFL_SIGNS; sign_v++) { + dist = sse[CFL_PRED_U][idx_u + (sign_u == CFL_SIGN_NEG)] + + sse[CFL_PRED_V][idx_v + (sign_v == CFL_SIGN_NEG)]; + dist *= 16; + cost = RDCOST(x->rdmult, cfl->costs[c], dist); + if (cost < best_cost) { + best_cost = cost; + ind = c; + signs[CFL_PRED_U] = sign_u; + signs[CFL_PRED_V] = sign_v; + } + } + } + 
} + + mbmi->cfl_alpha_idx = ind; + return cfl->costs[ind]; +} +#endif // CONFIG_CFL + static void init_sbuv_mode(MB_MODE_INFO *const mbmi) { - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; #if CONFIG_PALETTE mbmi->palette_mode_info.palette_size[1] = 0; #endif // CONFIG_PALETTE @@ -4870,20 +5767,19 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; assert(!is_inter_block(mbmi)); MB_MODE_INFO best_mbmi = *mbmi; - PREDICTION_MODE mode; int64_t best_rd = INT64_MAX, this_rd; - int this_rate; - RD_STATS tokenonly_rd_stats; #if CONFIG_PVQ od_rollback_buffer buf; od_encode_checkpoint(&x->daala_enc, &buf); #endif // CONFIG_PVQ #if CONFIG_PALETTE PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - uint8_t *best_palette_color_map = NULL; #endif // CONFIG_PALETTE - for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) { + int this_rate; + RD_STATS tokenonly_rd_stats; + UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx]; #if CONFIG_EXT_INTRA const int is_directional_mode = av1_is_directional_mode(mode, mbmi->sb_type); @@ -4893,9 +5789,16 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, continue; mbmi->uv_mode = mode; +#if CONFIG_CFL + int cfl_alpha_rate = 0; + if (mode == UV_DC_PRED) { + const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]); + cfl_alpha_rate = cfl_rd_pick_alpha(x, uv_tx_size); + } +#endif #if CONFIG_EXT_INTRA mbmi->angle_delta[1] = 0; - if (is_directional_mode) { + if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) { const int rate_overhead = cpi->intra_uv_mode_cost[mbmi->mode][mode] + write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, 0); if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd, @@ -4915,8 +5818,13 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, this_rate = tokenonly_rd_stats.rate + cpi->intra_uv_mode_cost[mbmi->mode][mode]; +#if CONFIG_CFL + if (mode == UV_DC_PRED) { + this_rate += cfl_alpha_rate; + } +#endif #if CONFIG_EXT_INTRA - if (is_directional_mode) { + if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) { this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, MAX_ANGLE_DELTA + mbmi->angle_delta[1]); } @@ -4927,7 +5835,7 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, #endif // CONFIG_FILTER_INTRA #if CONFIG_PALETTE if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8 && - mode == DC_PRED) + mode == UV_DC_PRED) this_rate += av1_cost_bit( av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 0); #endif // CONFIG_PALETTE @@ -4935,7 +5843,7 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, #if CONFIG_PVQ od_encode_rollback(&x->daala_enc, &buf); #endif // CONFIG_PVQ - this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < best_rd) { best_mbmi = *mbmi; @@ -4949,9 +5857,9 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, #if CONFIG_PALETTE if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8) { - best_palette_color_map = x->palette_buffer->best_palette_color_map; + uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map; rd_pick_palette_intra_sbuv(cpi, x, - cpi->intra_uv_mode_cost[mbmi->mode][DC_PRED], + 
cpi->intra_uv_mode_cost[mbmi->mode][UV_DC_PRED], best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly, distortion, skippable); } @@ -4975,7 +5883,7 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize, TX_SIZE max_tx_size, int *rate_uv, int *rate_uv_tokenonly, int64_t *dist_uv, - int *skip_uv, PREDICTION_MODE *mode_uv) { + int *skip_uv, UV_PREDICTION_MODE *mode_uv) { // Use an estimated rd for uv_intra based on DC_PRED if the // appropriate speed flag is set. (void)ctx; @@ -4990,7 +5898,7 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, *rate_uv_tokenonly = 0; *dist_uv = 0; *skip_uv = 1; - *mode_uv = DC_PRED; + *mode_uv = UV_DC_PRED; return; } BLOCK_SIZE bs = scale_chroma_bsize(bsize, x->e_mbd.plane[1].subsampling_x, @@ -5011,6 +5919,12 @@ static int cost_mv_ref(const AV1_COMP *const cpi, PREDICTION_MODE mode, if (is_inter_compound_mode(mode)) { return cpi ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)]; +#if CONFIG_COMPOUND_SINGLEREF + } else if (is_inter_singleref_comp_mode(mode)) { + return cpi + ->inter_singleref_comp_mode_cost[mode_context] + [INTER_SINGLEREF_COMP_OFFSET(mode)]; +#endif // CONFIG_COMPOUND_SINGLEREF } #endif @@ -5096,8 +6010,13 @@ typedef struct { int segment_yrate; PREDICTION_MODE modes[4]; #if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SINGLEREF + SEG_RDSTAT rdstat[4][INTER_MODES + INTER_SINGLEREF_COMP_MODES + + INTER_COMPOUND_MODES]; +#else // !CONFIG_COMPOUND_SINGLEREF SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES]; -#else +#endif // CONFIG_COMPOUND_SINGLEREF +#else // !CONFIG_EXT_INTER SEG_RDSTAT rdstat[4][INTER_MODES]; #endif // CONFIG_EXT_INTER int mvthresh; @@ -5120,27 +6039,28 @@ static int check_best_zero_mv( int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], int this_mode, const MV_REFERENCE_FRAME ref_frames[2], const BLOCK_SIZE bsize, int block, int mi_row, int mi_col) { - int_mv zeromv[2]; + int_mv zeromv[2] = { {.as_int = 0 } }; +#if CONFIG_GLOBAL_MOTION int comp_pred_mode = ref_frames[1] > INTRA_FRAME; - int cur_frm; +#endif (void)mi_row; (void)mi_col; - for (cur_frm = 0; cur_frm < 1 + comp_pred_mode; cur_frm++) { #if CONFIG_GLOBAL_MOTION - if (this_mode == ZEROMV + if (this_mode == ZEROMV #if CONFIG_EXT_INTER - || this_mode == ZERO_ZEROMV + || this_mode == ZERO_ZEROMV #endif // CONFIG_EXT_INTER - ) + ) { + for (int cur_frm = 0; cur_frm < 1 + comp_pred_mode; cur_frm++) { zeromv[cur_frm].as_int = gm_get_motion_vector(&cpi->common.global_motion[ref_frames[cur_frm]], cpi->common.allow_high_precision_mv, bsize, mi_col, mi_row, block) .as_int; - else -#endif // CONFIG_GLOBAL_MOTION - zeromv[cur_frm].as_int = 0; + } } +#endif // CONFIG_GLOBAL_MOTION + #if !CONFIG_EXT_INTER assert(ref_frames[1] != INTRA_FRAME); // Just sanity check #endif // !CONFIG_EXT_INTER @@ -5201,8 +6121,11 @@ static int check_best_zero_mv( } static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, - int mi_col, + BLOCK_SIZE bsize, int_mv *frame_mv, +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + int_mv *frame_comp_mv, +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + int mi_row, int mi_col, #if CONFIG_EXT_INTER int_mv *ref_mv_sub8x8[2], const uint8_t *mask, int mask_stride, @@ -5213,35 +6136,47 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, const int ph = block_size_high[bsize]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = 
&xd->mi[0]->mbmi; - // This function should only ever be called for compound modes +// This function should only ever be called for compound modes +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) { + assert(is_inter_singleref_comp_mode(mbmi->mode)); + assert(frame_comp_mv); + } + assert(has_second_ref(mbmi) || is_inter_singleref_comp_mode(mbmi->mode)); + const int refs[2] = { mbmi->ref_frame[0], has_second_ref(mbmi) + ? mbmi->ref_frame[1] + : mbmi->ref_frame[0] }; +#else assert(has_second_ref(mbmi)); const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] }; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF int_mv ref_mv[2]; int ite, ref; -#if CONFIG_DUAL_FILTER - InterpFilter interp_filter[4] = { - mbmi->interp_filter[0], mbmi->interp_filter[1], mbmi->interp_filter[2], - mbmi->interp_filter[3], - }; -#else - const InterpFilter interp_filter = mbmi->interp_filter; -#endif // CONFIG_DUAL_FILTER struct scale_factors sf; - struct macroblockd_plane *const pd = &xd->plane[0]; #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION // ic and ir are the 4x4 coordiantes of the sub8x8 at index "block" const int ic = block & 1; const int ir = (block - ic) >> 1; + struct macroblockd_plane *const pd = &xd->plane[0]; const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic; const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir; #if CONFIG_GLOBAL_MOTION int is_global[2]; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { +#else for (ref = 0; ref < 2; ++ref) { +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF WarpedMotionParams *const wm = &xd->global_motion[xd->mi[0]->mbmi.ref_frame[ref]]; is_global[ref] = is_global_mv_block(xd->mi[0], block, wm->wmtype); } +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) is_global[1] = is_global[0]; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF #endif // CONFIG_GLOBAL_MOTION +#else // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION + (void)block; #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION // Do joint motion search in compound mode to get more accurate mv. @@ -5264,7 +6199,11 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, (void)ref_mv_sub8x8; #endif // CONFIG_EXT_INTER && CONFIG_CB4X4 +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { +#else for (ref = 0; ref < 2; ++ref) { +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF #if CONFIG_EXT_INTER && !CONFIG_CB4X4 if (bsize < BLOCK_8X8 && ref_mv_sub8x8 != NULL) ref_mv[ref].as_int = ref_mv_sub8x8[ref]->as_int; @@ -5284,6 +6223,24 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, } } +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) { + assert(is_inter_singleref_comp_mode(mbmi->mode)); + // NOTE: For single ref comp mode, set up the 2nd set of ref_mv/pre_planes + // all from the 1st reference frame, i.e. refs[0]. + ref_mv[1] = x->mbmi_ext->ref_mvs[refs[0]][0]; + if (scaled_ref_frame[0]) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
+ for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[1][i] = xd->plane[i].pre[1]; + av1_setup_pre_planes(xd, 1, scaled_ref_frame[0], mi_row, mi_col, NULL); + } + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // Since we have scaled the reference frames to match the size of the current // frame we must use a unit scaling factor during mode selection. #if CONFIG_HIGHBITDEPTH @@ -5294,9 +6251,16 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, cm->height); #endif // CONFIG_HIGHBITDEPTH - // Allow joint search multiple times iteratively for each reference frame - // and break out of the search loop if it couldn't find a better mv. +// Allow joint search multiple times iteratively for each reference frame +// and break out of the search loop if it couldn't find a better mv. +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + const int num_ites = + (has_second_ref(mbmi) || mbmi->mode == SR_NEW_NEWMV) ? 4 : 1; + const int start_ite = has_second_ref(mbmi) ? 0 : 1; + for (ite = start_ite; ite < (start_ite + num_ites); ite++) { +#else for (ite = 0; ite < 4; ite++) { +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF struct buf_2d ref_yv12[2]; int bestsme = INT_MAX; int sadpb = x->sadperbit16; @@ -5308,7 +6272,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, // odd iterations search in the second. The predictor // found for the 'other' reference frame is factored in. const int plane = 0; - ConvolveParams conv_params = get_conv_params(0, plane); + ConvolveParams conv_params = get_conv_params(!id, 0, plane); #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION WarpTypesAllowed warp_types; #if CONFIG_GLOBAL_MOTION @@ -5323,21 +6287,24 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, ref_yv12[0] = xd->plane[plane].pre[0]; ref_yv12[1] = xd->plane[plane].pre[1]; -#if CONFIG_DUAL_FILTER - // reload the filter types - interp_filter[0] = - (id == 0) ? mbmi->interp_filter[2] : mbmi->interp_filter[0]; - interp_filter[1] = - (id == 0) ? mbmi->interp_filter[3] : mbmi->interp_filter[1]; -#endif // CONFIG_DUAL_FILTER - // Get the prediction block from the 'other' reference frame. +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + MV *const the_other_mv = (has_second_ref(mbmi) || id) + ? 
&frame_mv[refs[!id]].as_mv + : &frame_comp_mv[refs[0]].as_mv; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); av1_highbd_build_inter_predictor( ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, - &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, interp_filter, +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + the_other_mv, +#else // !(CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF) + &frame_mv[refs[!id]].as_mv, +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + &sf, pw, ph, 0, mbmi->interp_filter, #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION &warp_types, p_col, p_row, #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION @@ -5347,7 +6314,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, #endif // CONFIG_HIGHBITDEPTH av1_build_inter_predictor( ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, - &frame_mv[refs[!id]].as_mv, &sf, pw, ph, &conv_params, interp_filter, +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + the_other_mv, +#else // !(CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF) + &frame_mv[refs[!id]].as_mv, +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + &sf, pw, ph, &conv_params, mbmi->interp_filter, #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION &warp_types, p_col, p_row, plane, !id, #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION @@ -5360,13 +6332,24 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, if (id) xd->plane[plane].pre[0] = ref_yv12[id]; av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv); - // Use the mv result from the single mode as mv predictor. - *best_mv = frame_mv[refs[id]].as_mv; +// Use the mv result from the single mode as mv predictor. +// Use the mv result from the single mode as mv predictor. +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi) && id) + *best_mv = frame_comp_mv[refs[0]].as_mv; + else +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + *best_mv = frame_mv[refs[id]].as_mv; best_mv->col >>= 3; best_mv->row >>= 3; - av1_set_mvcost(x, refs[id], id, mbmi->ref_mv_idx); +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) + av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); + else +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + av1_set_mvcost(x, refs[id], id, mbmi->ref_mv_idx); // Small-range full-pixel motion search. bestsme = @@ -5392,60 +6375,33 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; - if (cpi->sf.use_upsampled_references) { - // Use up-sampled reference frames. 
- struct buf_2d backup_pred = pd->pre[0]; - const YV12_BUFFER_CONFIG *upsampled_ref = - get_upsampled_ref(cpi, refs[id]); - - // Set pred for Y plane - setup_pred_plane(&pd->pre[0], bsize, upsampled_ref->y_buffer, - upsampled_ref->y_crop_width, - upsampled_ref->y_crop_height, upsampled_ref->y_stride, - (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x, - pd->subsampling_y); - -// If bsize < BLOCK_8X8, adjust pred pointer for this block -#if !CONFIG_CB4X4 - if (bsize < BLOCK_8X8) - pd->pre[0].buf = - &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, block, - pd->pre[0].stride)) - << 3]; -#endif // !CONFIG_CB4X4 - - bestsme = cpi->find_fractional_mv_step( - x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[bsize], 0, - cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, -#if CONFIG_EXT_INTER - mask, mask_stride, id, -#endif - pw, ph, 1); - - // Restore the reference frames. - pd->pre[0] = backup_pred; - } else { - (void)block; - bestsme = cpi->find_fractional_mv_step( - x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[bsize], 0, - cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, + bestsme = cpi->find_fractional_mv_step( + x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], 0, + cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, + &dis, &sse, second_pred, #if CONFIG_EXT_INTER - mask, mask_stride, id, + mask, mask_stride, id, #endif - pw, ph, 0); - } + pw, ph, cpi->sf.use_upsampled_references); } // Restore the pointer to the first (possibly scaled) prediction buffer. if (id) xd->plane[plane].pre[0] = ref_yv12[0]; if (bestsme < last_besterr[id]) { - frame_mv[refs[id]].as_mv = *best_mv; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // NOTE: For single ref comp mode, frame_mv stores the first mv and + // frame_comp_mv stores the second mv. + if (!has_second_ref(mbmi) && id) + frame_comp_mv[refs[0]].as_mv = *best_mv; + else +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + frame_mv[refs[id]].as_mv = *best_mv; last_besterr[id] = bestsme; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) last_besterr[!id] = last_besterr[id]; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF } else { break; } @@ -5453,40 +6409,92 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, *rate_mv = 0; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { +#else for (ref = 0; ref < 2; ++ref) { +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF if (scaled_ref_frame[ref]) { // Restore the prediction frame pointers to their unscaled versions. int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[ref] = backup_yv12[ref][i]; } - av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx); + +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) + av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); + else +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx); + +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) { + // NOTE: For single ref comp mode, i.e. !has_second_ref(mbmi) is true, the + // first mv is stored in frame_mv[] and the second mv is stored in + // frame_comp_mv[]. 
+ if (compound_ref0_mode(mbmi->mode) == NEWMV) // SR_NEW_NEWMV + *rate_mv += av1_mv_bit_cost(&frame_mv[refs[0]].as_mv, + &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + assert(compound_ref1_mode(mbmi->mode) == NEWMV); + *rate_mv += av1_mv_bit_cost(&frame_comp_mv[refs[0]].as_mv, + &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + } else { +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF #if CONFIG_EXT_INTER && !CONFIG_CB4X4 - if (bsize >= BLOCK_8X8) + if (bsize >= BLOCK_8X8) #endif // CONFIG_EXT_INTER && !CONFIG_CB4X4 - *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv, - &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv, + &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); #if CONFIG_EXT_INTER && !CONFIG_CB4X4 - else - *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv, - &ref_mv_sub8x8[ref]->as_mv, x->nmvjointcost, - x->mvcost, MV_COST_WEIGHT); + else + *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv, + &ref_mv_sub8x8[ref]->as_mv, x->nmvjointcost, + x->mvcost, MV_COST_WEIGHT); #endif // CONFIG_EXT_INTER && !CONFIG_CB4X4 +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + } + +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) { + if (scaled_ref_frame[0]) { + // Restore the prediction frame pointers to their unscaled versions. + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[1] = backup_yv12[1][i]; + } } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF } -static void estimate_ref_frame_costs(const AV1_COMMON *cm, - const MACROBLOCKD *xd, int segment_id, - unsigned int *ref_costs_single, - unsigned int *ref_costs_comp, - aom_prob *comp_mode_p) { +static void estimate_ref_frame_costs( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, + unsigned int *ref_costs_single, +#if CONFIG_EXT_COMP_REFS + unsigned int (*ref_costs_comp)[TOTAL_REFS_PER_FRAME], +#else + unsigned int *ref_costs_comp, +#endif // CONFIG_EXT_COMP_REFS + aom_prob *comp_mode_p) { int seg_ref_active = segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); if (seg_ref_active) { memset(ref_costs_single, 0, TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_single)); +#if CONFIG_EXT_COMP_REFS + int ref_frame; + for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame) + memset(ref_costs_comp[ref_frame], 0, + TOTAL_REFS_PER_FRAME * sizeof((*ref_costs_comp)[0])); +#else memset(ref_costs_comp, 0, TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_comp)); +#endif // CONFIG_EXT_COMP_REFS + *comp_mode_p = 128; } else { aom_prob intra_inter_p = av1_get_intra_inter_prob(cm, xd); @@ -5541,7 +6549,7 @@ static void estimate_ref_frame_costs(const AV1_COMMON *cm, ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p5, 0); ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p5, 1); -#else +#else // !CONFIG_EXT_REFS ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0); ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 1); ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1); @@ -5570,6 +6578,63 @@ static void estimate_ref_frame_costs(const AV1_COMMON *cm, unsigned int base_cost = av1_cost_bit(intra_inter_p, 1); +#if CONFIG_EXT_COMP_REFS + aom_prob comp_ref_type_p = av1_get_comp_reference_type_prob(cm, xd); + unsigned int 
ref_bicomp_costs[TOTAL_REFS_PER_FRAME] = { 0 }; + + ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] = + ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] = +#if USE_UNI_COMP_REFS + base_cost + av1_cost_bit(comp_ref_type_p, 1); +#else + base_cost; +#endif // USE_UNI_COMP_REFS + ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF_FRAME] = 0; + + ref_bicomp_costs[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0); + ref_bicomp_costs[LAST2_FRAME] += av1_cost_bit(ref_comp_p, 0); + ref_bicomp_costs[LAST3_FRAME] += av1_cost_bit(ref_comp_p, 1); + ref_bicomp_costs[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1); + + ref_bicomp_costs[LAST_FRAME] += av1_cost_bit(ref_comp_p1, 1); + ref_bicomp_costs[LAST2_FRAME] += av1_cost_bit(ref_comp_p1, 0); + + ref_bicomp_costs[LAST3_FRAME] += av1_cost_bit(ref_comp_p2, 0); + ref_bicomp_costs[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1); + + ref_bicomp_costs[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0); + ref_bicomp_costs[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1); + + int ref0; + for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + ref_costs_comp[ref0][BWDREF_FRAME] = + ref_bicomp_costs[ref0] + ref_bicomp_costs[BWDREF_FRAME]; + ref_costs_comp[ref0][ALTREF_FRAME] = + ref_bicomp_costs[ref0] + ref_bicomp_costs[ALTREF_FRAME]; + } + + aom_prob uni_comp_ref_p = av1_get_pred_prob_uni_comp_ref_p(cm, xd); + aom_prob uni_comp_ref_p1 = av1_get_pred_prob_uni_comp_ref_p1(cm, xd); + aom_prob uni_comp_ref_p2 = av1_get_pred_prob_uni_comp_ref_p2(cm, xd); + + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = + base_cost + av1_cost_bit(comp_ref_type_p, 0) + + av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 0); + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = + base_cost + av1_cost_bit(comp_ref_type_p, 0) + + av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 1) + + av1_cost_bit(uni_comp_ref_p2, 0); + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = + base_cost + av1_cost_bit(comp_ref_type_p, 0) + + av1_cost_bit(uni_comp_ref_p, 0) + av1_cost_bit(uni_comp_ref_p1, 1) + + av1_cost_bit(uni_comp_ref_p2, 1); + + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = + base_cost + av1_cost_bit(comp_ref_type_p, 0) + + av1_cost_bit(uni_comp_ref_p, 1); + +#else // !CONFIG_EXT_COMP_REFS + ref_costs_comp[LAST_FRAME] = #if CONFIG_EXT_REFS ref_costs_comp[LAST2_FRAME] = ref_costs_comp[LAST3_FRAME] = @@ -5596,11 +6661,23 @@ static void estimate_ref_frame_costs(const AV1_COMMON *cm, // more bit. 
ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0); ref_costs_comp[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1); -#else +#else // !CONFIG_EXT_REFS ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0); ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1); #endif // CONFIG_EXT_REFS +#endif // CONFIG_EXT_COMP_REFS } else { +#if CONFIG_EXT_COMP_REFS + int ref0; + for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + ref_costs_comp[ref0][BWDREF_FRAME] = 512; + ref_costs_comp[ref0][ALTREF_FRAME] = 512; + } + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512; + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512; + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512; + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512; +#else // !CONFIG_EXT_COMP_REFS ref_costs_comp[LAST_FRAME] = 512; #if CONFIG_EXT_REFS ref_costs_comp[LAST2_FRAME] = 512; @@ -5609,6 +6686,7 @@ static void estimate_ref_frame_costs(const AV1_COMMON *cm, ref_costs_comp[ALTREF_FRAME] = 512; #endif // CONFIG_EXT_REFS ref_costs_comp[GOLDEN_FRAME] = 512; +#endif // CONFIG_EXT_COMP_REFS } } } @@ -5693,8 +6771,13 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, int sadpb = x->sadperbit16; MV mvp_full; #if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SINGLEREF + int ref = + has_second_ref(mbmi) ? mbmi->ref_frame[ref_idx] : mbmi->ref_frame[0]; +#else // !CONFIG_COMPOUND_SINGLEREF int ref = mbmi->ref_frame[ref_idx]; -#else +#endif // CONFIG_COMPOUND_SINGLEREF +#else // !CONFIG_EXT_INTER int ref = mbmi->ref_frame[0]; int ref_idx = 0; #endif // CONFIG_EXT_INTER @@ -5802,7 +6885,7 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv, &(x->best_mv.as_mv), 0); break; - default: assert("Invalid motion mode!\n"); + default: assert(0 && "Invalid motion mode!\n"); } #endif // CONFIG_MOTION_VAR @@ -5820,17 +6903,6 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, x->second_best_mv.as_int != x->best_mv.as_int; const int pw = block_size_wide[bsize]; const int ph = block_size_high[bsize]; - // Use up-sampled reference frames. - struct macroblockd_plane *const pd = &xd->plane[0]; - struct buf_2d backup_pred = pd->pre[ref_idx]; - const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref); - - // Set pred for Y plane - setup_pred_plane( - &pd->pre[ref_idx], bsize, upsampled_ref->y_buffer, - upsampled_ref->y_crop_width, upsampled_ref->y_crop_height, - upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3), NULL, - pd->subsampling_x, pd->subsampling_y); best_mv_var = cpi->find_fractional_mv_step( x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, @@ -5873,9 +6945,6 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, x->best_mv.as_mv = best_mv; } } - - // Restore the reference frames. 
- pd->pre[ref_idx] = backup_pred; } else { cpi->find_fractional_mv_step( x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, @@ -5891,13 +6960,12 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, break; case OBMC_CAUSAL: av1_find_best_obmc_sub_pixel_tree_up( - cpi, x, mi_row, mi_col, &x->best_mv.as_mv, &ref_mv, - cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], - cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], 0, - cpi->sf.use_upsampled_references); + x, &x->best_mv.as_mv, &ref_mv, cm->allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &dis, + &x->pred_sse[ref], 0, cpi->sf.use_upsampled_references); break; - default: assert("Invalid motion mode!\n"); + default: assert(0 && "Invalid motion mode!\n"); } #endif // CONFIG_MOTION_VAR } @@ -5936,15 +7004,12 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, const int ph = block_size_high[bsize]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; +#if CONFIG_COMPOUND_SINGLEREF + const int other_ref = + has_second_ref(mbmi) ? mbmi->ref_frame[!ref_idx] : mbmi->ref_frame[0]; +#else // !CONFIG_COMPOUND_SINGLEREF const int other_ref = mbmi->ref_frame[!ref_idx]; -#if CONFIG_DUAL_FILTER - InterpFilter interp_filter[2] = { - (ref_idx == 0) ? mbmi->interp_filter[2] : mbmi->interp_filter[0], - (ref_idx == 0) ? mbmi->interp_filter[3] : mbmi->interp_filter[1] - }; -#else - const InterpFilter interp_filter = mbmi->interp_filter; -#endif // CONFIG_DUAL_FILTER +#endif // CONFIG_COMPOUND_SINGLEREF struct scale_factors sf; #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION struct macroblockd_plane *const pd = &xd->plane[0]; @@ -5961,8 +7026,12 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, (void)block; #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - // This function should only ever be called for compound modes +// This function should only ever be called for compound modes +#if CONFIG_COMPOUND_SINGLEREF + assert(has_second_ref(mbmi) || is_inter_singleref_comp_mode(mbmi->mode)); +#else // !CONFIG_COMPOUND_SINGLEREF assert(has_second_ref(mbmi)); +#endif // CONFIG_COMPOUND_SINGLEREF struct buf_2d backup_yv12[MAX_MB_PLANE]; const YV12_BUFFER_CONFIG *const scaled_ref_frame = @@ -5991,7 +7060,7 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, struct buf_2d ref_yv12; const int plane = 0; - ConvolveParams conv_params = get_conv_params(0, plane); + ConvolveParams conv_params = get_conv_params(!ref_idx, 0, plane); #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION WarpTypesAllowed warp_types; #if CONFIG_GLOBAL_MOTION @@ -6010,7 +7079,7 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { av1_highbd_build_inter_predictor( ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, - 0, interp_filter, + 0, mbmi->interp_filter, #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION &warp_types, p_col, p_row, #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION @@ -6019,7 +7088,7 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, #endif // CONFIG_HIGHBITDEPTH av1_build_inter_predictor( ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, - &conv_params, interp_filter, + &conv_params, mbmi->interp_filter, #if CONFIG_GLOBAL_MOTION || 
CONFIG_WARPED_MOTION &warp_types, p_col, p_row, plane, !ref_idx, #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION @@ -6038,15 +7107,22 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, // Search for the best mv for one component of a compound, // given that the other component is fixed. -static void compound_single_motion_search( - const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *this_mv, - int mi_row, int mi_col, const uint8_t *second_pred, const uint8_t *mask, - int mask_stride, int *rate_mv, const int block, int ref_idx) { +static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *this_mv, + int mi_row, int mi_col, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + int *rate_mv, int ref_idx) { const int pw = block_size_wide[bsize]; const int ph = block_size_high[bsize]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; +#if CONFIG_COMPOUND_SINGLEREF + const int ref = + has_second_ref(mbmi) ? mbmi->ref_frame[ref_idx] : mbmi->ref_frame[0]; +#else const int ref = mbmi->ref_frame[ref_idx]; +#endif // CONFIG_COMPOUND_SINGLEREF int_mv ref_mv = x->mbmi_ext->ref_mvs[ref][0]; struct macroblockd_plane *const pd = &xd->plane[0]; @@ -6054,9 +7130,16 @@ static void compound_single_motion_search( const YV12_BUFFER_CONFIG *const scaled_ref_frame = av1_get_scaled_ref_frame(cpi, ref); - // Check that this is either an interinter or an interintra block +// Check that this is either an interinter or an interintra block +#if CONFIG_COMPOUND_SINGLEREF + assert(has_second_ref(mbmi) || + // or a single ref comp pred mode + is_inter_singleref_comp_mode(mbmi->mode) || + (ref_idx == 0 && mbmi->ref_frame[1] == INTRA_FRAME)); +#else assert(has_second_ref(mbmi) || (ref_idx == 0 && mbmi->ref_frame[1] == INTRA_FRAME)); +#endif // CONFIG_COMPOUND_SINGLEREF if (scaled_ref_frame) { int i; @@ -6091,7 +7174,12 @@ static void compound_single_motion_search( best_mv->col >>= 3; best_mv->row >>= 3; - av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); +#if CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) + av1_set_mvcost(x, ref, 0, mbmi->ref_mv_idx); + else +#endif // CONFIG_COMPOUND_SINGLEREF + av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); // Small-range full-pixel motion search. bestsme = av1_refining_search_8p_c(x, sadpb, search_range, @@ -6112,43 +7200,11 @@ static void compound_single_motion_search( if (bestsme < INT_MAX) { int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; - if (cpi->sf.use_upsampled_references) { - // Use up-sampled reference frames. 
- struct buf_2d backup_pred = pd->pre[0]; - const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref); - - // Set pred for Y plane - setup_pred_plane(&pd->pre[0], bsize, upsampled_ref->y_buffer, - upsampled_ref->y_crop_width, - upsampled_ref->y_crop_height, upsampled_ref->y_stride, - (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x, - pd->subsampling_y); - -// If bsize < BLOCK_8X8, adjust pred pointer for this block -#if !CONFIG_CB4X4 - if (bsize < BLOCK_8X8) - pd->pre[0].buf = - &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, block, - pd->pre[0].stride)) - << 3]; -#endif // !CONFIG_CB4X4 - - bestsme = cpi->find_fractional_mv_step( - x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL, - x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, - mask_stride, ref_idx, pw, ph, 1); - - // Restore the reference frames. - pd->pre[0] = backup_pred; - } else { - (void)block; - bestsme = cpi->find_fractional_mv_step( - x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL, - x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, - mask_stride, ref_idx, pw, ph, 0); - } + bestsme = cpi->find_fractional_mv_step( + x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL, + x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, mask_stride, + ref_idx, pw, ph, cpi->sf.use_upsampled_references); } // Restore the pointer to the first (possibly scaled) prediction buffer. @@ -6165,7 +7221,12 @@ static void compound_single_motion_search( xd->plane[i].pre[ref_idx] = backup_yv12[i]; } - av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); +#if CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) + av1_set_mvcost(x, ref, 0, mbmi->ref_mv_idx); + else +#endif // CONFIG_COMPOUND_SINGLEREF + av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx); *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } @@ -6174,13 +7235,23 @@ static void compound_single_motion_search( // where the second prediction is also an inter mode. static void compound_single_motion_search_interinter( const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv, +#if CONFIG_COMPOUND_SINGLEREF + int_mv *frame_comp_mv, +#endif // CONFIG_COMPOUND_SINGLEREF int mi_row, int mi_col, const uint8_t *mask, int mask_stride, int *rate_mv, const int block, int ref_idx) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; - // This function should only ever be called for compound modes +// This function should only ever be called for compound modes +#if CONFIG_COMPOUND_SINGLEREF + int is_singleref_comp_mode = + !has_second_ref(mbmi) && is_inter_singleref_comp_mode(mbmi->mode); + assert(has_second_ref(mbmi) || is_singleref_comp_mode); + if (is_singleref_comp_mode && ref_idx) assert(frame_comp_mv); +#else // !CONFIG_COMPOUND_SINGLEREF assert(has_second_ref(mbmi)); +#endif // CONFIG_COMPOUND_SINGLEREF // Prediction buffer from second frame. #if CONFIG_HIGHBITDEPTH @@ -6194,14 +7265,26 @@ static void compound_single_motion_search_interinter( DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]); #endif // CONFIG_HIGHBITDEPTH +#if CONFIG_COMPOUND_SINGLEREF + MV *this_mv = has_second_ref(mbmi) + ? &frame_mv[mbmi->ref_frame[ref_idx]].as_mv + : (ref_idx ? 
&frame_comp_mv[mbmi->ref_frame[0]].as_mv + : &frame_mv[mbmi->ref_frame[0]].as_mv); + const MV *other_mv = + has_second_ref(mbmi) + ? &frame_mv[mbmi->ref_frame[!ref_idx]].as_mv + : (ref_idx ? &frame_mv[mbmi->ref_frame[0]].as_mv + : &frame_comp_mv[mbmi->ref_frame[0]].as_mv); +#else // !CONFIG_COMPOUND_SINGLEREF MV *this_mv = &frame_mv[mbmi->ref_frame[ref_idx]].as_mv; const MV *other_mv = &frame_mv[mbmi->ref_frame[!ref_idx]].as_mv; +#endif // CONFIG_COMPOUND_SINGLEREF build_second_inter_pred(cpi, x, bsize, other_mv, mi_row, mi_col, block, ref_idx, second_pred); compound_single_motion_search(cpi, x, bsize, this_mv, mi_row, mi_col, - second_pred, mask, mask_stride, rate_mv, block, + second_pred, mask, mask_stride, rate_mv, ref_idx); } @@ -6220,21 +7303,40 @@ static void do_masked_motion_search_indexed( mask = av1_get_compound_type_mask(comp_data, sb_type); int_mv frame_mv[TOTAL_REFS_PER_FRAME]; +#if CONFIG_COMPOUND_SINGLEREF + int_mv frame_comp_mv[TOTAL_REFS_PER_FRAME]; +#endif // CONFIG_COMPOUND_SINGLEREF MV_REFERENCE_FRAME rf[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] }; assert(bsize >= BLOCK_8X8 || CONFIG_CB4X4); frame_mv[rf[0]].as_int = cur_mv[0].as_int; - frame_mv[rf[1]].as_int = cur_mv[1].as_int; +#if CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) + frame_comp_mv[rf[0]].as_int = cur_mv[1].as_int; + else +#endif // CONFIG_COMPOUND_SINGLEREF + frame_mv[rf[1]].as_int = cur_mv[1].as_int; if (which == 0 || which == 1) { - compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, mi_row, - mi_col, mask, mask_stride, rate_mv, - 0, which); + compound_single_motion_search_interinter( + cpi, x, bsize, frame_mv, +#if CONFIG_COMPOUND_SINGLEREF + has_second_ref(mbmi) ? NULL : frame_comp_mv, +#endif // CONFIG_COMPOUND_SINGLEREF + mi_row, mi_col, mask, mask_stride, rate_mv, 0, which); } else if (which == 2) { - joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, mask, - mask_stride, rate_mv, 0); + joint_motion_search(cpi, x, bsize, frame_mv, +#if CONFIG_COMPOUND_SINGLEREF + has_second_ref(mbmi) ? 
NULL : frame_comp_mv, +#endif // CONFIG_COMPOUND_SINGLEREF + mi_row, mi_col, NULL, mask, mask_stride, rate_mv, 0); } tmp_mv[0].as_int = frame_mv[rf[0]].as_int; - tmp_mv[1].as_int = frame_mv[rf[1]].as_int; +#if CONFIG_COMPOUND_SINGLEREF + if (!has_second_ref(mbmi)) + tmp_mv[1].as_int = frame_comp_mv[rf[0]].as_int; + else // comp ref +#endif // CONFIG_COMPOUND_SINGLEREF + tmp_mv[1].as_int = frame_mv[rf[1]].as_int; } #endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE #endif // CONFIG_EXT_INTER @@ -6483,7 +7585,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, sse = ROUND_POWER_OF_TWO(sse, bd_round); model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); - rd = RDCOST(x->rdmult, x->rddiv, rate, dist); + rd = RDCOST(x->rdmult, rate, dist); if (rd < best_rd) { *best_wedge_index = wedge_index; @@ -6544,7 +7646,7 @@ static int64_t pick_wedge_fixed_sign( sse = ROUND_POWER_OF_TWO(sse, bd_round); model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); - rd = RDCOST(x->rdmult, x->rddiv, rate, dist); + rd = RDCOST(x->rdmult, rate, dist); if (rd < best_rd) { *best_wedge_index = wedge_index; @@ -6646,7 +7748,7 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi, sse = ROUND_POWER_OF_TWO(sse, bd_round); model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist); - rd0 = RDCOST(x->rdmult, x->rddiv, rate, dist); + rd0 = RDCOST(x->rdmult, rate, dist); if (rd0 < best_rd) { best_mask_type = cur_mask_type; @@ -6729,7 +7831,17 @@ static int interinter_compound_motion_search( #endif // CONFIG_COMPOUND_SEGMENT mbmi->interinter_compound_type }; - if (this_mode == NEW_NEWMV) { +#if CONFIG_COMPOUND_SINGLEREF + // NOTE: Mode is needed to identify the compound mode prediction, regardless + // of comp refs or single ref. + mbmi->mode = this_mode; +#endif // CONFIG_COMPOUND_SINGLEREF + + if (this_mode == NEW_NEWMV +#if CONFIG_COMPOUND_SINGLEREF + || this_mode == SR_NEW_NEWMV +#endif // CONFIG_COMPOUND_SINGLEREF + ) { do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize, mi_row, mi_col, tmp_mv, &tmp_rate_mv, 2); mbmi->mv[0].as_int = tmp_mv[0].as_int; @@ -6738,7 +7850,12 @@ static int interinter_compound_motion_search( do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize, mi_row, mi_col, tmp_mv, &tmp_rate_mv, 0); mbmi->mv[0].as_int = tmp_mv[0].as_int; - } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { + } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV +#if CONFIG_COMPOUND_SINGLEREF + // || this_mode == SR_NEAREST_NEWMV + || this_mode == SR_NEAR_NEWMV || this_mode == SR_ZERO_NEWMV +#endif // CONFIG_COMPOUND_SINGLEREF + ) { do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize, mi_row, mi_col, tmp_mv, &tmp_rate_mv, 1); mbmi->mv[1].as_int = tmp_mv[1].as_int; @@ -6763,7 +7880,7 @@ static int64_t build_and_cost_compound_type( const COMPOUND_TYPE compound_type = mbmi->interinter_compound_type; best_rd_cur = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1); - best_rd_cur += RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv, 0); + best_rd_cur += RDCOST(x->rdmult, rs2 + rate_mv, 0); if (have_newmv_in_inter_mode(this_mode) && use_masked_motion_search(compound_type)) { @@ -6772,7 +7889,7 @@ static int64_t build_and_cost_compound_type( av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb); - rd = RDCOST(x->rdmult, x->rddiv, rs2 + *out_rate_mv + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, rs2 + 
*out_rate_mv + rate_sum, dist_sum); if (rd >= best_rd_cur) { mbmi->mv[0].as_int = cur_mv[0].as_int; mbmi->mv[1].as_int = cur_mv[1].as_int; @@ -6788,7 +7905,7 @@ static int64_t build_and_cost_compound_type( rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, x->rddiv, rs2 + *out_rate_mv + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, rs2 + *out_rate_mv + rate_sum, dist_sum); best_rd_cur = rd; } else { @@ -6801,7 +7918,7 @@ static int64_t build_and_cost_compound_type( rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum); best_rd_cur = rd; } return best_rd_cur; @@ -6832,6 +7949,9 @@ typedef struct { static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, int_mv (*const mode_mv)[TOTAL_REFS_PER_FRAME], +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + int_mv (*const mode_comp_mv)[TOTAL_REFS_PER_FRAME], +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF const int mi_row, const int mi_col, int *const rate_mv, int_mv *const single_newmv, HandleInterModeArgs *const args) { @@ -6844,6 +7964,9 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME); #endif // CONFIG_EXT_INTER int_mv *const frame_mv = mode_mv[this_mode]; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + int_mv *const frame_comp_mv = mode_comp_mv[this_mode]; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] < 0 ? 
0 : mbmi->ref_frame[1] }; int i; @@ -6861,8 +7984,11 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { - joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, NULL, - 0, rate_mv, 0); + joint_motion_search(cpi, x, bsize, frame_mv, +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + NULL, // int_mv *frame_comp_mv +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + mi_row, mi_col, NULL, NULL, 0, rate_mv, 0); } else { *rate_mv = 0; for (i = 0; i < 2; ++i) { @@ -6877,8 +8003,12 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { frame_mv[refs[0]].as_int = mode_mv[compound_ref0_mode(this_mode)][refs[0]].as_int; - compound_single_motion_search_interinter( - cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 1); + compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, +#if CONFIG_COMPOUND_SINGLEREF + NULL, +#endif // CONFIG_COMPOUND_SINGLEREF + mi_row, mi_col, NULL, 0, + rate_mv, 0, 1); } else { av1_set_mvcost(x, refs[1], 1, mbmi->ref_mv_idx); *rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv, @@ -6891,8 +8021,12 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { frame_mv[refs[1]].as_int = mode_mv[compound_ref1_mode(this_mode)][refs[1]].as_int; - compound_single_motion_search_interinter( - cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 0); + compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, +#if CONFIG_COMPOUND_SINGLEREF + NULL, +#endif // CONFIG_COMPOUND_SINGLEREF + mi_row, mi_col, NULL, 0, + rate_mv, 0, 0); } else { av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); *rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv, @@ -6900,7 +8034,7 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } } -#else +#else // !CONFIG_EXT_INTER // Initialize mv using single prediction mode result. frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; @@ -6917,6 +8051,41 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, } } #endif // CONFIG_EXT_INTER +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + } else if (is_inter_singleref_comp_mode(this_mode)) { + // Single ref comp mode + const int mode0 = compound_ref0_mode(this_mode); + + single_newmv[refs[0]].as_int = args->single_newmv[refs[0]].as_int; + frame_mv[refs[0]].as_int = (mode0 == NEWMV) + ? 
single_newmv[refs[0]].as_int + : mode_mv[mode0][refs[0]].as_int; + assert(compound_ref1_mode(this_mode) == NEWMV); + frame_comp_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; + + if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { + if (this_mode == SR_NEW_NEWMV) { + joint_motion_search(cpi, x, bsize, frame_mv, frame_comp_mv, mi_row, + mi_col, NULL, NULL, 0, rate_mv, 0); + } else { + assert( // this_mode == SR_NEAREST_NEWMV || + this_mode == SR_NEAR_NEWMV || this_mode == SR_ZERO_NEWMV); + compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, + frame_comp_mv, mi_row, mi_col, + NULL, 0, rate_mv, 0, 1); + } + } else { + *rate_mv = 0; + av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); + if (mode0 == NEWMV) + *rate_mv += av1_mv_bit_cost(&frame_mv[refs[0]].as_mv, + &mbmi_ext->ref_mvs[refs[0]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + *rate_mv += av1_mv_bit_cost(&frame_comp_mv[refs[0]].as_mv, + &mbmi_ext->ref_mvs[refs[0]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF } else { #if CONFIG_EXT_INTER if (is_comp_interintra_pred) { @@ -6984,7 +8153,7 @@ int64_t interpolation_filter_search( av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist, skip_txfm_sb, skip_sse_sb); - *rd = RDCOST(x->rdmult, x->rddiv, *switchable_rate + tmp_rate, tmp_dist); + *rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist); if (assign_filter == SWITCHABLE) { // do interp_filter search @@ -7020,7 +8189,7 @@ int64_t interpolation_filter_search( av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist, &tmp_skip_sb, &tmp_skip_sse); - tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rs + tmp_rate, tmp_dist); + tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate, tmp_dist); if (tmp_rd < *rd) { *rd = tmp_rd; @@ -7072,12 +8241,10 @@ static int64_t motion_mode_rd( int mi_col, HandleInterModeArgs *const args, const int64_t ref_best_rd, const int *refs, int rate_mv, #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + // only used when WARPED_MOTION is on? 
int_mv *const single_newmv, #if CONFIG_EXT_INTER - int rate2_bmc_nocoeff, MB_MODE_INFO *best_bmc_mbmi, -#if CONFIG_MOTION_VAR - int rate_mv_bmc, -#endif // CONFIG_MOTION_VAR + int rate2_bmc_nocoeff, MB_MODE_INFO *best_bmc_mbmi, int rate_mv_bmc, #endif // CONFIG_EXT_INTER #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION int rs, int *skip_txfm_sb, int64_t *skip_sse_sb, BUFFER_SET *orig_dst) { @@ -7108,7 +8275,13 @@ static int64_t motion_mode_rd( #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION #if CONFIG_WARPED_MOTION +#if WARPED_MOTION_SORT_SAMPLES + int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE]; + int pts_mv0[SAMPLES_ARRAY_SIZE]; + int total_samples; +#else int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; +#endif // WARPED_MOTION_SORT_SAMPLES #endif // CONFIG_WARPED_MOTION #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION @@ -7118,18 +8291,39 @@ static int64_t motion_mode_rd( if (cm->interp_filter == SWITCHABLE) rd_stats->rate += rs; #if CONFIG_WARPED_MOTION aom_clear_system_state(); +#if WARPED_MOTION_SORT_SAMPLES + mbmi->num_proj_ref[0] = + findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0, pts_mv0); + total_samples = mbmi->num_proj_ref[0]; +#else mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); +#endif // WARPED_MOTION_SORT_SAMPLES #if CONFIG_EXT_INTER best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; #endif // CONFIG_EXT_INTER #endif // CONFIG_WARPED_MOTION #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION rate2_nocoeff = rd_stats->rate; +#if CONFIG_NCOBMC_ADAPT_WEIGHT + // We cannot estimate the rd cost for the motion mode NCOBMC_ADAPT_WEIGHT + // right now since it requires mvs from all neighboring blocks. We will + // check if this mode is beneficial after all the mv's in the current + // superblock are selected. 
+ last_motion_mode_allowed = motion_mode_allowed_wrapper(1, +#if CONFIG_GLOBAL_MOTION + 0, xd->global_motion, +#endif // CONFIG_GLOBAL_MOTION + mi); +#else last_motion_mode_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION +#if CONFIG_GLOBAL_MOTION 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION +#endif // CONFIG_GLOBAL_MOTION +#if CONFIG_WARPED_MOTION + xd, +#endif mi); +#endif // CONFIG_NCOBMC_ADAPT_WEIGHT base_mbmi = *mbmi; #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION @@ -7155,7 +8349,11 @@ static int64_t motion_mode_rd( *mbmi = *best_bmc_mbmi; mbmi->motion_mode = OBMC_CAUSAL; #endif // CONFIG_EXT_INTER - if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) { + if (!is_comp_pred && +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + !is_inter_singleref_comp_mode(this_mode) && +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + have_newmv_in_inter_mode(this_mode)) { int tmp_rate_mv = 0; single_motion_search(cpi, x, bsize, mi_row, mi_col, @@ -7195,6 +8393,9 @@ static int64_t motion_mode_rd( #if CONFIG_WARPED_MOTION if (mbmi->motion_mode == WARPED_CAUSAL) { +#if WARPED_MOTION_SORT_SAMPLES + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; +#endif // WARPED_MOTION_SORT_SAMPLES #if CONFIG_EXT_INTER *mbmi = *best_bmc_mbmi; mbmi->motion_mode = WARPED_CAUSAL; @@ -7210,6 +8411,19 @@ static int64_t motion_mode_rd( : cm->interp_filter; #endif // CONFIG_DUAL_FILTER +#if WARPED_MOTION_SORT_SAMPLES + memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); + memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); + // Rank the samples by motion vector difference + if (mbmi->num_proj_ref[0] > 1) { + mbmi->num_proj_ref[0] = sortSamples(pts_mv0, &mbmi->mv[0].as_mv, pts, + pts_inref, mbmi->num_proj_ref[0]); +#if CONFIG_EXT_INTER + best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; +#endif // CONFIG_EXT_INTER + } +#endif // WARPED_MOTION_SORT_SAMPLES + if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize, mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, &mbmi->wm_params[0], mi_row, mi_col)) { @@ -7218,9 +8432,16 @@ static int64_t motion_mode_rd( int tmp_rate_mv = 0; const int_mv mv0 = mbmi->mv[0]; WarpedMotionParams wm_params0 = mbmi->wm_params[0]; +#if WARPED_MOTION_SORT_SAMPLES + int num_proj_ref0 = mbmi->num_proj_ref[0]; + // Refine MV in a small range. + av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0, + pts_mv0, total_samples); +#else // Refine MV in a small range. av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts, pts_inref); +#endif // WARPED_MOTION_SORT_SAMPLES // Keep the refined MV and WM parameters. if (mv0.as_int != mbmi->mv[0].as_int) { @@ -7241,6 +8462,9 @@ static int64_t motion_mode_rd( tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } #if CONFIG_EXT_INTER +#if WARPED_MOTION_SORT_SAMPLES + best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; +#endif // WARPED_MOTION_SORT_SAMPLES tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv; #else tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; @@ -7255,6 +8479,9 @@ static int64_t motion_mode_rd( // Restore the old MV and WM parameters. 
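The WARPED_MOTION_SORT_SAMPLES hunks above rank the spatial samples collected by findSamples() according to how closely each neighbouring motion vector agrees with the current block's MV (sortSamples() with pts_mv0), and only the best-ranked samples are passed on to find_projection(). The fragment below is a minimal sketch of that ranking idea, not the actual sortSamples() implementation: the sample struct, the L1 distance metric and the cap of eight kept samples are assumptions made purely for illustration.

    #include <stdlib.h>

    /* Hypothetical sample record; the real code keeps parallel int arrays
     * (pts, pts_inref, pts_mv) rather than a struct. */
    typedef struct {
      int x, y;           /* sample position */
      int mv_row, mv_col; /* motion vector of the neighbour that produced it */
    } WarpSample;

    static int cur_mv_row, cur_mv_col; /* current block MV, set before sorting */

    static int cmp_by_mv_diff(const void *a, const void *b) {
      const WarpSample *sa = (const WarpSample *)a, *sb = (const WarpSample *)b;
      const int da = abs(sa->mv_row - cur_mv_row) + abs(sa->mv_col - cur_mv_col);
      const int db = abs(sb->mv_row - cur_mv_row) + abs(sb->mv_col - cur_mv_col);
      return da - db;
    }

    /* Sort samples so the ones most consistent with the current MV come
     * first, and return how many to feed to the warp-model fit (at most 8
     * in this sketch). */
    static int rank_warp_samples(WarpSample *samples, int n,
                                 int mv_row, int mv_col) {
      cur_mv_row = mv_row;
      cur_mv_col = mv_col;
      qsort(samples, (size_t)n, sizeof(*samples), cmp_by_mv_diff);
      return n < 8 ? n : 8;
    }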
mbmi->mv[0] = mv0; mbmi->wm_params[0] = wm_params0; +#if WARPED_MOTION_SORT_SAMPLES + mbmi->num_proj_ref[0] = num_proj_ref0; +#endif // WARPED_MOTION_SORT_SAMPLES } } @@ -7328,8 +8555,8 @@ static int64_t motion_mode_rd( av1_merge_rd_stats(rd_stats, rd_stats_y); - rdcosty = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist); - rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse)); + rdcosty = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, 0, rd_stats->sse)); /* clang-format off */ #if CONFIG_VAR_TX is_cost_valid_uv = @@ -7365,12 +8592,11 @@ static int64_t motion_mode_rd( mbmi->skip = 0; // here mbmi->skip temporarily plays a role as what this_skip2 does } else if (!xd->lossless[mbmi->segment_id] && - (RDCOST(x->rdmult, x->rddiv, + (RDCOST(x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + av1_cost_bit(av1_get_skip_prob(cm, xd), 0), rd_stats->dist) >= - RDCOST(x->rdmult, x->rddiv, - av1_cost_bit(av1_get_skip_prob(cm, xd), 1), + RDCOST(x->rdmult, av1_cost_bit(av1_get_skip_prob(cm, xd), 1), rd_stats->sse))) { rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); @@ -7427,7 +8653,7 @@ static int64_t motion_mode_rd( #endif // CONFIG_GLOBAL_MOTION #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - tmp_rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist); + tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); if (mbmi->motion_mode == SIMPLE_TRANSLATION || (tmp_rd < best_rd)) { best_mbmi = *mbmi; best_rd = tmp_rd; @@ -7466,11 +8692,17 @@ static int64_t motion_mode_rd( return 0; } -static int64_t handle_inter_mode( - const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, - int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], int mi_row, - int mi_col, HandleInterModeArgs *args, const int64_t ref_best_rd) { +static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, RD_STATS *rd_stats, + RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, + int *disable_skip, + int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + int_mv (*mode_comp_mv)[TOTAL_REFS_PER_FRAME], +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + int mi_row, int mi_col, + HandleInterModeArgs *args, + const int64_t ref_best_rd) { const AV1_COMMON *cm = &cpi->common; (void)cm; MACROBLOCKD *xd = &x->e_mbd; @@ -7479,7 +8711,14 @@ static int64_t handle_inter_mode( MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const int is_comp_pred = has_second_ref(mbmi); const int this_mode = mbmi->mode; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + const int is_singleref_comp_mode = is_inter_singleref_comp_mode(this_mode); +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF int_mv *frame_mv = mode_mv[this_mode]; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // The comp mv for the compound mode in single ref + int_mv *frame_comp_mv = mode_comp_mv[this_mode]; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF int i; int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 
0 : mbmi->ref_frame[1]) }; @@ -7487,7 +8726,7 @@ static int64_t handle_inter_mode( int rate_mv = 0; #if CONFIG_EXT_INTER int pred_exists = 1; -#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT +#if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA const int bw = block_size_wide[bsize]; #endif // ONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT int_mv single_newmv[TOTAL_REFS_PER_FRAME]; @@ -7511,9 +8750,7 @@ static int64_t handle_inter_mode( #if CONFIG_EXT_INTER int rate2_bmc_nocoeff; MB_MODE_INFO best_bmc_mbmi; -#if CONFIG_MOTION_VAR int rate_mv_bmc; -#endif // CONFIG_MOTION_VAR #endif // CONFIG_EXT_INTER #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION int64_t rd = INT64_MAX; @@ -7523,6 +8760,11 @@ static int64_t handle_inter_mode( int skip_txfm_sb = 0; int64_t skip_sse_sb = INT64_MAX; int16_t mode_ctx; +#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_MOTION_VAR + // dummy fillers + mbmi->ncobmc_mode[0] = NO_OVERLAP; + mbmi->ncobmc_mode[1] = NO_OVERLAP; +#endif #if CONFIG_EXT_INTER #if CONFIG_INTERINTRA @@ -7546,7 +8788,11 @@ static int64_t handle_inter_mode( #endif // CONFIG_EXT_INTER #if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SINGLEREF + if (is_comp_pred || is_singleref_comp_mode) +#else // !CONFIG_COMPOUND_SINGLEREF if (is_comp_pred) +#endif // CONFIG_COMPOUND_SINGLEREF mode_ctx = mbmi_ext->compound_mode_context[refs[0]]; else #endif // CONFIG_EXT_INTER @@ -7572,12 +8818,22 @@ static int64_t handle_inter_mode( if (frame_mv[refs[0]].as_int == INVALID_MV || frame_mv[refs[1]].as_int == INVALID_MV) return INT64_MAX; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + } else if (is_singleref_comp_mode) { + if (frame_mv[refs[0]].as_int == INVALID_MV || + frame_comp_mv[refs[0]].as_int == INVALID_MV) + return INT64_MAX; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF } mbmi->motion_mode = SIMPLE_TRANSLATION; if (have_newmv_in_inter_mode(this_mode)) { - const int64_t ret_val = handle_newmv(cpi, x, bsize, mode_mv, mi_row, mi_col, - &rate_mv, single_newmv, args); + const int64_t ret_val = + handle_newmv(cpi, x, bsize, mode_mv, +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + mode_comp_mv, +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + mi_row, mi_col, &rate_mv, single_newmv, args); if (ret_val != 0) return ret_val; else @@ -7591,6 +8847,16 @@ static int64_t handle_inter_mode( mbmi->mv[i].as_int = cur_mv[i].as_int; } +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (!is_comp_pred && is_singleref_comp_mode) { + cur_mv[1] = frame_comp_mv[refs[0]]; + // Clip "next_nearest" so that it does not extend to far out of image + if (this_mode != NEWMV) clamp_mv2(&cur_mv[1].as_mv, xd); + if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX; + mbmi->mv[1].as_int = cur_mv[1].as_int; + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + #if CONFIG_EXT_INTER if (this_mode == NEAREST_NEARESTMV) #else @@ -7614,7 +8880,13 @@ static int64_t handle_inter_mode( #if CONFIG_EXT_INTER if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) { - if (this_mode == NEAREST_NEWMV) { +#if CONFIG_COMPOUND_SINGLEREF + if (this_mode == NEAREST_NEWMV || // this_mode == SR_NEAREST_NEWMV || + this_mode == SR_NEAREST_NEARMV) +#else // !CONFIG_COMPOUND_SINGLEREF + if (this_mode == NEAREST_NEWMV) +#endif // CONFIG_COMPOUND_SINGLEREF + { cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv); @@ -7635,7 +8907,11 @@ static int64_t handle_inter_mode( if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { int ref_mv_idx = 
mbmi->ref_mv_idx + 1; - if (this_mode == NEAR_NEWMV || this_mode == NEAR_NEARMV) { + if (this_mode == NEAR_NEWMV || +#if CONFIG_COMPOUND_SINGLEREF + this_mode == SR_NEAR_NEWMV || +#endif // CONFIG_COMPOUND_SINGLEREF + this_mode == NEAR_NEARMV) { cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv); @@ -7644,8 +8920,17 @@ static int64_t handle_inter_mode( mbmi->mv[0].as_int = cur_mv[0].as_int; } - if (this_mode == NEW_NEARMV || this_mode == NEAR_NEARMV) { - cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; + if (this_mode == NEW_NEARMV || +#if CONFIG_COMPOUND_SINGLEREF + this_mode == SR_NEAREST_NEARMV || +#endif // CONFIG_COMPOUND_SINGLEREF + this_mode == NEAR_NEARMV) { +#if CONFIG_COMPOUND_SINGLEREF + if (this_mode == SR_NEAREST_NEARMV) + cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; + else +#endif // CONFIG_COMPOUND_SINGLEREF + cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv); clamp_mv2(&cur_mv[1].as_mv, xd); @@ -7653,7 +8938,7 @@ static int64_t handle_inter_mode( mbmi->mv[1].as_int = cur_mv[1].as_int; } } -#else +#else // !CONFIG_EXT_INTER if (this_mode == NEARMV && is_comp_pred) { uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { @@ -7706,7 +8991,7 @@ static int64_t handle_inter_mode( rd_stats->rate += cost_mv_ref(cpi, this_mode, mode_ctx); } - if (RDCOST(x->rdmult, x->rddiv, rd_stats->rate, 0) > ref_best_rd && + if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && #if CONFIG_EXT_INTER mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV #else @@ -7725,13 +9010,16 @@ static int64_t handle_inter_mode( best_bmc_mbmi = *mbmi; rate2_bmc_nocoeff = rd_stats->rate; if (cm->interp_filter == SWITCHABLE) rate2_bmc_nocoeff += rs; -#if CONFIG_MOTION_VAR rate_mv_bmc = rate_mv; -#endif // CONFIG_MOTION_VAR #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION #if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT - if (is_comp_pred) { +#if CONFIG_COMPOUND_SINGLEREF + if (is_comp_pred || is_singleref_comp_mode) +#else + if (is_comp_pred) +#endif // CONFIG_COMPOUND_SINGLEREF + { int rate_sum, rs2; int64_t dist_sum; int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX; @@ -7741,8 +9029,8 @@ static int64_t handle_inter_mode( int tmp_skip_txfm_sb; int64_t tmp_skip_sse_sb; int compound_type_cost[COMPOUND_TYPES]; - uint8_t pred0[2 * MAX_SB_SQUARE]; - uint8_t pred1[2 * MAX_SB_SQUARE]; + DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]); uint8_t *preds0[1] = { pred0 }; uint8_t *preds1[1] = { pred1 }; int strides[1] = { bw }; @@ -7761,6 +9049,17 @@ static int64_t handle_inter_mode( best_compound_data.seg_mask = tmp_mask_buf; #endif // CONFIG_COMPOUND_SEGMENT +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // TODO(zoeliu): To further check whether the following setups are needed. + // Single ref compound mode: Prepare the 2nd ref frame predictor the same as + // the 1st one. 
+ if (!is_comp_pred && is_singleref_comp_mode) { + xd->block_refs[1] = xd->block_refs[0]; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[1] = xd->plane[i].pre[0]; + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (masked_compound_used) { av1_cost_tokens(compound_type_cost, cm->fc->compound_type_prob[bsize], av1_compound_type_tree); @@ -7773,7 +9072,7 @@ static int64_t handle_inter_mode( for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) { if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break; - if (!is_interinter_compound_used(cur_type, bsize)) break; + if (!is_interinter_compound_used(cur_type, bsize)) continue; tmp_rate_mv = rate_mv; best_rd_cur = INT64_MAX; mbmi->interinter_compound_type = cur_type; @@ -7792,8 +9091,7 @@ static int64_t handle_inter_mode( &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) - best_rd_cur = - RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv + rate_sum, dist_sum); + best_rd_cur = RDCOST(x->rdmult, rs2 + rate_mv + rate_sum, dist_sum); best_rd_compound = best_rd_cur; break; #if CONFIG_WEDGE @@ -7923,8 +9221,7 @@ static int64_t handle_inter_mode( av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb); - rd = - RDCOST(x->rdmult, x->rddiv, tmp_rate_mv + rate_sum + rmode, dist_sum); + rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); if (rd < best_interintra_rd) { best_interintra_rd = rd; best_interintra_mode = mbmi->interintra_mode; @@ -7939,7 +9236,7 @@ static int64_t handle_inter_mode( rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, x->rddiv, rate_mv + rmode + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, rate_mv + rmode + rate_sum, dist_sum); best_interintra_rd = rd; if (ref_best_rd < INT64_MAX && best_interintra_rd > 2 * ref_best_rd) { @@ -7953,8 +9250,7 @@ static int64_t handle_inter_mode( int_mv tmp_mv; int rwedge = av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 0); if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, x->rddiv, rmode + rate_mv + rwedge + rate_sum, - dist_sum); + rd = RDCOST(x->rdmult, rmode + rate_mv + rwedge + rate_sum, dist_sum); best_interintra_rd_nowedge = best_interintra_rd; // Disable wedge search if source variance is small @@ -7968,7 +9264,7 @@ static int64_t handle_inter_mode( pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); best_interintra_rd_wedge += - RDCOST(x->rdmult, x->rddiv, rmode + rate_mv + rwedge, 0); + RDCOST(x->rdmult, rmode + rate_mv + rwedge, 0); // Refine motion vector. 
if (have_newmv_in_inter_mode(this_mode)) { // get negative of mask @@ -7977,14 +9273,14 @@ static int64_t handle_inter_mode( tmp_mv.as_int = x->mbmi_ext->ref_mvs[refs[0]][0].as_int; compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row, mi_col, intrapred, mask, bw, - &tmp_rate_mv, 0, 0); + &tmp_rate_mv, 0); mbmi->mv[0].as_int = tmp_mv.as_int; av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb); - rd = RDCOST(x->rdmult, x->rddiv, - rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, + dist_sum); if (rd >= best_interintra_rd_wedge) { tmp_mv.as_int = cur_mv[0].as_int; tmp_rate_mv = rate_mv; @@ -8000,8 +9296,8 @@ static int64_t handle_inter_mode( estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, x->rddiv, - rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum); + rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, + dist_sum); best_interintra_rd_wedge = rd; if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { mbmi->use_wedge_interintra = 1; @@ -8042,7 +9338,7 @@ static int64_t handle_inter_mode( av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist, &skip_txfm_sb, &skip_sse_sb); - rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); + rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist); } #endif // CONFIG_EXT_INTER @@ -8097,10 +9393,7 @@ static int64_t handle_inter_mode( #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION single_newmv, #if CONFIG_EXT_INTER - rate2_bmc_nocoeff, &best_bmc_mbmi, -#if CONFIG_MOTION_VAR - rate_mv_bmc, -#endif // CONFIG_MOTION_VAR + rate2_bmc_nocoeff, &best_bmc_mbmi, rate_mv_bmc, #endif // CONFIG_EXT_INTER #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION rs, &skip_txfm_sb, &skip_sse_sb, &orig_dst); @@ -8118,11 +9411,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; const TileInfo *tile = &xd->tile; -#if CONFIG_EC_ADAPT FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; -#else - FRAME_CONTEXT *const ec_ctx = cm->fc; -#endif // CONFIG_EC_ADAPT MODE_INFO *const mi = xd->mi[0]; const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE); const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE); @@ -8222,7 +9511,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, #endif mbmi->use_intrabc = 1; mbmi->mode = DC_PRED; - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; mbmi->mv[0].as_mv = dv; #if CONFIG_DUAL_FILTER for (int idx = 0; idx < 4; ++idx) mbmi->interp_filter[idx] = BILINEAR; @@ -8233,12 +9522,12 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, x->skip = 0; av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); + assert(x->mvcost == x->mv_cost_stack[0]); + // TODO(aconverse@google.com): The full motion field defining discount + // in MV_COST_WEIGHT is too large. Explore other values. 
int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, x->nmvjointcost, - x->mvcost, MV_COST_WEIGHT); - const PREDICTION_MODE A = av1_above_block_mode(mi, xd->above_mi, 0); - const PREDICTION_MODE L = av1_left_block_mode(mi, xd->left_mi, 0); - const int rate_mode = cpi->y_mode_costs[A][L][DC_PRED] + - av1_cost_bit(ec_ctx->intrabc_prob, 1); + x->mvcost, MV_COST_WEIGHT_SUB); + const int rate_mode = av1_cost_bit(ec_ctx->intrabc_prob, 1); RD_STATS rd_stats, rd_stats_uv; av1_subtract_plane(x, bsize, 0); @@ -8267,8 +9556,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, rdc_noskip.rate = rate_mode + rate_mv + rd_stats.rate + av1_cost_bit(skip_prob, 0); rdc_noskip.dist = rd_stats.dist; - rdc_noskip.rdcost = - RDCOST(x->rdmult, x->rddiv, rdc_noskip.rate, rdc_noskip.dist); + rdc_noskip.rdcost = RDCOST(x->rdmult, rdc_noskip.rate, rdc_noskip.dist); if (rdc_noskip.rdcost < best_rd) { best_rd = rdc_noskip.rdcost; best_mbmi = *mbmi; @@ -8282,7 +9570,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, av1_init_rd_stats(&rdc_skip); rdc_skip.rate = rate_mode + rate_mv + av1_cost_bit(skip_prob, 1); rdc_skip.dist = rd_stats.sse; - rdc_skip.rdcost = RDCOST(x->rdmult, x->rddiv, rdc_skip.rate, rdc_skip.dist); + rdc_skip.rdcost = RDCOST(x->rdmult, rdc_skip.rate, rdc_skip.dist); if (rdc_skip.rdcost < best_rd) { best_rd = rdc_skip.rdcost; best_mbmi = *mbmi; @@ -8302,6 +9590,7 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; struct macroblockd_plane *const pd = xd->plane; int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; int y_skip = 0, uv_skip = 0; @@ -8310,11 +9599,11 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, const int unify_bsize = CONFIG_CB4X4; ctx->skip = 0; - xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME; - xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; #if CONFIG_INTRABC - xd->mi[0]->mbmi.use_intrabc = 0; - xd->mi[0]->mbmi.mv[0].as_int = 0; + mbmi->use_intrabc = 0; + mbmi->mv[0].as_int = 0; #endif // CONFIG_INTRABC const int64_t intra_yrd = @@ -8325,9 +9614,29 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, &dist_y, &y_skip, best_rd); if (intra_yrd < best_rd) { - max_uv_tx_size = uv_txsize_lookup[bsize][xd->mi[0]->mbmi.tx_size] - [pd[1].subsampling_x][pd[1].subsampling_y]; - init_sbuv_mode(&xd->mi[0]->mbmi); +#if CONFIG_CFL + // Perform one extra txfm_rd_in_plane() call, this time with the best value + // so we can store reconstructed luma values + RD_STATS this_rd_stats; + +#if CONFIG_CB4X4 + // Don't store the luma value if no chroma is associated. + // Don't worry, we will store this reconstructed luma in the following + // encode dry-run the chroma plane will never know. 
+ x->cfl_store_y = !x->skip_chroma_rd; +#else + x->cfl_store_y = 1; +#endif + + txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y, + mbmi->sb_type, mbmi->tx_size, + cpi->sf.use_fast_coef_costing); + + x->cfl_store_y = 0; +#endif + max_uv_tx_size = uv_txsize_lookup[bsize][mbmi->tx_size][pd[1].subsampling_x] + [pd[1].subsampling_y]; + init_sbuv_mode(mbmi); #if CONFIG_CB4X4 if (!x->skip_chroma_rd) rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv, @@ -8346,8 +9655,8 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, rate_y + rate_uv + av1_cost_bit(av1_get_skip_prob(cm, xd), 0); rd_cost->dist = dist_y + dist_uv; } - rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 + rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 rd_cost->dist_y = dist_y; #endif } else { @@ -8360,7 +9669,7 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, if (rd_pick_intrabc_mode_sb(cpi, x, rd_cost, bsize, best_rd) < best_rd) { ctx->skip = x->skip; // FIXME where is the proper place to set this?! assert(rd_cost->rate != INT_MAX); - rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); + rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); } #endif if (rd_cost->rate == INT_MAX) return; @@ -8494,7 +9803,8 @@ static void pick_filter_intra_interframe( const AV1_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_uv_intra, int *rate_uv_tokenonly, int64_t *dist_uv, int *skip_uv, - PREDICTION_MODE *mode_uv, FILTER_INTRA_MODE_INFO *filter_intra_mode_info_uv, + UV_PREDICTION_MODE *mode_uv, + FILTER_INTRA_MODE_INFO *filter_intra_mode_info_uv, #if CONFIG_EXT_INTRA int8_t *uv_angle_delta, #endif // CONFIG_EXT_INTRA @@ -8531,7 +9841,7 @@ static void pick_filter_intra_interframe( // TODO(huisu): use skip_mask for further speedup. 
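The CONFIG_CFL hunk above makes one extra txfm_rd_in_plane() pass over luma with x->cfl_store_y set, so that reconstructed luma pixels are available when the chroma RD search later evaluates chroma-from-luma prediction. As a rough illustration of why those pixels are needed: a CfL predictor combines the chroma DC prediction with an alpha-scaled copy of the luma AC component. The sketch below assumes an 8-bit pipeline and a Q3 alpha; the names and rounding are illustrative rather than the codec's exact routine.

    #include <stdint.h>

    /* Hypothetical CfL prediction sketch: chroma = dc_pred + alpha * luma_ac,
     * where luma_ac holds the (subsampled) reconstructed luma minus its
     * average, laid out with a stride equal to the block width. */
    static void cfl_predict_block(const int16_t *luma_ac, uint8_t *chroma_dst,
                                  int dst_stride, int width, int height,
                                  int alpha_q3, int dc_pred) {
      for (int j = 0; j < height; ++j) {
        for (int i = 0; i < width; ++i) {
          const int scaled_ac = (alpha_q3 * luma_ac[j * width + i]) >> 3;
          int v = dc_pred + scaled_ac;
          if (v < 0) v = 0;       /* clamp to the 8-bit pixel range */
          if (v > 255) v = 255;
          chroma_dst[j * dst_stride + i] = (uint8_t)v;
        }
      }
    }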
(void)skip_mask; mbmi->mode = DC_PRED; - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; if (!rd_pick_filter_intra_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y, @@ -8600,7 +9910,8 @@ static void pick_filter_intra_interframe( rate2 += write_uniform_cost( FILTER_INTRA_MODES, mbmi->filter_intra_mode_info.filter_intra_mode[0]); #if CONFIG_EXT_INTRA - if (av1_is_directional_mode(mbmi->uv_mode, bsize)) { + if (av1_is_directional_mode(mbmi->uv_mode, bsize) && + av1_use_angle_delta(bsize)) { rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, MAX_ANGLE_DELTA + mbmi->angle_delta[1]); } @@ -8628,7 +9939,7 @@ static void pick_filter_intra_interframe( } else { rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); } - this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + this_rd = RDCOST(x->rdmult, rate2, distortion2); if (this_rd < *best_intra_rd) { *best_intra_rd = this_rd; @@ -8693,6 +10004,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, unsigned char segment_id = mbmi->segment_id; int comp_pred, i, k; int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + int_mv frame_comp_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]; int_mv single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } }; #if CONFIG_EXT_INTER @@ -8722,7 +10036,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, int best_mode_skippable = 0; int midx, best_mode_index = -1; unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME]; +#if CONFIG_EXT_COMP_REFS + unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME][TOTAL_REFS_PER_FRAME]; +#else unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME]; +#endif // CONFIG_EXT_COMP_REFS aom_prob comp_mode_p; int64_t best_intra_rd = INT64_MAX; unsigned int best_pred_sse = UINT_MAX; @@ -8730,7 +10048,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, int rate_uv_intra[TX_SIZES_ALL], rate_uv_tokenonly[TX_SIZES_ALL]; int64_t dist_uvs[TX_SIZES_ALL]; int skip_uvs[TX_SIZES_ALL]; - PREDICTION_MODE mode_uv[TX_SIZES_ALL]; + UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL]; #if CONFIG_PALETTE PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL]; #endif // CONFIG_PALETTE @@ -8747,7 +10065,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]]; int best_skip2 = 0; - uint8_t ref_frame_skip_mask[2] = { 0 }; + uint16_t ref_frame_skip_mask[2] = { 0 }; uint32_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 }; #if CONFIG_EXT_INTER && CONFIG_INTERINTRA MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME; @@ -8850,6 +10168,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, *returnrate_nocoef = INT_MAX; #endif // CONFIG_SUPERTX +#if CONFIG_SPEED_REFS + memset(x->mbmi_ext->ref_mvs, 0, sizeof(x->mbmi_ext->ref_mvs)); +#endif // CONFIG_SPEED_REFS + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; x->mbmi_ext->mode_context[ref_frame] = 0; @@ -8873,6 +10195,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif // CONFIG_GLOBAL_MOTION #if CONFIG_EXT_INTER frame_mv[NEW_NEWMV][ref_frame].as_int = INVALID_MV; +#if CONFIG_COMPOUND_SINGLEREF + 
frame_mv[SR_NEW_NEWMV][ref_frame].as_int = INVALID_MV; + frame_comp_mv[SR_NEW_NEWMV][ref_frame].as_int = INVALID_MV; +#endif // CONFIG_COMPOUND_SINGLEREF #if CONFIG_GLOBAL_MOTION frame_mv[ZERO_ZEROMV][ref_frame].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame], @@ -8934,6 +10260,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, // Skip checking missing references in both single and compound reference // modes. Note that a mode will be skipped iff both reference frames // are masked out. +#if CONFIG_EXT_COMP_REFS + ref_frame_skip_mask[0] |= (1 << ref_frame); + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; +#else // !CONFIG_EXT_COMP_REFS #if CONFIG_EXT_REFS if (ref_frame == BWDREF_FRAME || ref_frame == ALTREF_FRAME) { ref_frame_skip_mask[0] |= (1 << ref_frame); @@ -8945,6 +10275,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #if CONFIG_EXT_REFS } #endif // CONFIG_EXT_REFS +#endif // CONFIG_EXT_COMP_REFS } else { for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { // Skip fixed mv modes for poor references @@ -9000,6 +10331,12 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARESTMV); if (frame_mv[NEAR_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int) mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARMV); +#if CONFIG_COMPOUND_SINGLEREF + if (frame_mv[SR_NEAREST_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int || + frame_comp_mv[SR_NEAREST_NEARMV][ALTREF_FRAME].as_int != + zeromv.as_int) + mode_skip_mask[ALTREF_FRAME] |= (1 << SR_NEAREST_NEARMV); +#endif // CONFIG_COMPOUND_SINGLEREF #endif // CONFIG_EXT_INTER } } @@ -9077,7 +10414,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, int compmode_cost = 0; int rate2 = 0, rate_y = 0, rate_uv = 0; int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 int64_t distortion2_y = 0; int64_t total_sse_y = INT64_MAX; #endif @@ -9106,6 +10443,13 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int; frame_mv[this_mode][second_ref_frame].as_int = frame_mv[compound_ref1_mode(this_mode)][second_ref_frame].as_int; +#if CONFIG_COMPOUND_SINGLEREF + } else if (is_inter_singleref_comp_mode(this_mode)) { + frame_mv[this_mode][ref_frame].as_int = + frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int; + frame_comp_mv[this_mode][ref_frame].as_int = + frame_mv[compound_ref1_mode(this_mode)][ref_frame].as_int; +#endif // CONFIG_COMPOUND_SINGLEREF } #endif // CONFIG_EXT_INTER @@ -9154,6 +10498,34 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, (ref_frame_skip_mask[1] & (1 << AOMMAX(0, second_ref_frame)))) continue; +#if CONFIG_EXT_COMP_REFS +// TODO(zoeliu): Following toggle between #if 0/1 and the bug will manifest +// itself. 
+#if 0 + if (!(cpi->ref_frame_flags & flag_list[ref_frame]) || + (second_ref_frame > INTRA_FRAME && + (!(cpi->ref_frame_flags & flag_list[second_ref_frame])))) + printf("Frame=%d, bsize=%d, (mi_row,mi_col)=(%d,%d), ref_frame=%d, " + "second_ref_frame=%d\n", cm->current_video_frame, bsize, mi_row, + mi_col, ref_frame, second_ref_frame); + + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + if (second_ref_frame > INTRA_FRAME && + (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))) + continue; +#endif // 0 + +#if !USE_UNI_COMP_REFS + // NOTE(zoeliu): Temporarily disable uni-directional comp refs + if (second_ref_frame > INTRA_FRAME) { + if (!((ref_frame < BWDREF_FRAME) ^ (second_ref_frame < BWDREF_FRAME))) + continue; + } + assert(second_ref_frame <= INTRA_FRAME || + ((ref_frame < BWDREF_FRAME) ^ (second_ref_frame < BWDREF_FRAME))); +#endif // !USE_UNI_COMP_REFS +#endif // CONFIG_EXT_COMP_REFS + if (mode_skip_mask[ref_frame] & (1 << this_mode)) continue; // Test best rd so far against threshold for trying this mode. @@ -9239,7 +10611,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, } mbmi->mode = this_mode; - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = ref_frame; mbmi->ref_frame[1] = second_ref_frame; #if CONFIG_PALETTE @@ -9267,6 +10639,15 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; } +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // Single ref compound mode + if (!comp_pred && is_inter_singleref_comp_mode(mbmi->mode)) { + xd->block_refs[1] = xd->block_refs[0]; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[1] = xd->plane[i].pre[0]; + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + #if CONFIG_EXT_INTER && CONFIG_INTERINTRA mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); #endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA @@ -9277,7 +10658,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, struct macroblockd_plane *const pd = &xd->plane[1]; #if CONFIG_EXT_INTRA is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize); - if (is_directional_mode) { + if (is_directional_mode && av1_use_angle_delta(bsize)) { int rate_dummy; int64_t model_rd = INT64_MAX; if (!angle_stats_ready) { @@ -9390,10 +10771,13 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (av1_is_intra_filter_switchable(p_angle)) rate2 += cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter]; #endif // CONFIG_INTRA_INTERP - rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, - MAX_ANGLE_DELTA + mbmi->angle_delta[0]); + if (av1_use_angle_delta(bsize)) { + rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, + MAX_ANGLE_DELTA + mbmi->angle_delta[0]); + } } - if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) { + if (av1_is_directional_mode(mbmi->uv_mode, bsize) && + av1_use_angle_delta(bsize)) { rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, MAX_ANGLE_DELTA + mbmi->angle_delta[1]); } @@ -9409,7 +10793,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mbmi->filter_intra_mode_info.filter_intra_mode[0]); } } - if (mbmi->uv_mode == DC_PRED) { + if (mbmi->uv_mode == UV_DC_PRED) { rate2 += av1_cost_bit(cpi->common.fc->filter_intra_probs[1], mbmi->filter_intra_mode_info.use_filter_intra_mode[1]); @@ -9422,7 +10806,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (mbmi->mode 
!= DC_PRED && mbmi->mode != TM_PRED) rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 if (bsize < BLOCK_8X8) distortion2_y = distortion_y; #endif } else { @@ -9481,6 +10865,27 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; } } +#if CONFIG_COMPOUND_SINGLEREF + } else if (is_inter_singleref_comp_mode(mbmi->mode)) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { + // TODO(zoeliu): To further investigate which ref_mv_idx should be + // chosen for the mode of SR_NEAR_NEWMV. + int ref_mv_idx = 0; + // Special case: SR_NEAR_NEWMV mode use + // 1 + mbmi->ref_mv_idx (like NEARMV) instead of + // mbmi->ref_mv_idx (like NEWMV) + if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx = 1; + + if (compound_ref0_mode(mbmi->mode) == NEWMV || + compound_ref1_mode(mbmi->mode) == NEWMV) { + int_mv this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; + clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, + xd->n8_h << MI_SIZE_LOG2, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; + } + } +#endif // CONFIG_COMPOUND_SINGLEREF } else { #endif // CONFIG_EXT_INTER if (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1) { @@ -9500,6 +10905,19 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, { RD_STATS rd_stats, rd_stats_y, rd_stats_uv; av1_init_rd_stats(&rd_stats); +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + // While av1 master uses rd_stats_y.rate through out the codebase, + // which is set when handle_inter_moden is called, the daala-dist code + // in rd_pick_partition() for cb4x4 and sub8x8 blocks need to know + // .dist_y which comes from rd_stats_y.dist and rd_stats_y.sse. + // The problem is rd_stats_y.dist and rd_stats_y.sse are sometimes not + // initialized when rd_stats.skip = 1, + // then instead rd_stats.dist and rd_stats.sse have the + // combined luma and chroma dist and sse. + // This can be seen inside motion_mode_rd(), which is called by + // handle_inter_mode(). + if (bsize < BLOCK_8X8) av1_init_rd_stats(&rd_stats_y); +#endif rd_stats.rate = rate2; // Point to variables that are maintained between loop iterations @@ -9510,6 +10928,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif // CONFIG_EXT_INTER this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip, frame_mv, +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + frame_comp_mv, +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF mi_row, mi_col, &args, best_rd); rate2 = rd_stats.rate; @@ -9518,23 +10939,39 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, total_sse = rd_stats.sse; rate_y = rd_stats_y.rate; rate_uv = rd_stats_uv.rate; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 - if (bsize < BLOCK_8X8) distortion2_y = rd_stats_y.dist; +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) { + if (rd_stats_y.rate != INT_MAX) { + assert(rd_stats_y.sse < INT64_MAX); + assert(rd_stats_y.dist < INT64_MAX); + } + total_sse_y = rd_stats_y.sse; + distortion2_y = rd_stats_y.dist; + } #endif } // TODO(jingning): This needs some refactoring to improve code quality // and reduce redundant steps. 
#if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SINGLEREF + if ((have_nearmv_in_inter_mode(mbmi->mode) && + mbmi_ext->ref_mv_count[ref_frame_type] > 2) || + ((mbmi->mode == NEWMV || mbmi->mode == SR_NEW_NEWMV || + mbmi->mode == NEW_NEWMV) && + mbmi_ext->ref_mv_count[ref_frame_type] > 1)) +#else // !CONFIG_COMPOUND_SINGLEREF if ((have_nearmv_in_inter_mode(mbmi->mode) && mbmi_ext->ref_mv_count[ref_frame_type] > 2) || ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) && - mbmi_ext->ref_mv_count[ref_frame_type] > 1)) { -#else + mbmi_ext->ref_mv_count[ref_frame_type] > 1)) +#endif // CONFIG_COMPOUND_SINGLEREF +#else // !CONFIG_EXT_INTER if ((mbmi->mode == NEARMV && mbmi_ext->ref_mv_count[ref_frame_type] > 2) || - (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1)) { -#endif + (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1)) +#endif // CONFIG_EXT_INTER + { int_mv backup_mv = frame_mv[NEARMV][ref_frame]; MB_MODE_INFO backup_mbmi = *mbmi; int backup_skip = x->skip; @@ -9560,18 +10997,16 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, rate2 += (rate2 < INT_MAX ? cpi->drl_mode_cost0[drl_ctx][0] : 0); if (this_rd < INT64_MAX) { - if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < - RDCOST(x->rdmult, x->rddiv, 0, total_sse)) - tmp_ref_rd = - RDCOST(x->rdmult, x->rddiv, - rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 0), - distortion2); + if (RDCOST(x->rdmult, rate_y + rate_uv, distortion2) < + RDCOST(x->rdmult, 0, total_sse)) + tmp_ref_rd = RDCOST( + x->rdmult, rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 0), + distortion2); else - tmp_ref_rd = - RDCOST(x->rdmult, x->rddiv, - rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - - rate_y - rate_uv, - total_sse); + tmp_ref_rd = RDCOST( + x->rdmult, rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - + rate_y - rate_uv, + total_sse); } #if CONFIG_VAR_TX for (i = 0; i < MAX_MB_PLANE; ++i) @@ -9587,6 +11022,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, RD_STATS tmp_rd_stats, tmp_rd_stats_y, tmp_rd_stats_uv; av1_invalid_rd_stats(&tmp_rd_stats); + x->skip = 0; mbmi->ref_mv_idx = 1 + ref_idx; @@ -9627,6 +11063,34 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, xd->n8_h << MI_SIZE_LOG2, xd); mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv; } +#if CONFIG_COMPOUND_SINGLEREF + } else if (is_inter_singleref_comp_mode(mbmi->mode)) { + int ref_mv_idx = mbmi->ref_mv_idx; + // Special case: SR_NEAR_NEWMV mode use + // 1 + mbmi->ref_mv_idx (like NEARMV) instead of + // mbmi->ref_mv_idx (like NEWMV) + if (mbmi->mode == SR_NEAR_NEWMV) ref_mv_idx = 1 + mbmi->ref_mv_idx; + + // TODO(zoeliu): For the mode of SR_NEAREST_NEWMV, as it only runs + // the "if", not the "else if", + // mbmi_ext->ref_mvs[mbmi->ref_frame[0]] takes the + // value for "NEWMV", instead of "NEARESTMV". 
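The skip decision in the hunk above (and the analogous one in motion_mode_rd() earlier) now uses the two-argument RDCOST(): coding the residual is charged its token rate plus the measured distortion, while signalling skip is charged only the skip flag but pays the full prediction SSE as distortion. The comparison pattern is sketched below with an assumed rate*lambda-plus-distortion cost; the exact macro and its scaling constants live in av1/encoder/rd.h and may differ from the values assumed here.

    #include <stdint.h>

    /* Assumed shifts for this sketch only. */
    #define SKETCH_RD_RATE_SHIFT 9 /* divides out the probability-cost scaling */
    #define SKETCH_RD_DIST_BITS 7  /* puts distortion on the same scale as rate */

    static int64_t rd_cost(int rdmult, int rate, int64_t dist) {
      return (((int64_t)rate * rdmult) >> SKETCH_RD_RATE_SHIFT) +
             (dist << SKETCH_RD_DIST_BITS);
    }

    /* Skip-vs-code decision: coding the residual costs rate_coded bits and
     * leaves dist_coded distortion; skipping costs only the skip flag but
     * leaves the full prediction error (sse) as distortion. */
    static int prefer_skip(int rdmult, int rate_coded, int64_t dist_coded,
                           int rate_skip_flag, int64_t sse) {
      return rd_cost(rdmult, rate_skip_flag, sse) <
             rd_cost(rdmult, rate_coded, dist_coded);
    }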
+ if (compound_ref0_mode(mbmi->mode) == NEWMV || + compound_ref1_mode(mbmi->mode) == NEWMV) { + int_mv this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; + clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, + xd->n8_h << MI_SIZE_LOG2, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; + } else if (compound_ref0_mode(mbmi->mode) == NEARESTMV || + compound_ref1_mode(mbmi->mode) == NEARESTMV) { + int_mv this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; + clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2, + xd->n8_h << MI_SIZE_LOG2, xd); + mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv; + } +#endif // CONFIG_COMPOUND_SINGLEREF } else { #endif // CONFIG_EXT_INTER for (ref = 0; ref < 1 + comp_pred; ++ref) { @@ -9657,16 +11121,28 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, frame_mv[NEARMV][ref_frame] = cur_mv; av1_init_rd_stats(&tmp_rd_stats); - +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + // With the same reason as 'rd_stats_y' passed to above + // handle_inter_mode(), tmp_rd_stats_y.dist and + // tmp_rd_stats_y.sse are sometimes not initialized, esp. when + // tmp_rd_stats.skip = 1 and tmp_rd_stats.dist and .sse + // represent combined luma and chroma .dist and .sse, + // we should initialized tmp_rd_stats_y. + if (bsize < BLOCK_8X8) av1_init_rd_stats(&tmp_rd_stats_y); +#endif // Point to variables that are not maintained between iterations args.single_newmv = dummy_single_newmv; #if CONFIG_EXT_INTER args.single_newmv_rate = dummy_single_newmv_rate; args.modelled_rd = NULL; #endif // CONFIG_EXT_INTER - tmp_alt_rd = handle_inter_mode( - cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, &tmp_rd_stats_uv, - &dummy_disable_skip, frame_mv, mi_row, mi_col, &args, best_rd); + tmp_alt_rd = handle_inter_mode(cpi, x, bsize, &tmp_rd_stats, + &tmp_rd_stats_y, &tmp_rd_stats_uv, + &dummy_disable_skip, frame_mv, +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + frame_comp_mv, +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + mi_row, mi_col, &args, best_rd); // Prevent pointers from escaping local scope args.single_newmv = NULL; #if CONFIG_EXT_INTER @@ -9696,25 +11172,22 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (tmp_alt_rd < INT64_MAX) { #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - tmp_alt_rd = RDCOST(x->rdmult, x->rddiv, tmp_rd_stats.rate, - tmp_rd_stats.dist); + tmp_alt_rd = + RDCOST(x->rdmult, tmp_rd_stats.rate, tmp_rd_stats.dist); #else - if (RDCOST(x->rdmult, x->rddiv, - tmp_rd_stats_y.rate + tmp_rd_stats_uv.rate, + if (RDCOST(x->rdmult, tmp_rd_stats_y.rate + tmp_rd_stats_uv.rate, tmp_rd_stats.dist) < - RDCOST(x->rdmult, x->rddiv, 0, tmp_rd_stats.sse)) - tmp_alt_rd = - RDCOST(x->rdmult, x->rddiv, - tmp_rd_stats.rate + - av1_cost_bit(av1_get_skip_prob(cm, xd), 0), - tmp_rd_stats.dist); + RDCOST(x->rdmult, 0, tmp_rd_stats.sse)) + tmp_alt_rd = RDCOST( + x->rdmult, tmp_rd_stats.rate + + av1_cost_bit(av1_get_skip_prob(cm, xd), 0), + tmp_rd_stats.dist); else - tmp_alt_rd = - RDCOST(x->rdmult, x->rddiv, - tmp_rd_stats.rate + - av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - - tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate, - tmp_rd_stats.sse); + tmp_alt_rd = RDCOST( + x->rdmult, tmp_rd_stats.rate + + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - + tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate, + tmp_rd_stats.sse); #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION } @@ -9730,8 +11203,12 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, tmp_ref_rd = tmp_alt_rd; 
backup_mbmi = *mbmi; backup_skip = x->skip; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 if (bsize < BLOCK_8X8) { + if (tmp_rd_stats_y.rate != INT_MAX) { + assert(tmp_rd_stats_y.sse < INT64_MAX); + assert(tmp_rd_stats_y.dist < INT64_MAX); + } total_sse_y = tmp_rd_stats_y.sse; distortion2_y = tmp_rd_stats_y.dist; } @@ -9774,19 +11251,33 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, // Estimate the reference frame signaling cost and add it // to the rolling cost variable. if (comp_pred) { +#if CONFIG_EXT_COMP_REFS + rate2 += ref_costs_comp[ref_frame][second_ref_frame]; +#else // !CONFIG_EXT_COMP_REFS rate2 += ref_costs_comp[ref_frame]; #if CONFIG_EXT_REFS rate2 += ref_costs_comp[second_ref_frame]; #endif // CONFIG_EXT_REFS +#endif // CONFIG_EXT_COMP_REFS } else { rate2 += ref_costs_single[ref_frame]; } +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // Add the cost to signal single/comp mode in single ref. + if (!comp_pred && cm->reference_mode != COMPOUND_REFERENCE) { + aom_prob singleref_comp_mode_p = av1_get_inter_mode_prob(cm, xd); + rate2 += av1_cost_bit(singleref_comp_mode_p, + is_inter_singleref_comp_mode(mbmi->mode)); + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - if (ref_frame == INTRA_FRAME) { + if (ref_frame == INTRA_FRAME) #else - if (!disable_skip) { + if (!disable_skip) #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION + { if (skippable) { // Back out the coefficient coding costs rate2 -= (rate_y + rate_uv); @@ -9795,9 +11286,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, // Cost the skip mb case rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1); } else if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) { - if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + rate_skip0, - distortion2) < - RDCOST(x->rdmult, x->rddiv, rate_skip1, total_sse)) { + if (RDCOST(x->rdmult, rate_y + rate_uv + rate_skip0, distortion2) < + RDCOST(x->rdmult, rate_skip1, total_sse)) { // Add in the cost of the no skip flag. rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); } else { @@ -9809,8 +11299,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, this_skip2 = 1; rate_y = 0; rate_uv = 0; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 - if (bsize < BLOCK_8X8) distortion2_y = total_sse_y; +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) { + assert(total_sse_y < INT64_MAX); + distortion2_y = total_sse_y; + } #endif } } else { @@ -9819,11 +11312,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, } // Calculate the final RD estimate for this mode. 
- this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + this_rd = RDCOST(x->rdmult, rate2, distortion2); #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION } else { this_skip2 = mbmi->skip; - this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + this_rd = RDCOST(x->rdmult, rate2, distortion2); if (this_skip2) { rate_y = 0; rate_uv = 0; @@ -9831,6 +11324,12 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION } +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if ((bsize < BLOCK_8X8) && (rate2 != INT_MAX)) { + assert(distortion2_y < INT64_MAX); + } +#endif + if (ref_frame == INTRA_FRAME) { // Keep record of best intra rd if (this_rd < best_intra_rd) { @@ -9875,12 +11374,18 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd), mbmi->ref_frame[0] != INTRA_FRAME); #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION +#if CONFIG_WARPED_MOTION + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); +#endif #if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION MODE_INFO *const mi = xd->mi[0]; const MOTION_MODE motion_allowed = motion_mode_allowed( -#if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION +#if CONFIG_GLOBAL_MOTION 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION +#endif // CONFIG_GLOBAL_MOTION +#if CONFIG_WARPED_MOTION + xd, +#endif mi); if (motion_allowed == WARPED_CAUSAL) *returnrate_nocoef -= cpi->motion_mode_cost[bsize][mbmi->motion_mode]; @@ -9901,8 +11406,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd), this_skip2 || skippable); best_rate_uv = rate_uv; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 - if (bsize < BLOCK_8X8) rd_cost->dist_y = distortion2_y; +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) { + assert(distortion2_y < INT64_MAX); + rd_cost->dist_y = distortion2_y; + } #endif #if CONFIG_VAR_TX for (i = 0; i < MAX_MB_PLANE; ++i) @@ -9911,7 +11419,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif // CONFIG_VAR_TX } } - +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if ((bsize < BLOCK_8X8) && (rd_cost->rate != INT_MAX)) { + assert(rd_cost->dist_y < INT64_MAX); + } +#endif /* keep record of best compound/single-only prediction */ if (!disable_skip && ref_frame != INTRA_FRAME) { int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; @@ -9924,8 +11436,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, hybrid_rate = rate2 + compmode_cost; } - single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2); - hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2); + single_rd = RDCOST(x->rdmult, single_rate, distortion2); + hybrid_rd = RDCOST(x->rdmult, hybrid_rate, distortion2); if (!comp_pred) { if (single_rd < best_pred_rd[SINGLE_REFERENCE]) @@ -9963,6 +11475,15 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; } +#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + // Single ref compound mode + if (!has_second_ref(mbmi) && is_inter_singleref_comp_mode(mbmi->mode)) { + xd->block_refs[1] = xd->block_refs[0]; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[1] = xd->plane[i].pre[0]; + } +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + if (is_inter_mode(mbmi->mode)) { av1_build_inter_predictors_sb(cm, xd, mi_row, 
mi_col, NULL, bsize); #if CONFIG_MOTION_VAR @@ -9996,9 +11517,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); } - if (RDCOST(x->rdmult, x->rddiv, rd_stats_y.rate + rd_stats_uv.rate, + if (RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate, (rd_stats_y.dist + rd_stats_uv.dist)) > - RDCOST(x->rdmult, x->rddiv, 0, (rd_stats_y.sse + rd_stats_uv.sse))) { + RDCOST(x->rdmult, 0, (rd_stats_y.sse + rd_stats_uv.sse))) { skip_blk = 1; rd_stats_y.rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); rd_stats_uv.rate = 0; @@ -10009,8 +11530,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, rd_stats_y.rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); } - if (RDCOST(x->rdmult, x->rddiv, best_rate_y + best_rate_uv, rd_cost->dist) > - RDCOST(x->rdmult, x->rddiv, rd_stats_y.rate + rd_stats_uv.rate, + if (RDCOST(x->rdmult, best_rate_y + best_rate_uv, rd_cost->dist) > + RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate, (rd_stats_y.dist + rd_stats_uv.dist))) { #if CONFIG_VAR_TX int idx, idy; @@ -10031,15 +11552,24 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, rd_cost->rate += (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv); rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 - if (bsize < BLOCK_8X8) rd_cost->dist_y = rd_stats_y.dist; -#endif - rd_cost->rdcost = - RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); + rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); best_skip2 = skip_blk; +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if (bsize < BLOCK_8X8) { + assert(rd_cost->rate != INT_MAX); + assert(rd_cost->dist_y < INT64_MAX); + rd_cost->dist_y = rd_stats_y.dist; + } +#endif } } +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 + if ((bsize < BLOCK_8X8) && (rd_cost->rate != INT_MAX)) { + assert(rd_cost->dist_y < INT64_MAX); + } +#endif + #if CONFIG_PALETTE // Only try palette mode when the best mode so far is an intra mode. if (try_palette && !is_inter_mode(best_mbmode.mode)) { @@ -10058,7 +11588,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, MB_MODE_INFO best_mbmi_palette = best_mbmode; mbmi->mode = DC_PRED; - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; rate_overhead_palette = rd_pick_palette_intra_sby( @@ -10119,7 +11649,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif // CONFIG_SUPERTX rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0); } - this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + this_rd = RDCOST(x->rdmult, rate2, distortion2); if (this_rd < best_rd) { best_mode_index = 3; mbmi->mv[0].as_int = 0; @@ -10165,10 +11695,14 @@ PALETTE_EXIT: } #endif // CONFIG_FILTER_INTRA - // The inter modes' rate costs are not calculated precisely in some cases. - // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and - // ZEROMV. Here, checks are added for those cases, and the mode decisions - // are corrected. +// The inter modes' rate costs are not calculated precisely in some cases. +// Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and +// ZEROMV. Here, checks are added for those cases, and the mode decisions +// are corrected. 
+#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +// NOTE: For SR_NEW_NEWMV, no need to check as the two mvs from the same ref +// are surely different from each other. +#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF if (best_mbmode.mode == NEWMV #if CONFIG_EXT_INTER || best_mbmode.mode == NEW_NEWMV @@ -10248,8 +11782,9 @@ PALETTE_EXIT: } if (nearestmv[0].as_int == best_mbmode.mv[0].as_int && - nearestmv[1].as_int == best_mbmode.mv[1].as_int) { + nearestmv[1].as_int == best_mbmode.mv[1].as_int) #if CONFIG_EXT_INTER + { best_mbmode.mode = NEAREST_NEARESTMV; } else { int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) @@ -10274,6 +11809,7 @@ PALETTE_EXIT: best_mbmode.mode = ZERO_ZEROMV; } #else + { best_mbmode.mode = NEARESTMV; } else if (best_mbmode.mv[0].as_int == zeromv[0].as_int && best_mbmode.mv[1].as_int == zeromv[1].as_int) { @@ -10287,11 +11823,18 @@ PALETTE_EXIT: // using a mode which can support ref_mv_idx if (best_mbmode.ref_mv_idx != 0 && #if CONFIG_EXT_INTER +#if CONFIG_COMPOUND_SINGLEREF + !(best_mbmode.mode == NEWMV || best_mbmode.mode == SR_NEW_NEWMV || + best_mbmode.mode == NEW_NEWMV || + have_nearmv_in_inter_mode(best_mbmode.mode))) +#else // !CONFIG_COMPOUND_SINGLEREF !(best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV || - have_nearmv_in_inter_mode(best_mbmode.mode))) { -#else - !(best_mbmode.mode == NEARMV || best_mbmode.mode == NEWMV)) { -#endif + have_nearmv_in_inter_mode(best_mbmode.mode))) +#endif // CONFIG_COMPOUND_SINGLEREF +#else // !CONFIG_EXT_INTER + !(best_mbmode.mode == NEARMV || best_mbmode.mode == NEWMV)) +#endif // CONFIG_EXT_INTER + { best_mbmode.ref_mv_idx = 0; } @@ -10377,11 +11920,12 @@ PALETTE_EXIT: ) { #if CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR // Correct the motion mode for ZEROMV - const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed( -#if SEPARATE_GLOBAL_MOTION - 0, xd->global_motion, -#endif // SEPARATE_GLOBAL_MOTION - xd->mi[0]); + const MOTION_MODE last_motion_mode_allowed = + motion_mode_allowed(0, xd->global_motion, +#if CONFIG_WARPED_MOTION + xd, +#endif + xd->mi[0]); if (mbmi->motion_mode > last_motion_mode_allowed) mbmi->motion_mode = last_motion_mode_allowed; #endif // CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR @@ -10445,7 +11989,11 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, int i; int64_t best_pred_diff[REFERENCE_MODES]; unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME]; +#if CONFIG_EXT_COMP_REFS + unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME][TOTAL_REFS_PER_FRAME]; +#else unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME]; +#endif // CONFIG_EXT_COMP_REFS aom_prob comp_mode_p; InterpFilter best_filter = SWITCHABLE; int64_t this_rd = INT64_MAX; @@ -10476,7 +12024,7 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, #endif // CONFIG_FILTER_INTRA mbmi->mode = ZEROMV; mbmi->motion_mode = SIMPLE_TRANSLATION; - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = LAST_FRAME; mbmi->ref_frame[1] = NONE_FRAME; #if CONFIG_GLOBAL_MOTION @@ -10501,7 +12049,17 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, #if CONFIG_WARPED_MOTION if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) { int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; +#if WARPED_MOTION_SORT_SAMPLES + int pts_mv[SAMPLES_ARRAY_SIZE]; + mbmi->num_proj_ref[0] = + findSamples(cm, xd, mi_row, mi_col, pts, pts_inref, pts_mv); + // Rank the samples by motion vector difference + if (mbmi->num_proj_ref[0] > 1) + mbmi->num_proj_ref[0] = sortSamples(pts_mv, 
&mbmi->mv[0].as_mv, pts, + pts_inref, mbmi->num_proj_ref[0]); +#else mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); +#endif // WARPED_MOTION_SORT_SAMPLES } #endif @@ -10548,12 +12106,12 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, // Estimate the reference frame signaling cost and add it // to the rolling cost variable. rate2 += ref_costs_single[LAST_FRAME]; - this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + this_rd = RDCOST(x->rdmult, rate2, distortion2); rd_cost->rate = rate2; rd_cost->dist = distortion2; rd_cost->rdcost = this_rd; -#if CONFIG_DAALA_DIST && CONFIG_CB4X4 +#if CONFIG_DIST_8X8 && CONFIG_CB4X4 if (bsize < BLOCK_8X8) rd_cost->dist_y = distortion2; #endif if (this_rd >= best_rd_so_far) { @@ -10646,7 +12204,8 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, // handle above row if (xd->up_available) { - const int overlap = num_4x4_blocks_high_lookup[bsize] * 2; + const int overlap = + AOMMIN(block_size_high[bsize] >> 1, block_size_high[BLOCK_64X64] >> 1); const int miw = AOMMIN(xd->n8_w, cm->mi_cols - mi_col); const int mi_row_offset = -1; const uint8_t *const mask1d = av1_get_obmc_mask(overlap); @@ -10666,7 +12225,9 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, &xd->mi[mi_col_offset + 1 + mi_row_offset * xd->mi_stride]->mbmi; #endif const BLOCK_SIZE a_bsize = AOMMAX(above_mbmi->sb_type, BLOCK_8X8); - const int mi_step = AOMMIN(xd->n8_w, mi_size_wide[a_bsize]); + const int above_step = + AOMMIN(mi_size_wide[a_bsize], mi_size_wide[BLOCK_64X64]); + const int mi_step = AOMMIN(xd->n8_w, above_step); const int neighbor_bw = mi_step * MI_SIZE; if (is_neighbor_overlappable(above_mbmi)) { @@ -10725,7 +12286,8 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, // handle left column if (xd->left_available) { - const int overlap = num_4x4_blocks_wide_lookup[bsize] * 2; + const int overlap = + AOMMIN(block_size_wide[bsize] >> 1, block_size_wide[BLOCK_64X64] >> 1); const int mih = AOMMIN(xd->n8_h, cm->mi_rows - mi_row); const int mi_col_offset = -1; const uint8_t *const mask1d = av1_get_obmc_mask(overlap); @@ -10746,7 +12308,9 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, &xd->mi[mi_col_offset + (mi_row_offset + 1) * xd->mi_stride]->mbmi; #endif const BLOCK_SIZE l_bsize = AOMMAX(left_mbmi->sb_type, BLOCK_8X8); - const int mi_step = AOMMIN(xd->n8_h, mi_size_high[l_bsize]); + const int left_step = + AOMMIN(mi_size_high[l_bsize], mi_size_high[BLOCK_64X64]); + const int mi_step = AOMMIN(xd->n8_h, left_step); const int neighbor_bh = mi_step * MI_SIZE; if (is_neighbor_overlappable(left_mbmi)) { @@ -10854,8 +12418,23 @@ void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); av1_subtract_plane(x, bsize, 0); +#if CONFIG_VAR_TX + if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { + select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + } else { + int idx, idy; + super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + for (idy = 0; idy < xd->n8_h; ++idy) + for (idx = 0; idx < xd->n8_w; ++idx) + mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; + memset(x->blk_skip[0], rd_stats_y.skip, + sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); + } + inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); +#else super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); super_block_uvrd(cpi, x, 
&rd_stats_uv, bsize, INT64_MAX); +#endif assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX); if (rd_stats_y.skip && rd_stats_uv.skip) { rd_stats_y.rate = rate_skip1; @@ -10863,10 +12442,10 @@ void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, rd_stats_y.dist = rd_stats_y.sse; rd_stats_uv.dist = rd_stats_uv.sse; skip_blk = 0; - } else if (RDCOST(x->rdmult, x->rddiv, + } else if (RDCOST(x->rdmult, (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0), (rd_stats_y.dist + rd_stats_uv.dist)) > - RDCOST(x->rdmult, x->rddiv, rate_skip1, + RDCOST(x->rdmult, rate_skip1, (rd_stats_y.sse + rd_stats_uv.sse))) { rd_stats_y.rate = rate_skip1; rd_stats_uv.rate = 0; @@ -10879,18 +12458,33 @@ void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, } backup_skip = skip_blk; backup_mbmi = *mbmi; - rd_causal = RDCOST(x->rdmult, x->rddiv, (rd_stats_y.rate + rd_stats_uv.rate), + rd_causal = RDCOST(x->rdmult, (rd_stats_y.rate + rd_stats_uv.rate), (rd_stats_y.dist + rd_stats_uv.dist)); - rd_causal += RDCOST(x->rdmult, x->rddiv, - av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 0), 0); + rd_causal += + RDCOST(x->rdmult, av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 0), 0); // Check non-causal mode mbmi->motion_mode = OBMC_CAUSAL; av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col); av1_subtract_plane(x, bsize, 0); +#if CONFIG_VAR_TX + if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { + select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + } else { + int idx, idy; + super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + for (idy = 0; idy < xd->n8_h; ++idy) + for (idx = 0; idx < xd->n8_w; ++idx) + mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; + memset(x->blk_skip[0], rd_stats_y.skip, + sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); + } + inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); +#else super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); +#endif assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX); if (rd_stats_y.skip && rd_stats_uv.skip) { rd_stats_y.rate = rate_skip1; @@ -10898,10 +12492,10 @@ void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, rd_stats_y.dist = rd_stats_y.sse; rd_stats_uv.dist = rd_stats_uv.sse; skip_blk = 0; - } else if (RDCOST(x->rdmult, x->rddiv, + } else if (RDCOST(x->rdmult, (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0), (rd_stats_y.dist + rd_stats_uv.dist)) > - RDCOST(x->rdmult, x->rddiv, rate_skip1, + RDCOST(x->rdmult, rate_skip1, (rd_stats_y.sse + rd_stats_uv.sse))) { rd_stats_y.rate = rate_skip1; rd_stats_uv.rate = 0; @@ -10914,9 +12508,8 @@ void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, } if (rd_causal > - RDCOST(x->rdmult, x->rddiv, - rd_stats_y.rate + rd_stats_uv.rate + - av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1), + RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate + + av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1), (rd_stats_y.dist + rd_stats_uv.dist))) { x->skip = skip_blk; } else { diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h index e5d778fe5..43a6a3794 100644 --- a/third_party/aom/av1/encoder/rdopt.h +++ b/third_party/aom/av1/encoder/rdopt.h @@ -57,22 +57,33 @@ typedef enum OUTPUT_STATUS { OUTPUT_HAS_DECODED_PIXELS } OUTPUT_STATUS; +#if CONFIG_PALETTE || CONFIG_INTRABC +// Returns the number of colors in 'src'. 
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols); +#if CONFIG_HIGHBITDEPTH +// Same as av1_count_colors(), but for high-bitdepth mode. +int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, + int bit_depth); +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_PALETTE || CONFIG_INTRABC + void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse, OUTPUT_STATUS output_status); -#if CONFIG_DAALA_DIST -int64_t av1_daala_dist(const uint8_t *src, int src_stride, const uint8_t *dst, - int dst_stride, int bsw, int bsh, int qm, - int use_activity_masking, int qindex); +#if CONFIG_DIST_8X8 +int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCKD *xd, + const uint8_t *src, int src_stride, const uint8_t *dst, + int dst_stride, const BLOCK_SIZE tx_bsize, int bsw, + int bsh, int visible_w, int visible_h, int qindex); #endif #if !CONFIG_PVQ || CONFIG_VAR_TX int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, - int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order, - const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l, - int use_fast_coef_costing); + int blk_row, int blk_col, int block, TX_SIZE tx_size, + const SCAN_ORDER *scan_order, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l, int use_fast_coef_costing); #endif void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x, struct RD_STATS *rd_cost, BLOCK_SIZE bsize, diff --git a/third_party/aom/av1/encoder/segmentation.c b/third_party/aom/av1/encoder/segmentation.c index b581a61d0..b61df43fa 100644 --- a/third_party/aom/av1/encoder/segmentation.c +++ b/third_party/aom/av1/encoder/segmentation.c @@ -299,12 +299,8 @@ void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) { int no_pred_cost; int t_pred_cost = INT_MAX; - int i, tile_col, tile_row, mi_row, mi_col; -#if CONFIG_TILE_GROUPS + int tile_col, tile_row, mi_row, mi_col; const int probwt = cm->num_tg; -#else - const int probwt = 1; -#endif unsigned(*temporal_predictor_count)[2] = cm->counts.seg.pred; unsigned *no_pred_segcounts = cm->counts.seg.tree_total; @@ -312,7 +308,9 @@ void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) { aom_prob no_pred_tree[SEG_TREE_PROBS]; aom_prob t_pred_tree[SEG_TREE_PROBS]; +#if !CONFIG_NEW_MULTISYMBOL aom_prob t_nopred_prob[PREDICTION_PROBS]; +#endif (void)xd; @@ -327,7 +325,7 @@ void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) { for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) { MODE_INFO **mi_ptr; av1_tile_set_col(&tile_info, cm, tile_col); -#if CONFIG_TILE_GROUPS && CONFIG_DEPENDENT_HORZTILES +#if CONFIG_DEPENDENT_HORZTILES av1_tile_set_tg_boundary(&tile_info, cm, tile_row, tile_col); #endif mi_ptr = cm->mi_grid_visible + tile_info.mi_row_start * cm->mi_stride + @@ -357,8 +355,9 @@ void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) { calc_segtree_probs(t_unpred_seg_counts, t_pred_tree, segp->tree_probs, probwt); t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree); - +#if !CONFIG_NEW_MULTISYMBOL // Add in the cost of the signaling for each prediction context. 
+ int i; for (i = 0; i < PREDICTION_PROBS; i++) { const int count0 = temporal_predictor_count[i][0]; const int count1 = temporal_predictor_count[i][1]; @@ -372,6 +371,7 @@ void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) { t_pred_cost += count0 * av1_cost_zero(t_nopred_prob[i]) + count1 * av1_cost_one(t_nopred_prob[i]); } +#endif } // Now choose which coding method to use. diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c index e2275a54f..eeab33a95 100644 --- a/third_party/aom/av1/encoder/speed_features.c +++ b/third_party/aom/av1/encoder/speed_features.c @@ -35,7 +35,7 @@ static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = { // TODO(aconverse@google.com): These settings are pretty relaxed, tune them for // each speed setting static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { - { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, + { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } }, { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, @@ -171,12 +171,24 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->recode_loop = ALLOW_RECODE_KFARFGF; #if CONFIG_TX64X64 sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; +#if CONFIG_CFL + sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V; +#else sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC_H_V; +#endif // CONFIG_CFL #endif // CONFIG_TX64X64 sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; +#if CONFIG_CFL + sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V; +#else sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; +#endif sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; +#if CONFIG_CFL + sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V; +#else sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; +#endif sf->tx_size_search_breakout = 1; sf->partition_search_breakout_rate_thr = 80; @@ -199,7 +211,7 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; sf->disable_filter_search_var_thresh = 100; - sf->comp_inter_joint_search_thresh = BLOCK_SIZES; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->allow_partition_search_skip = 1; sf->use_upsampled_references = 0; @@ -227,10 +239,18 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->mode_skip_start = 6; #if CONFIG_TX64X64 sf->intra_y_mode_mask[TX_64X64] = INTRA_DC; +#if CONFIG_CFL + sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC; +#else sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC; +#endif // CONFIG_CFL #endif // CONFIG_TX64X64 sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; +#if CONFIG_CFL + sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC; +#else sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; +#endif // CONFIG_CFL sf->adaptive_interp_filter_search = 1; } @@ -255,7 +275,11 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->disable_filter_search_var_thresh = 500; for (i = 0; i < TX_SIZES; ++i) { sf->intra_y_mode_mask[i] = INTRA_DC; +#if CONFIG_CFL + sf->intra_uv_mode_mask[i] = UV_INTRA_DC; +#else sf->intra_uv_mode_mask[i] = INTRA_DC; +#endif // CONFIG_CFL } sf->partition_search_breakout_rate_thr = 500; sf->mv.reduce_first_step_size = 1; @@ -405,7 +429,11 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { for (i = 0; i < 
TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; +#if CONFIG_CFL + sf->intra_uv_mode_mask[i] = UV_INTRA_ALL; +#else sf->intra_uv_mode_mask[i] = INTRA_ALL; +#endif // CONFIG_CFL } sf->use_rd_breakout = 0; sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE; @@ -413,7 +441,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->use_fast_coef_costing = 0; sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set sf->schedule_mode_search = 0; - for (i = 0; i < BLOCK_SIZES; ++i) sf->inter_mode_mask[i] = INTER_ALL; + for (i = 0; i < BLOCK_SIZES_ALL; ++i) sf->inter_mode_mask[i] = INTER_ALL; sf->max_intra_bsize = BLOCK_LARGEST; sf->reuse_inter_pred_sby = 0; // This setting only takes effect when partition_search_type is set diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h index 5710d77c7..2c89f4e5c 100644 --- a/third_party/aom/av1/encoder/speed_features.h +++ b/third_party/aom/av1/encoder/speed_features.h @@ -29,6 +29,24 @@ enum { #endif // CONFIG_SMOOTH_HV #endif // CONFIG_ALT_INTRA (1 << TM_PRED), +#if CONFIG_CFL + UV_INTRA_ALL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) | + (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | + (1 << UV_D117_PRED) | (1 << UV_D153_PRED) | + (1 << UV_D207_PRED) | (1 << UV_D63_PRED) | +#if CONFIG_ALT_INTRA + (1 << UV_SMOOTH_PRED) | +#if CONFIG_SMOOTH_HV + (1 << UV_SMOOTH_V_PRED) | (1 << UV_SMOOTH_H_PRED) | +#endif // CONFIG_SMOOTH_HV +#endif // CONFIG_ALT_INTRA + (1 << UV_TM_PRED), + UV_INTRA_DC = (1 << UV_DC_PRED), + UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_TM_PRED), + UV_INTRA_DC_H_V = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED), + UV_INTRA_DC_TM_H_V = (1 << UV_DC_PRED) | (1 << UV_TM_PRED) | + (1 << UV_V_PRED) | (1 << UV_H_PRED), +#endif // CONFIG_CFL INTRA_DC = (1 << DC_PRED), INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED), INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED), @@ -38,6 +56,11 @@ enum { #if CONFIG_EXT_INTER enum { +#if CONFIG_COMPOUND_SINGLEREF +// TODO(zoeliu): To further consider following single ref comp modes: +// SR_NEAREST_NEARMV, SR_NEAREST_NEWMV, SR_NEAR_NEWMV, +// SR_ZERO_NEWMV, and SR_NEW_NEWMV. +#endif // CONFIG_COMPOUND_SINGLEREF INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | (1 << NEW_NEARMV) | @@ -67,7 +90,7 @@ enum { (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV), }; -#else +#else // !CONFIG_EXT_INTER enum { INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV), INTER_NEAREST = (1 << NEARESTMV), @@ -399,10 +422,6 @@ typedef struct SPEED_FEATURES { int intra_y_mode_mask[TX_SIZES]; int intra_uv_mode_mask[TX_SIZES]; - // These bit masks allow you to enable or disable intra modes for each - // prediction block size separately. - int intra_y_mode_bsize_mask[BLOCK_SIZES]; - // This variable enables an early break out of mode testing if the model for // rd built from the prediction signal indicates a value that's much // higher than the best rd we've seen so far. @@ -417,7 +436,7 @@ typedef struct SPEED_FEATURES { // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV // modes are used in order from LSB to MSB for each BLOCK_SIZE. - int inter_mode_mask[BLOCK_SIZES]; + int inter_mode_mask[BLOCK_SIZES_ALL]; // This feature controls whether we do the expensive context update and // calculation in the rd coefficient costing loop. 
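The speed-feature fields touched above (intra_uv_mode_mask[], inter_mode_mask[]) are plain per-block-size bitmasks: bit i of the mask enables mode i for that block size, and widening inter_mode_mask to BLOCK_SIZES_ALL presumably keeps the indexing in bounds once the extended block sizes are introduced. A minimal sketch of how such a mask is consulted, using simplified stand-in enums and a hypothetical mode_allowed() helper rather than the real libaom definitions:

#include <stdio.h>

/* Simplified stand-ins for PREDICTION_MODE and BLOCK_SIZE (illustration only). */
enum { MODE_NEARESTMV = 0, MODE_NEARMV, MODE_ZEROMV, MODE_NEWMV, MODE_COUNT };
enum { BSIZE_8X8 = 0, BSIZE_16X16, BSIZE_COUNT };

/* Bit i of mask[bsize] enables mode i for that block size. */
static int mode_allowed(const int mask[BSIZE_COUNT], int bsize, int mode) {
  return (mask[bsize] >> mode) & 1;
}

int main(void) {
  /* Allow all four modes at 8x8, but only NEARESTMV and ZEROMV at 16x16. */
  const int inter_mode_mask[BSIZE_COUNT] = {
    (1 << MODE_NEARESTMV) | (1 << MODE_NEARMV) | (1 << MODE_ZEROMV) |
        (1 << MODE_NEWMV),
    (1 << MODE_NEARESTMV) | (1 << MODE_ZEROMV),
  };
  printf("NEWMV allowed at 16x16: %d\n",
         mode_allowed(inter_mode_mask, BSIZE_16X16, MODE_NEWMV));
  return 0;
}

A mode whose bit is cleared is simply skipped in the RD search loop, which is how these masks trade a little quality for encode speed at the higher speed presets configured above.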
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c index 1ed1ebdb2..604647922 100644 --- a/third_party/aom/av1/encoder/temporal_filter.c +++ b/third_party/aom/av1/encoder/temporal_filter.c @@ -41,7 +41,7 @@ static void temporal_filter_predictors_mb_c( enum mv_precision mv_precision_uv; int uv_stride; // TODO(angiebird): change plane setting accordingly - ConvolveParams conv_params = get_conv_params(which_mv, 0); + ConvolveParams conv_params = get_conv_params(which_mv, which_mv, 0); #if USE_TEMPORALFILTER_12TAP #if CONFIG_DUAL_FILTER @@ -413,10 +413,10 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, predictor, scale, mb_col * 16, mb_row * 16); +// Apply the filter (YUV) #if CONFIG_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int adj_strength = strength + 2 * (mbd->bd - 8); - // Apply the filter (YUV) av1_highbd_temporal_filter_apply( f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, adj_strength, filter_weight, accumulator, count); @@ -429,7 +429,7 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, mb_uv_width, mb_uv_height, adj_strength, filter_weight, accumulator + 512, count + 512); } else { - // Apply the filter (YUV) +#endif // CONFIG_HIGHBITDEPTH av1_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, strength, filter_weight, accumulator, count); @@ -441,29 +441,17 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, mb_uv_width, mb_uv_height, strength, filter_weight, accumulator + 512, count + 512); +#if CONFIG_HIGHBITDEPTH } -#else - // Apply the filter (YUV) - av1_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, - filter_weight, accumulator, count); - av1_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, mb_uv_width, - mb_uv_height, strength, filter_weight, - accumulator + 256, count + 256); - av1_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, mb_uv_width, - mb_uv_height, strength, filter_weight, - accumulator + 512, count + 512); #endif // CONFIG_HIGHBITDEPTH } } +// Normalize filter output to produce AltRef frame #if CONFIG_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *dst1_16; uint16_t *dst2_16; - // Normalize filter output to produce AltRef frame dst1 = cpi->alt_ref_buffer.y_buffer; dst1_16 = CONVERT_TO_SHORTPTR(dst1); stride = cpi->alt_ref_buffer.y_stride; @@ -505,7 +493,7 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, byte += stride - mb_uv_width; } } else { - // Normalize filter output to produce AltRef frame +#endif // CONFIG_HIGHBITDEPTH dst1 = cpi->alt_ref_buffer.y_buffer; stride = cpi->alt_ref_buffer.y_stride; byte = mb_y_offset; @@ -541,43 +529,7 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, } byte += stride - mb_uv_width; } - } -#else - // Normalize filter output to produce AltRef frame - dst1 = cpi->alt_ref_buffer.y_buffer; - stride = cpi->alt_ref_buffer.y_stride; - byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { - dst1[byte] = - (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); - - // move to next pixel - byte++; - } - byte += stride - 16; - } - - dst1 = cpi->alt_ref_buffer.u_buffer; - dst2 = cpi->alt_ref_buffer.v_buffer; - stride = cpi->alt_ref_buffer.uv_stride; - byte = mb_uv_offset; - for (i = 0, k = 256; i < 
mb_uv_height; i++) { - for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; - - // U - dst1[byte] = - (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); - - // V - dst2[byte] = - (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]); - - // move to next pixel - byte++; - } - byte += stride - mb_uv_width; +#if CONFIG_HIGHBITDEPTH } #endif // CONFIG_HIGHBITDEPTH mb_y_offset += 16; @@ -650,7 +602,11 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost, *arnr_strength = strength; } -void av1_temporal_filter(AV1_COMP *cpi, int distance) { +void av1_temporal_filter(AV1_COMP *cpi, +#if CONFIG_BGSPRITE + YV12_BUFFER_CONFIG *bg, +#endif // CONFIG_BGSPRITE + int distance) { RATE_CONTROL *const rc = &cpi->rc; int frame; int frames_to_blur; @@ -692,9 +648,18 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) { // Setup frame pointers, NULL indicates frame not included in filter. for (frame = 0; frame < frames_to_blur; ++frame) { const int which_buffer = start_frame - frame; - struct lookahead_entry *buf = - av1_lookahead_peek(cpi->lookahead, which_buffer); - frames[frames_to_blur - 1 - frame] = &buf->img; +#if CONFIG_BGSPRITE + if (frame == frames_to_blur_backward && bg != NULL) { + // Insert bg into frames at ARF index. + frames[frames_to_blur - 1 - frame] = bg; + } else { +#endif // CONFIG_BGSPRITE + struct lookahead_entry *buf = + av1_lookahead_peek(cpi->lookahead, which_buffer); + frames[frames_to_blur - 1 - frame] = &buf->img; +#if CONFIG_BGSPRITE + } +#endif // CONFIG_BGSPRITE } if (frames_to_blur > 0) { diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h index bc0863a63..ebb24703f 100644 --- a/third_party/aom/av1/encoder/temporal_filter.h +++ b/third_party/aom/av1/encoder/temporal_filter.h @@ -16,7 +16,11 @@ extern "C" { #endif -void av1_temporal_filter(AV1_COMP *cpi, int distance); +void av1_temporal_filter(AV1_COMP *cpi, +#if CONFIG_BGSPRITE + YV12_BUFFER_CONFIG *bg, +#endif // CONFIG_BGSPRITE + int distance); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c index 18d2cd958..b9db891b3 100644 --- a/third_party/aom/av1/encoder/tokenize.c +++ b/third_party/aom/av1/encoder/tokenize.c @@ -277,12 +277,12 @@ static void cost_coeffs_b(int plane, int block, int blk_row, int blk_col, struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; const PLANE_TYPE type = pd->plane_type; - const int ref = is_inter_block(mbmi); - const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size); - const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, ref); - const int rate = av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order, - pd->above_context + blk_col, - pd->left_context + blk_row, 0); + const TX_TYPE tx_type = + av1_get_tx_type(type, xd, blk_row, blk_col, block, tx_size); + const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); + const int rate = av1_cost_coeffs( + cpi, x, plane, blk_row, blk_col, block, tx_size, scan_order, + pd->above_context + blk_col, pd->left_context + blk_row, 0); args->this_rate += rate; (void)plane_bsize; av1_set_contexts(xd, pd, plane, tx_size, p->eobs[block] > 0, blk_col, @@ -323,42 +323,48 @@ void av1_tokenize_palette_sb(const AV1_COMP *cpi, const struct ThreadData *const td, int plane, TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate) { + assert(plane == 0 || plane == 1); const MACROBLOCK *const x = 
&td->mb; const MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; const uint8_t *const color_map = xd->plane[plane].color_index_map; const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - const int n = pmi->palette_size[plane]; - int i, j; - int this_rate = 0; - uint8_t color_order[PALETTE_MAX_SIZE]; - const aom_prob( - *const probs)[PALETTE_COLOR_INDEX_CONTEXTS][PALETTE_COLORS - 1] = - plane == 0 ? av1_default_palette_y_color_index_prob - : av1_default_palette_uv_color_index_prob; + aom_cdf_prob( + *palette_cdf)[PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = + plane ? xd->tile_ctx->palette_uv_color_index_cdf + : xd->tile_ctx->palette_y_color_index_cdf; int plane_block_width, rows, cols; av1_get_block_dimensions(bsize, plane, xd, &plane_block_width, NULL, &rows, &cols); - assert(plane == 0 || plane == 1); + // The first color index does not use context or entropy. + (*t)->token = color_map[0]; + (*t)->palette_cdf = NULL; + (*t)->skip_eob_node = 0; + ++(*t); + + const int n = pmi->palette_size[plane]; + const int calc_rate = rate && dry_run == DRY_RUN_COSTCOEFFS; + int this_rate = 0; + uint8_t color_order[PALETTE_MAX_SIZE]; #if CONFIG_PALETTE_THROUGHPUT - int k; - for (k = 1; k < rows + cols - 1; ++k) { - for (j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) { - i = k - j; + for (int k = 1; k < rows + cols - 1; ++k) { + for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) { + int i = k - j; #else - for (i = 0; i < rows; ++i) { - for (j = (i == 0 ? 1 : 0); j < cols; ++j) { + for (int i = 0; i < rows; ++i) { + for (int j = (i == 0 ? 1 : 0); j < cols; ++j) { #endif // CONFIG_PALETTE_THROUGHPUT int color_new_idx; const int color_ctx = av1_get_palette_color_index_context( color_map, plane_block_width, i, j, n, color_order, &color_new_idx); assert(color_new_idx >= 0 && color_new_idx < n); - if (dry_run == DRY_RUN_COSTCOEFFS) + if (calc_rate) { this_rate += cpi->palette_y_color_cost[n - PALETTE_MIN_SIZE][color_ctx] [color_new_idx]; + } (*t)->token = color_new_idx; - (*t)->context_tree = probs[n - PALETTE_MIN_SIZE][color_ctx]; + (*t)->palette_cdf = palette_cdf[n - PALETTE_MIN_SIZE][color_ctx]; (*t)->skip_eob_node = 0; ++(*t); } @@ -434,17 +440,13 @@ static void tokenize_b(int plane, int block, int blk_row, int blk_col, const int segment_id = mbmi->segment_id; #endif // CONFIG_SUEPRTX const int16_t *scan, *nb; - const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size); - const SCAN_ORDER *const scan_order = - get_scan(cm, tx_size, tx_type, is_inter_block(mbmi)); + const TX_TYPE tx_type = + av1_get_tx_type(type, xd, blk_row, blk_col, block, tx_size); + const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi); const int ref = is_inter_block(mbmi); unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] = td->rd_counts.coef_counts[txsize_sqr_map[tx_size]][type][ref]; -#if CONFIG_EC_ADAPT FRAME_CONTEXT *ec_ctx = xd->tile_ctx; -#else - FRAME_CONTEXT *ec_ctx = cpi->common.fc; -#endif aom_cdf_prob( *const coef_head_cdfs)[COEFF_CONTEXTS][CDF_SIZE(ENTROPY_TOKENS)] = ec_ctx->coef_head_cdfs[txsize_sqr_map[tx_size]][type][ref]; @@ -595,16 +597,31 @@ void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, cost_coeffs_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); #endif } else { +#if CONFIG_RECT_TX_EXT + int is_qttx = plane_tx_size == quarter_txsize_lookup[plane_bsize]; + const TX_SIZE sub_txs = is_qttx ? 
plane_tx_size : sub_tx_size_map[tx_size]; +#else // Half the block size in transform block unit. const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; +#endif const int bsl = tx_size_wide_unit[sub_txs]; int i; assert(bsl > 0); for (i = 0; i < 4; ++i) { +#if CONFIG_RECT_TX_EXT + int is_wide_tx = tx_size_wide_unit[sub_txs] > tx_size_high_unit[sub_txs]; + const int offsetr = + is_qttx ? (is_wide_tx ? i * tx_size_high_unit[sub_txs] : 0) + : blk_row + ((i >> 1) * bsl); + const int offsetc = + is_qttx ? (is_wide_tx ? 0 : i * tx_size_wide_unit[sub_txs]) + : blk_col + ((i & 0x01) * bsl); +#else const int offsetr = blk_row + ((i >> 1) * bsl); const int offsetc = blk_col + ((i & 0x01) * bsl); +#endif int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; @@ -666,7 +683,7 @@ void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, } #endif const struct macroblockd_plane *const pd = &xd->plane[plane]; -#if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2 +#if CONFIG_CHROMA_SUB8X8 const BLOCK_SIZE plane_bsize = AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd)); #else @@ -681,14 +698,30 @@ void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int idx, idy; int block = 0; int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; - for (idy = 0; idy < mi_height; idy += bh) { - for (idx = 0; idx < mi_width; idx += bw) { - tokenize_vartx(td, t, dry_run, max_tx_size, plane_bsize, idy, idx, - block, plane, &arg); - block += step; + + const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd); + int mu_blocks_wide = + block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]; + int mu_blocks_high = + block_size_high[max_unit_bsize] >> tx_size_high_log2[0]; + + mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide); + mu_blocks_high = AOMMIN(mi_height, mu_blocks_high); + + for (idy = 0; idy < mi_height; idy += mu_blocks_high) { + for (idx = 0; idx < mi_width; idx += mu_blocks_wide) { + int blk_row, blk_col; + const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height); + const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width); + for (blk_row = idy; blk_row < unit_height; blk_row += bh) { + for (blk_col = idx; blk_col < unit_width; blk_col += bw) { + tokenize_vartx(td, t, dry_run, max_tx_size, plane_bsize, blk_row, + blk_col, block, plane, &arg); + block += step; + } + } } } - #if !CONFIG_LV_MAP if (!dry_run) { (*t)->token = EOSB_TOKEN; diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h index cbfa3cd91..73f0305fa 100644 --- a/third_party/aom/av1/encoder/tokenize.h +++ b/third_party/aom/av1/encoder/tokenize.h @@ -37,6 +37,9 @@ typedef struct { typedef struct { aom_cdf_prob (*tail_cdf)[CDF_SIZE(ENTROPY_TOKENS)]; aom_cdf_prob (*head_cdf)[CDF_SIZE(ENTROPY_TOKENS)]; +#if CONFIG_PALETTE + aom_cdf_prob *palette_cdf; +#endif // CONFIG_PALETTE int eob_val; int first_val; const aom_prob *context_tree; diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c new file mode 100644 index 000000000..c8d4ccb70 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./av1_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { + const __m128i zero = _mm_setzero_si128(); + const __m128i dc = _mm_unpacklo_epi16(*p, zero); + const __m128i ac = _mm_unpackhi_epi16(*p, zero); + *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static INLINE void update_qp(__m256i *qp) { + qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11); + qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11); + qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11); +} + +static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *dequant_ptr, int log_scale, + __m256i *qp) { + __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); + round = _mm_srai_epi16(round, log_scale); + const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); + + init_one_qp(&round, &qp[0]); + init_one_qp(&quant, &qp[1]); + init_one_qp(&dequant, &qp[2]); +} + +static INLINE void quantize(const __m256i *qp, __m256i *c, + const int16_t *iscan_ptr, int log_scale, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) { + const __m256i abs = _mm256_abs_epi32(*c); + __m256i q = _mm256_add_epi32(abs, qp[0]); + + __m256i q_lo = _mm256_mul_epi32(q, qp[1]); + __m256i q_hi = _mm256_srli_epi64(q, 32); + const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32); + q_hi = _mm256_mul_epi32(q_hi, qp_hi); + q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale); + q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale); + q_hi = _mm256_slli_epi64(q_hi, 32); + q = _mm256_or_si256(q_lo, q_hi); + + __m256i dq = _mm256_mullo_epi32(q, qp[2]); + dq = _mm256_srai_epi32(dq, log_scale); + q = _mm256_sign_epi32(q, *c); + dq = _mm256_sign_epi32(dq, *c); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr); + const __m128i zr = _mm_setzero_si128(); + const __m128i lo = _mm_unpacklo_epi16(isc, zr); + const __m128i hi = _mm_unpackhi_epi16(isc, zr); + const __m256i iscan = + _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); + + const __m256i zero = _mm256_setzero_si256(); + const __m256i zc = _mm256_cmpeq_epi32(dq, zero); + const __m256i nz = _mm256_cmpeq_epi32(zc, zero); + __m256i cur_eob = _mm256_sub_epi32(iscan, nz); + cur_eob = _mm256_and_si256(cur_eob, nz); + *eob = _mm256_max_epi32(cur_eob, *eob); +} + +void av1_highbd_quantize_fp_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, int log_scale) { + (void)scan; + (void)zbin_ptr; + (void)quant_shift_ptr; + const unsigned int step = 8; + + if (LIKELY(!skip_block)) { + __m256i qp[3], coeff; + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp); + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + + __m256i eob = 
_mm256_setzero_si256(); + quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + while (n_coeffs > 0) { + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), + _mm256_extractf128_si256(eob, 1)); + *eob_ptr = _mm_extract_epi16(final_eob, 0); + } + } else { + do { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero); + qcoeff_ptr += step; + dqcoeff_ptr += step; + n_coeffs -= step; + } while (n_coeffs > 0); + *eob_ptr = 0; + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c index fa5626002..8d717a083 100644 --- a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c +++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c @@ -133,9 +133,10 @@ void av1_highbd_quantize_fp_sse4_1( coeff[0] = _mm_loadu_si128((__m128i const *)src); qparam[0] = - _mm_set_epi32(round_ptr[1], round_ptr[1], round_ptr[1], round_ptr[0]); - qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[0]); - qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[0]); + _mm_set_epi32(round_ptr[1] >> log_scale, round_ptr[1] >> log_scale, + round_ptr[1] >> log_scale, round_ptr[0] >> log_scale); + qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[0]); + qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[0]); // DC and first 3 AC quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, @@ -143,8 +144,8 @@ void av1_highbd_quantize_fp_sse4_1( // update round/quan/dquan for AC qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); - qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[1]); - qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[1]); + qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[1]); + qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[1]); quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, quanAddr, dquanAddr); diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c new file mode 100644 index 000000000..1c0a120ca --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "./av1_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +static INLINE void read_coeff(const tran_low_t *coeff, __m256i *c) { +#if CONFIG_HIGHBITDEPTH + const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1); + *c = _mm256_packs_epi32(x0, x1); + *c = _mm256_permute4x64_epi64(*c, 0xD8); +#else + *c = _mm256_loadu_si256((const __m256i *)coeff); +#endif +} + +static INLINE void write_zero(tran_low_t *qcoeff) { + const __m256i zero = _mm256_setzero_si256(); +#if CONFIG_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)qcoeff + 1, zero); +#else + _mm256_storeu_si256((__m256i *)qcoeff, zero); +#endif +} + +static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { + const __m128i ac = _mm_unpackhi_epi64(*p, *p); + *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1); +} + +static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *dequant_ptr, int log_scale, + __m256i *thr, __m256i *qp) { + __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); + const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); + + if (log_scale > 0) { + const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1)); + round = _mm_add_epi16(round, rnd); + round = _mm_srai_epi16(round, log_scale); + } + + init_one_qp(&round, &qp[0]); + init_one_qp(&quant, &qp[1]); + + if (log_scale > 0) { + qp[1] = _mm256_slli_epi16(qp[1], log_scale); + } + + init_one_qp(&dequant, &qp[2]); + *thr = _mm256_srai_epi16(qp[2], 1 + log_scale); +} + +static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) { + qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11); + qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11); + qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11); + *thr = _mm256_srai_epi16(qp[2], 1 + log_scale); +} + +#define store_quan(q, addr) \ + do { \ + __m256i sign_bits = _mm256_srai_epi16(q, 15); \ + __m256i y0 = _mm256_unpacklo_epi16(q, sign_bits); \ + __m256i y1 = _mm256_unpackhi_epi16(q, sign_bits); \ + __m256i x0 = _mm256_permute2x128_si256(y0, y1, 0x20); \ + __m256i x1 = _mm256_permute2x128_si256(y0, y1, 0x31); \ + _mm256_storeu_si256((__m256i *)addr, x0); \ + _mm256_storeu_si256((__m256i *)addr + 1, x1); \ + } while (0) + +#if CONFIG_HIGHBITDEPTH +#define store_two_quan(q, addr1, dq, addr2) \ + do { \ + store_quan(q, addr1); \ + store_quan(dq, addr2); \ + } while (0) +#else +#define store_two_quan(q, addr1, dq, addr2) \ + do { \ + _mm256_storeu_si256((__m256i *)addr1, q); \ + _mm256_storeu_si256((__m256i *)addr2, dq); \ + } while (0) +#endif + +static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c, + const int16_t *iscan_ptr, tran_low_t *qcoeff, + tran_low_t *dqcoeff, __m256i *eob) { + const __m256i abs = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr)); + const int nzflag = _mm256_movemask_epi8(mask); + + if (nzflag) { + __m256i q = _mm256_adds_epi16(abs, qp[0]); + q = _mm256_mulhi_epi16(q, qp[1]); + q = _mm256_sign_epi16(q, *c); + const __m256i dq = _mm256_mullo_epi16(q, qp[2]); + + store_two_quan(q, qcoeff, dq, dqcoeff); + const __m256i zero = _mm256_setzero_si256(); + const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr); + const __m256i zero_coeff = 
_mm256_cmpeq_epi16(dq, zero); + const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero); + __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff); + cur_eob = _mm256_and_si256(cur_eob, nzero_coeff); + *eob = _mm256_max_epi16(*eob, cur_eob); + } else { + write_zero(qcoeff); + write_zero(dqcoeff); + } +} + +void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + const unsigned int step = 16; + + if (LIKELY(!skip_block)) { + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 0; + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); + + __m256i eob = _mm256_setzero_si256(); + quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(log_scale, &thr, qp); + + while (n_coeffs > 0) { + read_coeff(coeff_ptr, &coeff); + quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), + _mm256_extractf128_si256(eob, 1)); + *eob_ptr = _mm_extract_epi16(final_eob, 0); + } + } else { + do { + write_zero(qcoeff_ptr); + write_zero(dqcoeff_ptr); + qcoeff_ptr += step; + dqcoeff_ptr += step; + n_coeffs -= step; + } while (n_coeffs > 0); + *eob_ptr = 0; + } +} + +static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp, + __m256i *c, const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) { + const __m256i abs = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs, *thr)); + const int nzflag = _mm256_movemask_epi8(mask); + + if (nzflag) { + __m256i q = _mm256_adds_epi16(abs, qp[0]); + q = _mm256_mulhi_epu16(q, qp[1]); + + __m256i dq = _mm256_mullo_epi16(q, qp[2]); + dq = _mm256_srli_epi16(dq, 1); + + q = _mm256_sign_epi16(q, *c); + dq = _mm256_sign_epi16(dq, *c); + + store_two_quan(q, qcoeff, dq, dqcoeff); + const __m256i zero = _mm256_setzero_si256(); + const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr); + const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero); + const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero); + __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff); + cur_eob = _mm256_and_si256(cur_eob, nzero_coeff); + *eob = _mm256_max_epi16(*eob, cur_eob); + } else { + write_zero(qcoeff); + write_zero(dqcoeff); + } +} + +void av1_quantize_fp_32x32_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, 
const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + const unsigned int step = 16; + + if (LIKELY(!skip_block)) { + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 1; + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); + + __m256i eob = _mm256_setzero_si256(); + quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(log_scale, &thr, qp); + + while (n_coeffs > 0) { + read_coeff(coeff_ptr, &coeff); + quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, + &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), + _mm256_extractf128_si256(eob, 1)); + *eob_ptr = _mm_extract_epi16(final_eob, 0); + } + } else { + do { + write_zero(qcoeff_ptr); + write_zero(dqcoeff_ptr); + qcoeff_ptr += step; + dqcoeff_ptr += step; + n_coeffs -= step; + } while (n_coeffs > 0); + *eob_ptr = 0; + } +} diff --git a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c index 37c4b0d88..496c33395 100644 --- a/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c +++ b/third_party/aom/av1/encoder/x86/dct_intrin_sse2.c @@ -203,8 +203,12 @@ static void fidtx4_sse2(__m128i *in) { #endif // CONFIG_EXT_TX void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { __m128i in[4]; + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif switch (tx_type) { case DCT_DCT: aom_fdct4x4_sse2(input, output, stride); break; @@ -1301,8 +1305,12 @@ static void fidtx8_sse2(__m128i *in) { #endif // CONFIG_EXT_TX void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { __m128i in[8]; + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif switch (tx_type) { case DCT_DCT: aom_fdct8x8_sse2(input, output, stride); break; @@ -2334,8 +2342,12 @@ static void fidtx16_sse2(__m128i *in0, __m128i *in1) { #endif // CONFIG_EXT_TX void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { __m128i in0[16], in1[16]; + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif switch (tx_type) { case DCT_DCT: @@ -2550,8 +2562,12 @@ static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) { } void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { __m128i in[8]; + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif switch (tx_type) { case DCT_DCT: @@ -2724,8 +2740,12 @@ static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) { } void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + 
TxfmParam *txfm_param) { __m128i in[8]; + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif switch (tx_type) { case DCT_DCT: @@ -2864,8 +2884,12 @@ static void row_8x16_rounding(__m128i *in, int bits) { } void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { __m128i in[16]; + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif __m128i *const t = in; // Alias to top 8x8 sub block __m128i *const b = in + 8; // Alias to bottom 8x8 sub block @@ -3045,8 +3069,12 @@ static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in, #define col_16x8_rounding row_8x16_rounding void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { __m128i in[16]; + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif __m128i *const l = in; // Alias to left 8x8 sub block __m128i *const r = in + 8; // Alias to right 8x8 sub block, which we store @@ -3355,8 +3383,12 @@ static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl, // For 16x32, this means the input is a 2x2 grid of such blocks. // For 32x16, it means the input is a 4x1 grid. void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { __m128i intl[16], intr[16], inbl[16], inbr[16]; + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif switch (tx_type) { case DCT_DCT: @@ -3544,8 +3576,12 @@ static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0, } void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { __m128i in0[16], in1[16], in2[16], in3[16]; + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0); switch (tx_type) { @@ -3784,8 +3820,12 @@ static INLINE void write_buffer_32x32(__m128i *in0, __m128i *in1, __m128i *in2, } void av1_fht32x32_sse2(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { __m128i in0[32], in1[32], in2[32], in3[32]; + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "No 32x32 sse2 MRC_DCT implementation"); +#endif load_buffer_32x32(input, in0, in1, in2, in3, stride, 0, 0); switch (tx_type) { diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c index ae733a1ce..20ba4149c 100644 --- a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c +++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c @@ -14,7 +14,20 @@ #include "./av1_rtcd.h" #include "aom/aom_integer.h" -int64_t av1_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, +static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, + __m256i *c) { + const tran_low_t *addr = coeff + offset; +#if CONFIG_HIGHBITDEPTH + const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1); + const __m256i y = _mm256_packs_epi32(x0, x1); + *c = _mm256_permute4x64_epi64(y, 0xD8); +#else + *c = _mm256_loadu_si256((const __m256i *)addr); +#endif +} + 
+int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz) { __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg; __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; @@ -22,16 +35,16 @@ int64_t av1_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, __m128i sse_reg128, ssz_reg128; int64_t sse; int i; - const __m256i zero_reg = _mm256_set1_epi16(0); + const __m256i zero_reg = _mm256_setzero_si256(); // init sse and ssz registerd to zero - sse_reg = _mm256_set1_epi16(0); - ssz_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_setzero_si256(); + ssz_reg = _mm256_setzero_si256(); for (i = 0; i < block_size; i += 16) { // load 32 bytes from coeff and dqcoeff - coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i)); - dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i)); + read_coeff(coeff, i, &coeff_reg); + read_coeff(dqcoeff, i, &dqcoeff_reg); // dqcoeff - coeff dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg); // madd (dqcoeff - coeff) diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c index b56eed518..cab36f2bd 100644 --- a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c +++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c @@ -113,25 +113,13 @@ static void fdct4x4_sse4_1(__m128i *in, int bit) { in[3] = _mm_unpackhi_epi64(v1, v3); } -static INLINE void write_buffer_4x4(__m128i *res, tran_low_t *output) { +static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) { _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); _mm_store_si128((__m128i *)(output + 3 * 4), res[3]); } -// Note: -// We implement av1_fwd_txfm2d_4x4(). 
This function is kept here since -// av1_highbd_fht4x4_c() is not removed yet -void av1_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output, - int stride, int tx_type) { - (void)input; - (void)output; - (void)stride; - (void)tx_type; - assert(0); -} - static void fadst4x4_sse4_1(__m128i *in, int bit) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); @@ -416,7 +404,7 @@ static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) { in[15] = _mm_srai_epi32(in[15], shift); } -static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) { +static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) { _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); @@ -1800,7 +1788,7 @@ static void col_txfm_16x16_rounding(__m128i *in, int shift) { col_txfm_8x8_rounding(&in[48], shift); } -static void write_buffer_16x16(const __m128i *in, tran_low_t *output) { +static void write_buffer_16x16(const __m128i *in, int32_t *output) { const int size_8x8 = 16 * 4; write_buffer_8x8(&in[0], output); output += size_8x8; diff --git a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c index 8495ad1aa..af8e9a5f4 100644 --- a/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c +++ b/third_party/aom/av1/encoder/x86/hybrid_fwd_txfm_avx2.c @@ -18,51 +18,6 @@ #include "aom_dsp/txfm_common.h" #include "aom_dsp/x86/txfm_common_avx2.h" -static int32_t get_16x16_sum(const int16_t *input, int stride) { - __m256i r0, r1, r2, r3, u0, u1; - __m256i zero = _mm256_setzero_si256(); - __m256i sum = _mm256_setzero_si256(); - const int16_t *blockBound = input + (stride << 4); - __m128i v0, v1; - - while (input < blockBound) { - r0 = _mm256_loadu_si256((__m256i const *)input); - r1 = _mm256_loadu_si256((__m256i const *)(input + stride)); - r2 = _mm256_loadu_si256((__m256i const *)(input + 2 * stride)); - r3 = _mm256_loadu_si256((__m256i const *)(input + 3 * stride)); - - u0 = _mm256_add_epi16(r0, r1); - u1 = _mm256_add_epi16(r2, r3); - sum = _mm256_add_epi16(sum, u0); - sum = _mm256_add_epi16(sum, u1); - - input += stride << 2; - } - - // unpack 16 int16_t into 2x8 int32_t - u0 = _mm256_unpacklo_epi16(zero, sum); - u1 = _mm256_unpackhi_epi16(zero, sum); - u0 = _mm256_srai_epi32(u0, 16); - u1 = _mm256_srai_epi32(u1, 16); - sum = _mm256_add_epi32(u0, u1); - - u0 = _mm256_srli_si256(sum, 8); - u1 = _mm256_add_epi32(sum, u0); - - v0 = _mm_add_epi32(_mm256_extracti128_si256(u1, 1), - _mm256_castsi256_si128(u1)); - v1 = _mm_srli_si128(v0, 4); - v0 = _mm_add_epi32(v0, v1); - return (int32_t)_mm_extract_epi32(v0, 0); -} - -void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output, - int stride) { - int32_t dc = get_16x16_sum(input, stride); - output[0] = (tran_low_t)(dc >> 1); - _mm256_zeroupper(); -} - static INLINE void load_buffer_16x16(const int16_t *input, int stride, int flipud, int fliplr, __m256i *in) { if (!flipud) { @@ -959,8 +914,12 @@ static void fidtx16_avx2(__m256i *in) { #endif void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { __m256i in[16]; + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "Invalid tx type for tx size"); +#endif switch (tx_type) { case DCT_DCT: @@ -1084,22 +1043,6 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, 
_mm256_zeroupper(); } -void aom_fdct32x32_1_avx2(const int16_t *input, tran_low_t *output, - int stride) { - // left and upper corner - int32_t sum = get_16x16_sum(input, stride); - // right and upper corner - sum += get_16x16_sum(input + 16, stride); - // left and lower corner - sum += get_16x16_sum(input + (stride << 4), stride); - // right and lower corner - sum += get_16x16_sum(input + (stride << 4) + 16, stride); - - sum >>= 3; - output[0] = (tran_low_t)sum; - _mm256_zeroupper(); -} - static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) { int i = 0; __m256i temp; @@ -1570,9 +1513,13 @@ static void fidtx32_avx2(__m256i *in0, __m256i *in1) { #endif void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride, - int tx_type) { + TxfmParam *txfm_param) { __m256i in0[32]; // left 32 columns __m256i in1[32]; // right 32 columns + int tx_type = txfm_param->tx_type; +#if CONFIG_MRC_TX + assert(tx_type != MRC_DCT && "No avx2 32x32 implementation of MRC_DCT"); +#endif switch (tx_type) { case DCT_DCT: -- cgit v1.2.3
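
For readers of the new AVX2 quantizers added above: per coefficient, av1_quantize_fp_avx2 computes q = ((|c| + round) * quant) >> 16 with the sign of c restored, dq = q * dequant, and derives the end-of-block position from the inverse scan. The scalar sketch below is illustrative only and is not part of the patch; the helper name quantize_fp_scalar_ref is made up, int16_t coefficient storage is assumed (the non-high-bit-depth path), and the dequant/2 skip test is applied per coefficient here, whereas the SIMD kernel applies it to a whole group of 16 coefficients at once.

#include <stdint.h>
#include <stdlib.h>

/* Illustrative scalar reference for the arithmetic vectorized by
 * av1_quantize_fp_avx2 (log_scale == 0 path). Not part of libaom. */
static void quantize_fp_scalar_ref(const int16_t *coeff, int n,
                                   const int16_t round[2],   /* {DC, AC} */
                                   const int16_t quant[2],   /* {DC, AC} */
                                   const int16_t dequant[2], /* {DC, AC} */
                                   int16_t *qcoeff, int16_t *dqcoeff,
                                   const int16_t *iscan, uint16_t *eob_ptr) {
  int eob = 0; /* count of coefficients up to and including the last nonzero */
  for (int i = 0; i < n; ++i) {
    const int k = (i == 0) ? 0 : 1; /* DC uses index 0, AC uses index 1 */
    const int c = coeff[i];
    const int abs_c = abs(c);
    /* Same skip threshold as 'thr' in the AVX2 code: dequant >> 1. */
    if (abs_c >= (dequant[k] >> 1)) {
      int q = ((abs_c + round[k]) * quant[k]) >> 16; /* high half of product */
      if (c < 0) q = -q;                             /* restore the sign */
      qcoeff[i] = (int16_t)q;
      dqcoeff[i] = (int16_t)(q * dequant[k]);        /* low 16 bits, as mullo */
      if (q != 0 && iscan[i] + 1 > eob) eob = iscan[i] + 1;
    } else {
      qcoeff[i] = 0;
      dqcoeff[i] = 0;
    }
  }
  *eob_ptr = (uint16_t)eob;
}

The 32x32 variant follows the same pattern except that round is pre-scaled by log_scale in init_qp, the product is effectively shifted right by 15 instead of 16 (quant is shifted left by one before the high multiply), and the dequantized value is halved before the sign is restored.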