author | trav90 <travawine@palemoon.org> | 2018-10-18 21:53:44 -0500
---|---|---
committer | trav90 <travawine@palemoon.org> | 2018-10-18 21:53:44 -0500
commit | ec910d81405c736a4490383a250299a7837c2e64 (patch)
tree | 4f27cc226f93a863121aef6c56313e4153a69b3e /third_party/aom/av1/encoder/rdopt.c
parent | 01eb57073ba97b2d6cbf20f745dfcc508197adc3 (diff)
Update aom to commit id e87fb2378f01103d5d6e477a4ef6892dc714e614
Diffstat (limited to 'third_party/aom/av1/encoder/rdopt.c')
-rw-r--r-- | third_party/aom/av1/encoder/rdopt.c | 4221
1 file changed, 2345 insertions(+), 1876 deletions(-)
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c
index 43b00b83b..607db9b86 100644
--- a/third_party/aom/av1/encoder/rdopt.c
+++ b/third_party/aom/av1/encoder/rdopt.c
@@ -21,12 +21,16 @@
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
 
+#if CONFIG_CFL
+#include "av1/common/cfl.h"
+#endif
 #include "av1/common/common.h"
 #include "av1/common/common_data.h"
 #include "av1/common/entropy.h"
 #include "av1/common/entropymode.h"
 #include "av1/common/idct.h"
 #include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
 #include "av1/common/pred_common.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/reconinter.h"
@@ -51,19 +55,15 @@
 #endif
 #include "av1/encoder/hybrid_fwd_txfm.h"
 #include "av1/encoder/mcomp.h"
-#if CONFIG_PALETTE
 #include "av1/encoder/palette.h"
-#endif  // CONFIG_PALETTE
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/tokenize.h"
 #if CONFIG_PVQ
 #include "av1/encoder/pvq_encoder.h"
-#endif  // CONFIG_PVQ
-#if CONFIG_PVQ || CONFIG_DAALA_DIST
 #include "av1/common/pvq.h"
-#endif  // CONFIG_PVQ || CONFIG_DIST_8X8
+#endif  // CONFIG_PVQ
 #if CONFIG_DUAL_FILTER
 #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
 #if USE_EXTRA_FILTER
@@ -82,26 +82,36 @@ static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = {
 
 #if CONFIG_EXT_REFS
-#define LAST_FRAME_MODE_MASK                                      \
-  ((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | \
-   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
-#define LAST2_FRAME_MODE_MASK                                     \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST3_FRAME) |  \
-   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
-#define LAST3_FRAME_MODE_MASK                                     \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |  \
-   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
-#define GOLDEN_FRAME_MODE_MASK                                    \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |  \
-   (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
-#define BWDREF_FRAME_MODE_MASK                                    \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |  \
-   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME))
-#define ALTREF_FRAME_MODE_MASK                                    \
-  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |  \
-   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME))
-
-#else
+#define LAST_FRAME_MODE_MASK                                          \
+  ((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |     \
+   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \
+   (1 << ALTREF_FRAME))
+#define LAST2_FRAME_MODE_MASK                                         \
+  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST3_FRAME) |      \
+   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \
+   (1 << ALTREF_FRAME))
+#define LAST3_FRAME_MODE_MASK                                         \
+  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |      \
+   (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | \
+   (1 << ALTREF_FRAME))
+#define GOLDEN_FRAME_MODE_MASK                                        \
+  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |      \
+   (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) |  \
+   (1 << ALTREF_FRAME))
+#define BWDREF_FRAME_MODE_MASK                                        \
+  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |      \
+   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << ALTREF2_FRAME) |  \
+   (1 << ALTREF_FRAME))
+#define ALTREF2_FRAME_MODE_MASK                                       \
+  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |      \
+   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) |   \
+   (1 << ALTREF_FRAME))
+#define ALTREF_FRAME_MODE_MASK                                        \
+  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) |      \
+   (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) |   \
+   (1 << ALTREF2_FRAME))
+
+#else  // !CONFIG_EXT_REFS
 
 #define LAST_FRAME_MODE_MASK \
   ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
@@ -114,11 +124,12 @@ static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = {
 
 #if CONFIG_EXT_REFS
 #if CONFIG_EXT_COMP_REFS
-#define SECOND_REF_FRAME_MASK                                        \
-  ((1 << ALTREF_FRAME) | (1 << BWDREF_FRAME) | (1 << GOLDEN_FRAME) | \
-   (1 << LAST2_FRAME) | 0x01)  // NOLINT
-#else  // !CONFIG_EXT_COMP_REFS
-#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | (1 << BWDREF_FRAME) | 0x01)
+#define SECOND_REF_FRAME_MASK                                         \
+  ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | \
+   (1 << GOLDEN_FRAME) | (1 << LAST2_FRAME) | 0x01)
+#else  // !CONFIG_EXT_COMP_REFS
+#define SECOND_REF_FRAME_MASK \
+  ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | 0x01)
 #endif  // CONFIG_EXT_COMP_REFS
 #else   // !CONFIG_EXT_REFS
 #define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01)
@@ -135,10 +146,16 @@ static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = {
 // Setting this to 1 will disable trellis optimization within the
 // transform search. Trellis optimization will still be applied
 // in the final encode.
+#ifndef DISABLE_TRELLISQ_SEARCH
 #define DISABLE_TRELLISQ_SEARCH 0
+#endif
 
-const double ADST_FLIP_SVM[8] = { -6.6623, -2.8062, -3.2531, 3.1671,   // vert
-                                  -7.7051, -3.2234, -3.6193, 3.4533 };  // horz
+static const double ADST_FLIP_SVM[8] = {
+  /* vertical */
+  -6.6623, -2.8062, -3.2531, 3.1671,
+  /* horizontal */
+  -7.7051, -3.2234, -3.6193, 3.4533
+};
 
 typedef struct {
   PREDICTION_MODE mode;
@@ -166,6 +183,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
   { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
   { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } },
 #endif  // CONFIG_EXT_REFS
   { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
   { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
@@ -177,6 +195,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEWMV, { LAST2_FRAME, NONE_FRAME } },
   { NEWMV, { LAST3_FRAME, NONE_FRAME } },
   { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEWMV, { ALTREF2_FRAME, NONE_FRAME } },
 #endif  // CONFIG_EXT_REFS
   { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
   { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
@@ -186,6 +205,7 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEARMV, { LAST2_FRAME, NONE_FRAME } },
   { NEARMV, { LAST3_FRAME, NONE_FRAME } },
   { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEARMV, { ALTREF2_FRAME, NONE_FRAME } },
 #endif  // CONFIG_EXT_REFS
   { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
   { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
@@ -195,14 +215,13 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { ZEROMV, { LAST2_FRAME, NONE_FRAME } },
   { ZEROMV, { LAST3_FRAME, NONE_FRAME } },
   { ZEROMV, { BWDREF_FRAME, NONE_FRAME } },
+  { ZEROMV, { ALTREF2_FRAME, NONE_FRAME } },
 #endif  // CONFIG_EXT_REFS
   { ZEROMV, { GOLDEN_FRAME, NONE_FRAME } },
   { ZEROMV, { ALTREF_FRAME, NONE_FRAME } },
 
   // TODO(zoeliu): May need to reconsider the order on the modes to check
-#if CONFIG_EXT_INTER
-
 #if CONFIG_COMPOUND_SINGLEREF
   // Single ref comp mode
   { SR_NEAREST_NEARMV, { LAST_FRAME, NONE_FRAME } },
@@ -263,6 +282,10 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
   { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
   { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
 
 #if CONFIG_EXT_COMP_REFS
   { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
@@ -272,40 +295,14 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
 #endif  // CONFIG_EXT_COMP_REFS
 #endif  // CONFIG_EXT_REFS
 
-#else  // CONFIG_EXT_INTER
-
-  { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
-#if CONFIG_EXT_REFS
-  { NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_REFS
-  { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-#if CONFIG_EXT_REFS
-  { NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-
-#if CONFIG_EXT_COMP_REFS
-  { NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
-  { NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
-  { NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
-#endif  // CONFIG_EXT_INTER
-
   { TM_PRED, { INTRA_FRAME, NONE_FRAME } },
 
-#if CONFIG_ALT_INTRA
   { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
 #if CONFIG_SMOOTH_HV
   { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
   { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
 #endif  // CONFIG_SMOOTH_HV
-#endif  // CONFIG_ALT_INTRA
 
-#if CONFIG_EXT_INTER
   { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
@@ -373,6 +370,38 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
   { ZERO_ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } },
 
+  { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { ZERO_ZEROMV, { LAST_FRAME, ALTREF2_FRAME } },
+
+  { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { ZERO_ZEROMV, { LAST2_FRAME, ALTREF2_FRAME } },
+
+  { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { ZERO_ZEROMV, { LAST3_FRAME, ALTREF2_FRAME } },
+
+  { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+  { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
 #if CONFIG_EXT_COMP_REFS
   { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
   { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
@@ -408,64 +437,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
 #endif  // CONFIG_EXT_COMP_REFS
 #endif  // CONFIG_EXT_REFS
 
-#else  // !CONFIG_EXT_INTER
-
-  { NEARMV, { LAST_FRAME, ALTREF_FRAME } },
-  { NEWMV, { LAST_FRAME, ALTREF_FRAME } },
-#if CONFIG_EXT_REFS
-  { NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
-  { NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_REFS
-  { NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-  { NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-
-#if CONFIG_EXT_REFS
-  { NEARMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEWMV, { LAST_FRAME, BWDREF_FRAME } },
-  { NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-  { NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-
-#if CONFIG_EXT_COMP_REFS
-  { NEARMV, { LAST_FRAME, LAST2_FRAME } },
-  { NEWMV, { LAST_FRAME, LAST2_FRAME } },
-  { NEARMV, { LAST_FRAME, LAST3_FRAME } },
-  { NEWMV, { LAST_FRAME, LAST3_FRAME } },
-  { NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
-  { NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
-
-  { ZEROMV, { LAST_FRAME, ALTREF_FRAME } },
-#if CONFIG_EXT_REFS
-  { ZEROMV, { LAST2_FRAME, ALTREF_FRAME } },
-  { ZEROMV, { LAST3_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_REFS
-  { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
-
-#if CONFIG_EXT_REFS
-  { ZEROMV, { LAST_FRAME, BWDREF_FRAME } },
-  { ZEROMV, { LAST2_FRAME, BWDREF_FRAME } },
-  { ZEROMV, { LAST3_FRAME, BWDREF_FRAME } },
-  { ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } },
-
-#if CONFIG_EXT_COMP_REFS
-  { ZEROMV, { LAST_FRAME, LAST2_FRAME } },
-  { ZEROMV, { LAST_FRAME, LAST3_FRAME } },
-  { ZEROMV, { LAST_FRAME, GOLDEN_FRAME } },
-  { ZEROMV, { BWDREF_FRAME, ALTREF_FRAME } },
-#endif  // CONFIG_EXT_COMP_REFS
-#endif  // CONFIG_EXT_REFS
-
-#endif  // CONFIG_EXT_INTER
-
   { H_PRED, { INTRA_FRAME, NONE_FRAME } },
   { V_PRED, { INTRA_FRAME, NONE_FRAME } },
   { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
@@ -475,7 +446,6 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { D117_PRED, { INTRA_FRAME, NONE_FRAME } },
   { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
 
-#if CONFIG_EXT_INTER
   { ZEROMV, { LAST_FRAME, INTRA_FRAME } },
   { NEARESTMV, { LAST_FRAME, INTRA_FRAME } },
   { NEARMV, { LAST_FRAME, INTRA_FRAME } },
@@ -503,37 +473,34 @@ static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
   { NEARESTMV, { BWDREF_FRAME, INTRA_FRAME } },
   { NEARMV, { BWDREF_FRAME, INTRA_FRAME } },
   { NEWMV, { BWDREF_FRAME, INTRA_FRAME } },
+
+  { ZEROMV, { ALTREF2_FRAME, INTRA_FRAME } },
+  { NEARESTMV, { ALTREF2_FRAME, INTRA_FRAME } },
+  { NEARMV, { ALTREF2_FRAME, INTRA_FRAME } },
+  { NEWMV, { ALTREF2_FRAME, INTRA_FRAME } },
 #endif  // CONFIG_EXT_REFS
 
   { ZEROMV, { ALTREF_FRAME, INTRA_FRAME } },
   { NEARESTMV, { ALTREF_FRAME, INTRA_FRAME } },
   { NEARMV, { ALTREF_FRAME, INTRA_FRAME } },
   { NEWMV, { ALTREF_FRAME, INTRA_FRAME } },
-#endif  // CONFIG_EXT_INTER
 };
 
 static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
-  DC_PRED,       H_PRED,        V_PRED,
-#if CONFIG_ALT_INTRA
-  SMOOTH_PRED,
-#endif  // CONFIG_ALT_INTRA
-  TM_PRED,
-#if CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV
+  DC_PRED,       H_PRED,        V_PRED,    SMOOTH_PRED, TM_PRED,
+#if CONFIG_SMOOTH_HV
   SMOOTH_V_PRED, SMOOTH_H_PRED,
-#endif  // CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV
-  D135_PRED,     D207_PRED,     D153_PRED, D63_PRED, D117_PRED, D45_PRED,
+#endif  // CONFIG_SMOOTH_HV
+  D135_PRED,     D207_PRED,     D153_PRED, D63_PRED,    D117_PRED, D45_PRED,
 };
 
 #if CONFIG_CFL
 static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
-  UV_DC_PRED,       UV_H_PRED,        UV_V_PRED,
-#if CONFIG_ALT_INTRA
-  UV_SMOOTH_PRED,
-#endif  // CONFIG_ALT_INTRA
-  UV_TM_PRED,
-#if CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV
+  UV_DC_PRED,       UV_CFL_PRED,      UV_H_PRED,
+  UV_V_PRED,        UV_SMOOTH_PRED,   UV_TM_PRED,
+#if CONFIG_SMOOTH_HV
   UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED,
-#endif  // CONFIG_ALT_INTRA && CONFIG_SMOOTH_HV
+#endif  // CONFIG_SMOOTH_HV
   UV_D135_PRED,     UV_D207_PRED,     UV_D153_PRED,
   UV_D63_PRED,      UV_D117_PRED,     UV_D45_PRED,
 };
@@ -541,7 +508,6 @@ static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
 #define uv_rd_search_mode_order intra_rd_search_mode_order
 #endif  // CONFIG_CFL
 
-#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
 static INLINE int write_uniform_cost(int n, int v) {
   const int l = get_unsigned_bits(n);
   const int m = (1 << l) - n;
@@ -551,7 +517,6 @@ static INLINE int write_uniform_cost(int n, int v) {
   else
     return l * av1_cost_bit(128, 0);
 }
-#endif  // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
 
 // constants for prune 1 and prune 2 decision boundaries
 #define FAST_EXT_TX_CORR_MID 0.0
@@ -559,7 +524,82 @@ static INLINE int write_uniform_cost(int n, int v) {
 #define FAST_EXT_TX_CORR_MARGIN 0.5
 #define FAST_EXT_TX_EDST_MARGIN 0.3
 
-#if CONFIG_DAALA_DIST
+static unsigned pixel_dist_visible_only(
+    const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src,
+    const int src_stride, const uint8_t *dst, const int dst_stride,
+    const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows,
+    int visible_cols) {
+  unsigned sse;
+
+  if (txb_rows == visible_rows && txb_cols == visible_cols
+#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
+      && tx_bsize < BLOCK_SIZES
+#endif
+      ) {
+    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+    return sse;
+  }
+#if CONFIG_HIGHBITDEPTH
+  const MACROBLOCKD *xd = &x->e_mbd;
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
+                                             visible_cols, visible_rows);
+    return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2);
+  }
+#else
+  (void)x;
+#endif  // CONFIG_HIGHBITDEPTH
+  sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols,
+                         visible_rows);
+  return sse;
+}
+
+#if CONFIG_DIST_8X8
+static uint64_t cdef_dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
+                                    int sstride, int coeff_shift) {
+  uint64_t svar = 0;
+  uint64_t dvar = 0;
+  uint64_t sum_s = 0;
+  uint64_t sum_d = 0;
+  uint64_t sum_s2 = 0;
+  uint64_t sum_d2 = 0;
+  uint64_t sum_sd = 0;
+  uint64_t dist = 0;
+
+  int i, j;
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 8; j++) {
+      sum_s += src[i * sstride + j];
+      sum_d += dst[i * dstride + j];
+      sum_s2 += src[i * sstride + j] * src[i * sstride + j];
+      sum_d2 += dst[i * dstride + j] * dst[i * dstride + j];
+      sum_sd += src[i * sstride + j] * dst[i * dstride + j];
+    }
+  }
+  /* Compute the variance -- the calculation cannot go negative. */
+  svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
+  dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
+
+  // Tuning of jm's original dering distortion metric used in CDEF tool,
+  // suggested by jm
+  const uint64_t a = 4;
+  const uint64_t b = 2;
+  const uint64_t c1 = (400 * a << 2 * coeff_shift);
+  const uint64_t c2 = (b * 20000 * a * a << 4 * coeff_shift);
+
+  dist =
+      (uint64_t)floor(.5 +
+                      (sum_d2 + sum_s2 - 2 * sum_sd) * .5 * (svar + dvar + c1) /
+                          (sqrt(svar * (double)dvar + c2)));
+
+  // Calibrate dist to have similar rate for the same QP with MSE only
+  // distortion (as in master branch)
+  dist = (uint64_t)((float)dist * 0.75);
+
+  return dist;
+}
+
 static int od_compute_var_4x4(uint16_t *x, int stride) {
   int sum;
   int s2;
@@ -617,7 +657,7 @@ static double od_compute_dist_8x8(int use_activity_masking, uint16_t *x,
     }
   }
   /* We use a different variance statistic depending on whether activity
-     masking is used, since the harmonic mean appeared slghtly worse with
+     masking is used, since the harmonic mean appeared slightly worse with
      masking off. The calibration constant just ensures that we preserve the
      rate compared to activity=1. */
@@ -688,268 +728,241 @@ static double od_compute_dist_common(int activity_masking, uint16_t *x,
 
 static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w,
                               int bsize_h, int qindex) {
-  int i;
-  double sum;
-  sum = 0;
-
   assert(bsize_w >= 8 && bsize_h >= 8);
-
 #if CONFIG_PVQ
   int activity_masking = 1;
 #else
   int activity_masking = 0;
 #endif
-  {
-    int j;
-    DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]);
-    DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
-    DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
-    int mid = OD_DIST_LP_MID;
-    for (i = 0; i < bsize_h; i++) {
-      for (j = 0; j < bsize_w; j++) {
-        e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j];
-      }
+  int i, j;
+  DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
+  for (i = 0; i < bsize_h; i++) {
+    for (j = 0; j < bsize_w; j++) {
+      e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j];
     }
-    for (i = 0; i < bsize_h; i++) {
-      tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
-      tmp[i * bsize_w + bsize_w - 1] =
-          mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
-      for (j = 1; j < bsize_w - 1; j++) {
-        tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] +
-                               e[i * bsize_w + j - 1] + e[i * bsize_w + j + 1];
-      }
+  }
+  int mid = OD_DIST_LP_MID;
+  for (i = 0; i < bsize_h; i++) {
+    tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
+    tmp[i * bsize_w + bsize_w - 1] =
+        mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
+    for (j = 1; j < bsize_w - 1; j++) {
+      tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + e[i * bsize_w + j - 1] +
+                             e[i * bsize_w + j + 1];
     }
-    sum = od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
-                                 qindex, tmp, e_lp);
   }
-  return sum;
+  return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
+                                qindex, tmp, e_lp);
 }
 
 static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w,
                                    int bsize_h, int qindex) {
-  int i;
-  double sum;
-  sum = 0;
-
   assert(bsize_w >= 8 && bsize_h >= 8);
-
 #if CONFIG_PVQ
   int activity_masking = 1;
 #else
   int activity_masking = 0;
 #endif
-  {
-    int j;
-    DECLARE_ALIGNED(16, uint16_t, y[MAX_TX_SQUARE]);
-    DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
-    DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
-    int mid = OD_DIST_LP_MID;
-    for (i = 0; i < bsize_h; i++) {
-      for (j = 0; j < bsize_w; j++) {
-        y[i * bsize_w + j] = x[i * bsize_w + j] - e[i * bsize_w + j];
-      }
+  DECLARE_ALIGNED(16, uint16_t, y[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
+  int i, j;
+  for (i = 0; i < bsize_h; i++) {
+    for (j = 0; j < bsize_w; j++) {
+      y[i * bsize_w + j] = x[i * bsize_w + j] - e[i * bsize_w + j];
     }
-    for (i = 0; i < bsize_h; i++) {
-      tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
-      tmp[i * bsize_w + bsize_w - 1] =
-          mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
-      for (j = 1; j < bsize_w - 1; j++) {
-        tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] +
-                               e[i * bsize_w + j - 1] + e[i * bsize_w + j + 1];
-      }
+  }
+  int mid = OD_DIST_LP_MID;
+  for (i = 0; i < bsize_h; i++) {
+    tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
+    tmp[i * bsize_w + bsize_w - 1] =
+        mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
+    for (j = 1; j < bsize_w - 1; j++) {
+      tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + e[i * bsize_w + j - 1] +
+                             e[i * bsize_w + j + 1];
     }
-    sum = od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
-                                 qindex, tmp, e_lp);
   }
-  return sum;
+  return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
+                                qindex, tmp, e_lp);
 }
-#endif  // CONFIG_DAALA_DIST
 
-#if CONFIG_DIST_8X8
-#define NEW_FUTURE_DIST 0
-int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCKD *xd,
+int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
                      const uint8_t *src, int src_stride, const uint8_t *dst,
                      int dst_stride, const BLOCK_SIZE tx_bsize, int bsw,
                      int bsh, int visible_w, int visible_h, int qindex) {
   int64_t d = 0;
-
-#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST
   int i, j;
+  const MACROBLOCKD *xd = &x->e_mbd;
 
   DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]);
   DECLARE_ALIGNED(16, uint16_t, rec[MAX_TX_SQUARE]);
-  (void)cpi;
-  (void)tx_bsize;
-#endif  // CONFIG_DAALA_DIST || NEW_FUTURE_DIST
 
-#if !CONFIG_HIGHBITDEPTH
-  (void)xd;
-#endif
+  assert(bsw >= 8);
+  assert(bsh >= 8);
+  assert((bsw & 0x07) == 0);
+  assert((bsh & 0x07) == 0);
 
-#if !CONFIG_DAALA_DIST
-  (void)qindex;
-#endif
-
-#if !CONFIG_DAALA_DIST || !NEW_FUTURE_DIST
-  (void)xd;
-  (void)bsw, (void)bsh;
-  (void)visible_w, (void)visible_h;
-#endif
-
-#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST
+  if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
+      x->tune_metric == AOM_TUNE_DAALA_DIST) {
 #if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    for (j = 0; j < bsh; j++)
-      for (i = 0; i < bsw; i++)
-        orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
-
-    if ((bsw == visible_w) && (bsh == visible_h)) {
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       for (j = 0; j < bsh; j++)
         for (i = 0; i < bsw; i++)
-          rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
-    } else {
-      for (j = 0; j < visible_h; j++)
-        for (i = 0; i < visible_w; i++)
-          rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
+          orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
 
-      if (visible_w < bsw) {
+      if ((bsw == visible_w) && (bsh == visible_h)) {
         for (j = 0; j < bsh; j++)
-          for (i = visible_w; i < bsw; i++)
-            rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
-      }
-
-      if (visible_h < bsh) {
-        for (j = visible_h; j < bsh; j++)
           for (i = 0; i < bsw; i++)
-            rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
+            rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
+      } else {
+        for (j = 0; j < visible_h; j++)
+          for (i = 0; i < visible_w; i++)
+            rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
+
+        if (visible_w < bsw) {
+          for (j = 0; j < bsh; j++)
+            for (i = visible_w; i < bsw; i++)
+              rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
+        }
+
+        if (visible_h < bsh) {
+          for (j = visible_h; j < bsh; j++)
+            for (i = 0; i < bsw; i++)
+              rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
+        }
       }
-    }
-  } else {
+    } else {
 #endif
-    for (j = 0; j < bsh; j++)
-      for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
-
-    if ((bsw == visible_w) && (bsh == visible_h)) {
       for (j = 0; j < bsh; j++)
-        for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i];
-    } else {
-      for (j = 0; j < visible_h; j++)
-        for (i = 0; i < visible_w; i++)
-          rec[j * bsw + i] = dst[j * dst_stride + i];
+        for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
 
-      if (visible_w < bsw) {
+      if ((bsw == visible_w) && (bsh == visible_h)) {
         for (j = 0; j < bsh; j++)
-          for (i = visible_w; i < bsw; i++)
-            rec[j * bsw + i] = src[j * src_stride + i];
-      }
+          for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i];
+      } else {
+        for (j = 0; j < visible_h; j++)
+          for (i = 0; i < visible_w; i++)
+            rec[j * bsw + i] = dst[j * dst_stride + i];
+
+        if (visible_w < bsw) {
+          for (j = 0; j < bsh; j++)
+            for (i = visible_w; i < bsw; i++)
+              rec[j * bsw + i] = src[j * src_stride + i];
+        }
 
-      if (visible_h < bsh) {
-        for (j = visible_h; j < bsh; j++)
-          for (i = 0; i < bsw; i++) rec[j * bsw + i] = src[j * src_stride + i];
+        if (visible_h < bsh) {
+          for (j = visible_h; j < bsh; j++)
+            for (i = 0; i < bsw; i++)
+              rec[j * bsw + i] = src[j * src_stride + i];
+        }
       }
-    }
 #if CONFIG_HIGHBITDEPTH
-  }
+    }
 #endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_DAALA_DIST || NEW_FUTURE_DIST
+  }
 
-#if CONFIG_DAALA_DIST
-  d = (int64_t)od_compute_dist(orig, rec, bsw, bsh, qindex);
-#elif NEW_FUTURE_DIST
-  // Call new 8x8-wise distortion function here, for example
-  for (i = 0; i < bsh; i += 8) {
-    for (j = 0; j < bsw; j += 8) {
-      d +=
-          av1_compute_dist_8x8(&orig[i * bsw + j], &rec[i * bsw + j], bsw, bsh);
+  if (x->tune_metric == AOM_TUNE_DAALA_DIST) {
+    d = (int64_t)od_compute_dist(orig, rec, bsw, bsh, qindex);
+  } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) {
+    int coeff_shift = AOMMAX(xd->bd - 8, 0);
+
+    for (i = 0; i < bsh; i += 8) {
+      for (j = 0; j < bsw; j += 8) {
+        d += cdef_dist_8x8_16bit(&rec[i * bsw + j], bsw, &orig[i * bsw + j],
+                                 bsw, coeff_shift);
+      }
     }
+#if CONFIG_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      d = ((uint64_t)d) >> 2 * coeff_shift;
+#endif
+  } else {
+    // Otherwise, MSE by default
+    d = pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride,
+                                tx_bsize, bsh, bsw, visible_h, visible_w);
   }
-#else
-  // Otherwise, MSE by default
-  unsigned sse;
-  // TODO(Any): Use even faster function which does not calculate variance
-  cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
-  d = sse;
-#endif  // CONFIG_DAALA_DIST
 
   return d;
 }
 
-static int64_t av1_dist_8x8_diff(const MACROBLOCKD *xd, const uint8_t *src,
+static int64_t av1_dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
                                  int src_stride, const int16_t *diff,
                                  int diff_stride, int bsw, int bsh,
                                  int visible_w, int visible_h, int qindex) {
   int64_t d = 0;
-
-#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST
   int i, j;
+  const MACROBLOCKD *xd = &x->e_mbd;
 
   DECLARE_ALIGNED(16, uint16_t, orig[MAX_TX_SQUARE]);
   DECLARE_ALIGNED(16, int16_t, diff16[MAX_TX_SQUARE]);
-#endif  // CONFIG_DAALA_DIST || NEW_FUTURE_DIST
 
-#if !CONFIG_HIGHBITDEPTH
-  (void)xd;
-#endif
+  assert(bsw >= 8);
+  assert(bsh >= 8);
+  assert((bsw & 0x07) == 0);
+  assert((bsh & 0x07) == 0);
 
-#if !CONFIG_DAALA_DIST
-  (void)qindex;
-#endif
-
-#if !CONFIG_DAALA_DIST || !NEW_FUTURE_DIST
-  (void)xd;
-  (void)src, (void)src_stride;
-  (void)bsw, (void)bsh;
-  (void)visible_w, (void)visible_h;
-#endif
-
-#if CONFIG_DAALA_DIST || NEW_FUTURE_DIST
+  if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
+      x->tune_metric == AOM_TUNE_DAALA_DIST) {
 #if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    for (j = 0; j < bsh; j++)
-      for (i = 0; i < bsw; i++)
-        orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
-  } else {
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      for (j = 0; j < bsh; j++)
+        for (i = 0; i < bsw; i++)
+          orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
+    } else {
 #endif
-    for (j = 0; j < bsh; j++)
-      for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
+      for (j = 0; j < bsh; j++)
+        for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
 #if CONFIG_HIGHBITDEPTH
-  }
+    }
 #endif  // CONFIG_HIGHBITDEPTH
 
-  if ((bsw == visible_w) && (bsh == visible_h)) {
-    for (j = 0; j < bsh; j++)
-      for (i = 0; i < bsw; i++) diff16[j * bsw + i] = diff[j * diff_stride + i];
-  } else {
-    for (j = 0; j < visible_h; j++)
-      for (i = 0; i < visible_w; i++)
-        diff16[j * bsw + i] = diff[j * diff_stride + i];
-
-    if (visible_w < bsw) {
+    if ((bsw == visible_w) && (bsh == visible_h)) {
       for (j = 0; j < bsh; j++)
-        for (i = visible_w; i < bsw; i++) diff16[j * bsw + i] = 0;
-    }
+        for (i = 0; i < bsw; i++)
+          diff16[j * bsw + i] = diff[j * diff_stride + i];
+    } else {
+      for (j = 0; j < visible_h; j++)
+        for (i = 0; i < visible_w; i++)
+          diff16[j * bsw + i] = diff[j * diff_stride + i];
 
-    if (visible_h < bsh) {
-      for (j = visible_h; j < bsh; j++)
-        for (i = 0; i < bsw; i++) diff16[j * bsw + i] = 0;
+      if (visible_w < bsw) {
+        for (j = 0; j < bsh; j++)
+          for (i = visible_w; i < bsw; i++) diff16[j * bsw + i] = 0;
+      }
+
+      if (visible_h < bsh) {
+        for (j = visible_h; j < bsh; j++)
+          for (i = 0; i < bsw; i++) diff16[j * bsw + i] = 0;
+      }
     }
   }
-#endif  // CONFIG_DAALA_DIST || NEW_FUTURE_DIST
 
-#if CONFIG_DAALA_DIST
-  d = (int64_t)od_compute_dist_diff(orig, diff16, bsw, bsh, qindex);
-#elif NEW_FUTURE_DIST
-  // Call new 8x8-wise distortion function (with diff inpu) here, for example
-  for (i = 0; i < bsh; i += 8) {
-    for (j = 0; j < bsw; j += 8) {
-      d += av1_compute_dist_8x8_diff(&orig[i * bsw + j], &diff16[i * bsw + j],
-                                     bsw, bsh);
+  if (x->tune_metric == AOM_TUNE_DAALA_DIST) {
+    d = (int64_t)od_compute_dist_diff(orig, diff16, bsw, bsh, qindex);
+  } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) {
+    int coeff_shift = AOMMAX(xd->bd - 8, 0);
+    DECLARE_ALIGNED(16, uint16_t, dst16[MAX_TX_SQUARE]);
+
+    for (i = 0; i < bsh; i++) {
+      for (j = 0; j < bsw; j++) {
+        dst16[i * bsw + j] = orig[i * bsw + j] - diff16[i * bsw + j];
+      }
     }
+
+    for (i = 0; i < bsh; i += 8) {
+      for (j = 0; j < bsw; j += 8) {
+        d += cdef_dist_8x8_16bit(&dst16[i * bsw + j], bsw, &orig[i * bsw + j],
+                                 bsw, coeff_shift);
+      }
+    }
+    // Don't scale 'd' for HBD since it will be done by caller side for diff
+    // input
+  } else {
+    // Otherwise, MSE by default
+    d = aom_sum_squares_2d_i16(diff, diff_stride, visible_w, visible_h);
   }
-#else
-  // Otherwise, MSE by default
-  d = aom_sum_squares_2d_i16(diff, diff_stride, bsw, bsh);
-#endif  // CONFIG_DAALA_DIST
 
   return d;
 }
@@ -1169,6 +1182,17 @@ static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
                          pd->dst.stride);
 }
 
+#if CONFIG_EXT_TX
+// 1D Transforms used in inter set, this needs to be changed if
+// ext_tx_used_inter is changed
+static const int ext_tx_used_inter_1D[EXT_TX_SETS_INTER][TX_TYPES_1D] = {
+  { 1, 0, 0, 0 }, { 1, 1, 1, 1 }, { 1, 1, 1, 1 }, { 1, 0, 0, 1 },
+#if CONFIG_MRC_TX
+  { 1, 0, 0, 1 },
+#endif  // CONFIG_MRC_TX
+};
+#endif  // CONFIG_EXT_TX
+
 static int prune_tx_types(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
                           const MACROBLOCKD *const xd, int tx_set) {
 #if CONFIG_EXT_TX
@@ -1392,22 +1416,18 @@ static int64_t av1_block_error2_c(const tran_low_t *coeff,
                                   const tran_low_t *ref, intptr_t block_size,
                                   int64_t *ssz) {
   int64_t error;
+  int64_t ssz_trash;
   // Use the existing sse codes for calculating distortion of decoded signal:
   // i.e. (orig - decoded)^2
-  error = av1_block_error_fp(coeff, dqcoeff, block_size);
+  error = av1_block_error(coeff, dqcoeff, block_size, &ssz_trash);
   // prediction residue^2 = (orig - ref)^2
-  *ssz = av1_block_error_fp(coeff, ref, block_size);
+  *ssz = av1_block_error(coeff, ref, block_size, &ssz_trash);
   return error;
 }
 #endif  // CONFIG_HIGHBITDEPTH
 #endif  // CONFIG_PVQ
 
 #if !CONFIG_PVQ || CONFIG_VAR_TX
-/* The trailing '0' is a terminator which is used inside av1_cost_coeffs() to
- * decide whether to include cost of a trailing EOB node or not (i.e. we
- * can skip this if the last coefficient in this transform block, e.g. the
- * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
- * were non-zero). */
 #if !CONFIG_LV_MAP
 static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
                        int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order,
@@ -1421,17 +1441,19 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
   const uint16_t *band_count = &band_count_table[tx_size][1];
   const int eob = p->eobs[block];
   const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  const int tx_size_ctx = txsize_sqr_map[tx_size];
-  unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
-      x->token_costs[tx_size_ctx][type][is_inter_block(mbmi)];
+  const TX_SIZE tx_size_ctx = txsize_sqr_map[tx_size];
   uint8_t token_cache[MAX_TX_SQUARE];
   int pt = combine_entropy_contexts(*a, *l);
   int c, cost;
   const int16_t *scan = scan_order->scan;
   const int16_t *nb = scan_order->neighbors;
   const int ref = is_inter_block(mbmi);
-  aom_prob *blockz_probs =
-      cm->fc->blockzero_probs[txsize_sqr_map[tx_size]][type][ref];
+  int(*head_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] =
+      x->token_head_costs[tx_size_ctx][type][ref];
+  int(*tail_token_costs)[COEFF_CONTEXTS][TAIL_TOKENS] =
+      x->token_tail_costs[tx_size_ctx][type][ref];
+  const int seg_eob = av1_get_tx_eob(&cm->seg, mbmi->segment_id, tx_size);
+  int eob_val;
 
 #if CONFIG_HIGHBITDEPTH
   const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
@@ -1446,8 +1468,8 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
   (void)cm;
 
   if (eob == 0) {
-    // single eob token
-    cost = av1_cost_bit(blockz_probs[pt], 0);
+    // block zero
+    cost = (*head_token_costs)[pt][0];
   } else {
     if (use_fast_coef_costing) {
       int band_left = *band_count++;
@@ -1456,10 +1478,13 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
       int v = qcoeff[0];
       int16_t prev_t;
       cost = av1_get_token_cost(v, &prev_t, cat6_bits);
-      cost += (*token_costs)[!prev_t][pt][prev_t];
+      eob_val = (eob == 1) ? EARLY_EOB : NO_EOB;
+      cost += av1_get_coeff_token_cost(
+          prev_t, eob_val, 1, (*head_token_costs)[pt], (*tail_token_costs)[pt]);
 
       token_cache[0] = av1_pt_energy_class[prev_t];
-      ++token_costs;
+      ++head_token_costs;
+      ++tail_token_costs;
 
       // ac tokens
       for (c = 1; c < eob; c++) {
@@ -1468,17 +1493,18 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
 
         v = qcoeff[rc];
         cost += av1_get_token_cost(v, &t, cat6_bits);
-        cost += (*token_costs)[!t][!prev_t][t];
+        eob_val =
+            (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
+        cost += av1_get_coeff_token_cost(t, eob_val, 0,
+                                         (*head_token_costs)[!prev_t],
+                                         (*tail_token_costs)[!prev_t]);
         prev_t = t;
         if (!--band_left) {
           band_left = *band_count++;
-          ++token_costs;
+          ++head_token_costs;
+          ++tail_token_costs;
         }
       }
-
-      // eob token
-      cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
-
     } else {  // !use_fast_coef_costing
       int band_left = *band_count++;
 
@@ -1486,10 +1512,13 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
       int v = qcoeff[0];
       int16_t tok;
       cost = av1_get_token_cost(v, &tok, cat6_bits);
-      cost += (*token_costs)[!tok][pt][tok];
+      eob_val = (eob == 1) ? EARLY_EOB : NO_EOB;
+      cost += av1_get_coeff_token_cost(tok, eob_val, 1, (*head_token_costs)[pt],
+                                       (*tail_token_costs)[pt]);
 
       token_cache[0] = av1_pt_energy_class[tok];
-      ++token_costs;
+      ++head_token_costs;
+      ++tail_token_costs;
 
       // ac tokens
       for (c = 1; c < eob; c++) {
@@ -1498,17 +1527,17 @@ static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
         v = qcoeff[rc];
         cost += av1_get_token_cost(v, &tok, cat6_bits);
         pt = get_coef_context(nb, token_cache, c);
-        cost += (*token_costs)[!tok][pt][tok];
+        eob_val =
+            (c + 1 == eob) ? (c + 1 == seg_eob ? LAST_EOB : EARLY_EOB) : NO_EOB;
+        cost += av1_get_coeff_token_cost(
+            tok, eob_val, 0, (*head_token_costs)[pt], (*tail_token_costs)[pt]);
         token_cache[rc] = av1_pt_energy_class[tok];
         if (!--band_left) {
           band_left = *band_count++;
-          ++token_costs;
+          ++head_token_costs;
+          ++tail_token_costs;
        }
       }
-
-      // eob token
-      pt = get_coef_context(nb, token_cache, c);
-      cost += (*token_costs)[0][pt][EOB_TOKEN];
     }
   }
 
@@ -1520,10 +1549,25 @@ int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
                     int blk_row, int blk_col, int block, TX_SIZE tx_size,
                     const SCAN_ORDER *scan_order, const ENTROPY_CONTEXT *a,
                     const ENTROPY_CONTEXT *l, int use_fast_coef_costing) {
+  const AV1_COMMON *const cm = &cpi->common;
 #if !CONFIG_LV_MAP
   (void)blk_row;
   (void)blk_col;
-  const AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_MRC_TX
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const TX_TYPE tx_type = av1_get_tx_type(xd->plane[plane].plane_type, xd,
                                          blk_row, blk_col, block, tx_size);
+  const int is_inter = is_inter_block(mbmi);
+  if (tx_type == MRC_DCT && ((is_inter && SIGNAL_MRC_MASK_INTER) ||
                             (!is_inter && SIGNAL_MRC_MASK_INTRA))) {
+    const int mrc_mask_cost =
+        av1_cost_color_map(x, plane, block, mbmi->sb_type, tx_size, MRC_MAP);
+    return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l,
                       use_fast_coef_costing) +
+           mrc_mask_cost;
+  }
+#endif
   return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l,
                      use_fast_coef_costing);
 #else  // !CONFIG_LV_MAP
@@ -1545,7 +1589,7 @@ int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
-  return av1_cost_coeffs_txb(cpi, x, plane, blk_row, blk_col, block, tx_size,
+  return av1_cost_coeffs_txb(cm, x, plane, blk_row, blk_col, block, tx_size,
                              &txb_ctx);
 #endif  // !CONFIG_LV_MAP
 }
@@ -1600,31 +1644,16 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
   assert(visible_cols > 0);
 
 #if CONFIG_DIST_8X8
-  if (plane == 0 && txb_cols >= 8 && txb_rows >= 8)
-    return av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride, tx_bsize,
-                        txb_cols, txb_rows, visible_cols, visible_rows,
-                        x->qindex);
+  if (x->using_dist_8x8 && plane == 0 && txb_cols >= 8 && txb_rows >= 8)
+    return (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
                                  tx_bsize, txb_cols, txb_rows, visible_cols,
                                  visible_rows, x->qindex);
 #endif  // CONFIG_DIST_8X8
 
-#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
-  if ((txb_rows == visible_rows && txb_cols == visible_cols) &&
      tx_bsize < BLOCK_SIZES) {
-#else
-  if (txb_rows == visible_rows && txb_cols == visible_cols) {
-#endif
-    unsigned sse;
-    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
-    return sse;
-  }
-#if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    uint64_t sse = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
                                           visible_cols, visible_rows);
-    return (unsigned int)ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-  unsigned sse = aom_sse_odd_size(src, src_stride, dst, dst_stride,
                                  visible_cols, visible_rows);
+  unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst,
                                         dst_stride, tx_bsize, txb_rows,
                                         txb_cols, visible_rows, visible_cols);
+
   return sse;
 }
 
@@ -1649,8 +1678,8 @@ static int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
                      NULL, &visible_cols, &visible_rows);
 
 #if CONFIG_DIST_8X8
-  if (plane == 0 && txb_width >= 8 && txb_height >= 8)
-    return av1_dist_8x8_diff(xd, src, src_stride, diff, diff_stride, txb_width,
+  if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8)
+    return av1_dist_8x8_diff(x, src, src_stride, diff, diff_stride, txb_width,
                             txb_height, visible_cols, visible_rows, x->qindex);
  else
 #endif
@@ -1658,7 +1687,6 @@ static int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
                                 visible_rows);
 }
 
-#if CONFIG_PALETTE || CONFIG_INTRABC
 int av1_count_colors(const uint8_t *src, int stride, int rows, int cols) {
   int val_count[256];
   memset(val_count, 0, sizeof(val_count));
@@ -1693,7 +1721,6 @@ int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
   return n;
 }
 #endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_PALETTE || CONFIG_INTRABC
 
 void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
                     BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col,
@@ -1707,7 +1734,11 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   const struct macroblockd_plane *const pd = &xd->plane[plane];
 #endif  // CONFIG_DIST_8X8
 
-  if (cpi->sf.use_transform_domain_distortion && !CONFIG_DIST_8X8) {
+  if (cpi->sf.use_transform_domain_distortion
+#if CONFIG_DIST_8X8
+      && !x->using_dist_8x8
+#endif
+      ) {
     // Transform domain distortion computation is more efficient as it does
    // not involve an inverse transform, but it is less accurate.
    const int buffer_length = tx_size_2d[tx_size];
@@ -1721,25 +1752,22 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
 #if CONFIG_HIGHBITDEPTH
     const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
     *out_dist = av1_highbd_block_error2_c(coeff, dqcoeff, ref_coeff,
-                                          buffer_length, &this_sse, bd) >>
                shift;
+                                          buffer_length, &this_sse, bd);
 #else
-    *out_dist = av1_block_error2_c(coeff, dqcoeff, ref_coeff, buffer_length,
                                   &this_sse) >>
                shift;
+    *out_dist =
+        av1_block_error2_c(coeff, dqcoeff, ref_coeff, buffer_length, &this_sse);
 #endif  // CONFIG_HIGHBITDEPTH
 #else   // !CONFIG_PVQ
 #if CONFIG_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
       *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length,
-                                         &this_sse, xd->bd) >>
                  shift;
+                                         &this_sse, xd->bd);
    else
 #endif
-      *out_dist =
          av1_block_error(coeff, dqcoeff, buffer_length, &this_sse) >> shift;
+      *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
 #endif  // CONFIG_PVQ
-    *out_sse = this_sse >> shift;
+    *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
+    *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
  } else {
    const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
 #if !CONFIG_PVQ || CONFIG_DIST_8X8
@@ -1808,17 +1836,23 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
      (void)dst;
 #endif  // !CONFIG_PVQ
 
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
+      uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block);
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
      const PLANE_TYPE plane_type = get_plane_type(plane);
      TX_TYPE tx_type =
          av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
      av1_inverse_transform_block(xd, dqcoeff,
-#if CONFIG_LGT
+#if CONFIG_LGT_FROM_PRED
                                  xd->mi[0]->mbmi.mode,
 #endif
+#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
                                  mrc_mask,
+#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
                                  tx_type, tx_size, recon, MAX_TX_SIZE, eob);
 
 #if CONFIG_DIST_8X8
-      if (plane == 0 && (bsw < 8 || bsh < 8)) {
+      if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) {
        // Save decoded pixels for inter block in pd->pred to avoid
        // block_8x8_rd_txfm_daala_dist() need to produce them
        // by calling av1_inverse_transform_block() again.
@@ -1864,12 +1898,23 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
  const AV1_COMP *cpi = args->cpi;
  ENTROPY_CONTEXT *a = args->t_above + blk_col;
  ENTROPY_CONTEXT *l = args->t_left + blk_row;
-#if !CONFIG_TXK_SEL
  const AV1_COMMON *cm = &cpi->common;
-#endif
  int64_t rd1, rd2, rd;
  RD_STATS this_rd_stats;
 
+#if CONFIG_DIST_8X8
+  // If sub8x8 tx, 8x8 or larger partition, and luma channel,
+  // dist-8x8 disables early skip, because the distortion metrics for
+  // sub8x8 tx (MSE) and reference distortion from 8x8 or larger partition
+  // (new distortion metric) are different.
+  // Exception is: dist-8x8 is enabled but still MSE is used,
+  // i.e. "--tune=" encoder option is not used.
+  int disable_early_skip =
+      x->using_dist_8x8 && plane == 0 && plane_bsize >= BLOCK_8X8 &&
+      (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) &&
+      x->tune_metric != AOM_TUNE_PSNR;
+#endif  // CONFIG_DIST_8X8
+
 #if !CONFIG_SUPERTX && !CONFIG_VAR_TX
  assert(tx_size == av1_get_tx_size(plane, xd));
 #endif  // !CONFIG_SUPERTX
@@ -1879,26 +1924,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
  if (args->exit_early) return;
 
  if (!is_inter_block(mbmi)) {
-    av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size);
-#if CONFIG_DPCM_INTRA
-    const int block_raster_idx =
        av1_block_index_to_raster_order(tx_size, block);
-    const PREDICTION_MODE mode = (plane == AOM_PLANE_Y)
                                     ? get_y_mode(xd->mi[0], block_raster_idx)
                                     : get_uv_mode(mbmi->uv_mode);
-    TX_TYPE tx_type =
        av1_get_tx_type((plane == AOM_PLANE_Y) ? PLANE_TYPE_Y : PLANE_TYPE_UV,
                        xd, blk_row, blk_col, block, tx_size);
-    if (av1_use_dpcm_intra(plane, mode, tx_type, mbmi)) {
-      int8_t skip;
-      av1_encode_block_intra_dpcm(cm, x, mode, plane, block, blk_row, blk_col,
                                  plane_bsize, tx_size, tx_type, a, l, &skip);
-      av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
                     tx_size, &this_rd_stats.dist, &this_rd_stats.sse,
                     OUTPUT_HAS_DECODED_PIXELS);
-      goto CALCULATE_RD;
-    }
-#endif  // CONFIG_DPCM_INTRA
+    av1_predict_intra_block_facade(cm, xd, plane, block, blk_col, blk_row,
                                   tx_size);
    av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
  }
@@ -1921,21 +1948,32 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
 #if CONFIG_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
    tmp_dist =
-        av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd) >>
        shift;
+        av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd);
  else
 #endif
-    tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp) >> shift;
+    tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp);
+  tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift);
 
-  if (RDCOST(x->rdmult, 0, tmp_dist) + args->this_rd < args->best_rd) {
+  if (
+#if CONFIG_DIST_8X8
+      disable_early_skip ||
+#endif
+      RDCOST(x->rdmult, 0, tmp_dist) + args->this_rd < args->best_rd) {
    av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
-                   a, l);
+                   a, l, 1);
  } else {
    args->exit_early = 1;
    return;
  }
 #endif  // DISABLE_TRELLISQ_SEARCH
 
+#if CONFIG_MRC_TX
+  if (mbmi->tx_type == MRC_DCT && !mbmi->valid_mrc_mask) {
+    args->exit_early = 1;
+    return;
+  }
+#endif  // CONFIG_MRC_TX
+
  if (!is_inter_block(mbmi)) {
    struct macroblock_plane *const p = &x->plane[plane];
    av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
@@ -1949,19 +1987,15 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
                   OUTPUT_HAS_PREDICTED_PIXELS);
  }
 #if CONFIG_CFL
-  if (plane == AOM_PLANE_Y && x->cfl_store_y) {
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    const int dst_stride = pd->dst.stride;
-    uint8_t *dst =
        &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-    // TODO (ltrudeau) Store sub-8x8 inter blocks when bottom right block is
-    // intra predicted.
-    cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size, plane_bsize);
+  if (plane == AOM_PLANE_Y && xd->cfl->store_y) {
+#if CONFIG_CHROMA_SUB8X8
+    assert(!is_inter_block(mbmi) || plane_bsize < BLOCK_8X8);
+#else
+    assert(!is_inter_block(mbmi));
+#endif  // CONFIG_CHROMA_SUB8X8
+    cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
  }
-#endif
-#if CONFIG_DPCM_INTRA
-CALCULATE_RD : {}
-#endif  // CONFIG_DPCM_INTRA
+#endif  // CONFIG_CFL
  rd = RDCOST(x->rdmult, 0, this_rd_stats.dist);
  if (args->this_rd + rd > args->best_rd) {
    args->exit_early = 1;
@@ -2008,16 +2042,12 @@ CALCULATE_RD : {}
  args->this_rd += rd;
 
 #if CONFIG_DIST_8X8
-  if (!(plane == 0 && plane_bsize >= BLOCK_8X8 &&
        (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))) {
+  if (!disable_early_skip)
 #endif
    if (args->this_rd > args->best_rd) {
      args->exit_early = 1;
      return;
    }
-#if CONFIG_DIST_8X8
-  }
-#endif
 }
 
 #if CONFIG_DIST_8X8
@@ -2033,8 +2063,10 @@ static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
  const uint8_t *src = &p->src.buf[0];
  const uint8_t *dst = &pd->dst.buf[0];
  const int16_t *pred = &pd->pred[0];
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
+  int bw = block_size_wide[bsize];
+  int bh = block_size_high[bsize];
+  int visible_w = bw;
+  int visible_h = bh;
 
  int i, j;
  int64_t rd, rd1, rd2;
@@ -2044,6 +2076,9 @@ static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
  assert((bw & 0x07) == 0);
  assert((bh & 0x07) == 0);
 
+  get_txb_dimensions(xd, 0, bsize, 0, 0, bsize, &bw, &bh, &visible_w,
                     &visible_h);
+
 #if CONFIG_HIGHBITDEPTH
  uint8_t *pred8;
  DECLARE_ALIGNED(16, uint16_t, pred16[MAX_TX_SQUARE]);
@@ -2064,22 +2099,30 @@ static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
  } else {
 #endif
    for (j = 0; j < bh; j++)
-      for (i = 0; i < bw; i++) pred8[j * bw + i] = pred[j * bw + i];
+      for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i];
 #if CONFIG_HIGHBITDEPTH
  }
 #endif  // CONFIG_HIGHBITDEPTH
 
-  tmp1 = av1_dist_8x8(cpi, xd, src, src_stride, pred8, bw, bsize, bw, bh, bw,
                      bh, qindex);
-  tmp2 = av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride, bsize, bw, bh,
                      bw, bh, qindex);
+  tmp1 = (unsigned)av1_dist_8x8(cpi, x, src, src_stride, pred8, bw, bsize, bw,
                                bh, visible_w, visible_h, qindex);
+  tmp2 = (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize,
                                bw, bh, visible_w, visible_h, qindex);
 
  if (!is_inter_block(mbmi)) {
+    if (x->tune_metric == AOM_TUNE_PSNR) {
+      assert(args->rd_stats.sse == tmp1 * 16);
+      assert(args->rd_stats.dist == tmp2 * 16);
+    }
    args->rd_stats.sse = (int64_t)tmp1 * 16;
    args->rd_stats.dist = (int64_t)tmp2 * 16;
  } else {
    // For inter mode, the decoded pixels are provided in pd->pred,
    // while the predicted pixels are in dst.
+    if (x->tune_metric == AOM_TUNE_PSNR) {
+      assert(args->rd_stats.sse == tmp2 * 16);
+      assert(args->rd_stats.dist == tmp1 * 16);
+    }
    args->rd_stats.sse = (int64_t)tmp2 * 16;
    args->rd_stats.dist = (int64_t)tmp1 * 16;
  }
@@ -2116,7 +2159,8 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
  av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
                                         &args);
 #if CONFIG_DIST_8X8
-  if (!args.exit_early && plane == 0 && bsize >= BLOCK_8X8 &&
+  if (x->using_dist_8x8 && !args.exit_early && plane == 0 &&
      bsize >= BLOCK_8X8 &&
      (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))
    dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args);
 #endif
@@ -2174,23 +2218,14 @@ static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x,
  const MACROBLOCKD *const xd = &x->e_mbd;
  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 
-  const int tx_select = cm->tx_mode == TX_MODE_SELECT &&
-#if CONFIG_EXT_PARTITION_TYPES
                        // Currently these block shapes can only use 4x4
                        // transforms
                        mbmi->sb_type != BLOCK_4X16 &&
                        mbmi->sb_type != BLOCK_16X4 &&
-#endif
                        mbmi->sb_type >= BLOCK_8X8;
-
-  if (tx_select) {
+  if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type)) {
    const int is_inter = is_inter_block(mbmi);
-    const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
                                     : intra_tx_size_cat_lookup[bsize];
+    const int32_t tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
                                         : intra_tx_size_cat_lookup[bsize];
    const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
    const int depth = tx_size_to_depth(coded_tx_size);
    const int tx_size_ctx = get_tx_size_context(xd);
-    int r_tx_size = cpi->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
+    int r_tx_size = x->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
 #if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
    if (is_quarter_tx_allowed(xd, mbmi, is_inter) && tx_size != coded_tx_size)
      r_tx_size += av1_cost_bit(cm->fc->quarter_tx_size_prob,
@@ -2202,12 +2237,38 @@ static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x,
  }
 }
 
-// #TODO(angiebird): use this function whenever it's possible
-int av1_tx_type_cost(const AV1_COMP *cpi, const MACROBLOCKD *xd,
                     BLOCK_SIZE bsize, int plane, TX_SIZE tx_size,
                     TX_TYPE tx_type) {
+#if CONFIG_LGT_FROM_PRED
+int av1_lgt_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
                 const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
                 TX_SIZE tx_size, int use_lgt) {
+  if (plane > 0) return 0;
+  const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const int is_inter = is_inter_block(mbmi);
+
+  assert(is_lgt_allowed(mbmi->mode, tx_size));
+  if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 &&
      !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+    const int ext_tx_set =
        get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
+    if (LGT_FROM_PRED_INTRA && !is_inter && ext_tx_set > 0 &&
        ALLOW_INTRA_EXT_TX)
+      return x->intra_lgt_cost[txsize_sqr_map[tx_size]][mbmi->mode][use_lgt];
+    if (LGT_FROM_PRED_INTRA && is_inter && ext_tx_set > 0)
+      return x->inter_lgt_cost[txsize_sqr_map[tx_size]][use_lgt];
+  }
+  return 0;
+}
+#endif  // CONFIG_LGT_FROM_PRED
+
+// TODO(angiebird): use this function whenever it's possible
+int av1_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
                     const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
                     TX_SIZE tx_size, TX_TYPE tx_type) {
  if (plane > 0) return 0;
 
+#if CONFIG_LGT_FROM_PRED
+  assert(!xd->mi[0]->mbmi.use_lgt);
+#endif
 #if CONFIG_VAR_TX
  tx_size = get_min_tx_size(tx_size);
 #endif
@@ -2215,31 +2276,31 @@ int av1_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
  const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
  const int is_inter = is_inter_block(mbmi);
 #if CONFIG_EXT_TX
-  const AV1_COMMON *cm = &cpi->common;
  if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 &&
      !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
    const int ext_tx_set =
        get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
    if (is_inter) {
      if (ext_tx_set > 0)
-        return cpi
+        return x
            ->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]][tx_type];
    } else {
      if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
-        return cpi->intra_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]]
                                       [mbmi->mode][tx_type];
+        return x->intra_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]]
                                     [mbmi->mode][tx_type];
    }
  }
 #else
  (void)bsize;
+  (void)cm;
  if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
      !FIXED_TX_TYPE) {
    if (is_inter) {
-      return cpi->inter_tx_type_costs[tx_size][tx_type];
+      return x->inter_tx_type_costs[tx_size][tx_type];
    } else {
-      return cpi->intra_tx_type_costs[tx_size]
                                     [intra_mode_to_tx_type_context[mbmi->mode]]
                                     [tx_type];
+      return x->intra_tx_type_costs[tx_size]
                                   [intra_mode_to_tx_type_context[mbmi->mode]]
                                   [tx_type];
    }
  }
 #endif  // CONFIG_EXT_TX
@@ -2247,7 +2308,7 @@ int av1_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
 }
 static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                        RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs,
-                        TX_TYPE tx_type, int tx_size) {
+                        TX_TYPE tx_type, TX_SIZE tx_size) {
  const AV1_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
@@ -2278,7 +2339,15 @@ static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
  if (rd_stats->rate == INT_MAX) return INT64_MAX;
 #if !CONFIG_TXK_SEL
  int plane = 0;
-  rd_stats->rate += av1_tx_type_cost(cpi, xd, bs, plane, tx_size, tx_type);
+#if CONFIG_LGT_FROM_PRED
+  if (is_lgt_allowed(mbmi->mode, tx_size))
+    rd_stats->rate +=
        av1_lgt_cost(cm, x, xd, bs, plane, tx_size, mbmi->use_lgt);
+  if (!mbmi->use_lgt)
+    rd_stats->rate += av1_tx_type_cost(cm, x, xd, bs, plane, tx_size, tx_type);
+#else
+  rd_stats->rate += av1_tx_type_cost(cm, x, xd, bs, plane, tx_size, tx_type);
+#endif  // CONFIG_LGT_FROM_PRED
 #endif
 
  if (rd_stats->skip) {
@@ -2316,8 +2385,14 @@ static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
 #if CONFIG_MRC_TX
  // MRC_DCT only implemented for TX_32X32 so only include this tx in
  // the search for TX_32X32
-  if (tx_type == MRC_DCT && tx_size != TX_32X32) return 1;
+  if (tx_type == MRC_DCT &&
      ((is_inter && !USE_MRC_INTER) || (!is_inter && !USE_MRC_INTRA) ||
       tx_size != TX_32X32))
+    return 1;
 #endif  // CONFIG_MRC_TX
+#if CONFIG_LGT_FROM_PRED
+  if (mbmi->use_lgt && mbmi->ref_mv_idx > 0) return 1;
+#endif  // CONFIG_LGT_FROM_PRED
  if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) return 1;
  if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, tx_size))
    return 1;
@@ -2330,10 +2405,10 @@ static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
  if (max_tx_size >= TX_32X32 && tx_size == TX_4X4) return 1;
 #if CONFIG_EXT_TX
  const AV1_COMMON *const cm = &cpi->common;
-  int ext_tx_set =
      get_ext_tx_set(tx_size, bs, is_inter, cm->reduced_tx_set_used);
+  const TxSetType tx_set_type =
      get_ext_tx_set_type(tx_size, bs, is_inter, cm->reduced_tx_set_used);
+  if (!av1_ext_tx_used[tx_set_type][tx_type]) return 1;
  if (is_inter) {
-    if (!ext_tx_used_inter[ext_tx_set][tx_type]) return 1;
    if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
      if (!do_tx_type_search(tx_type, prune)) return 1;
    }
@@ -2341,7 +2416,6 @@ static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
    if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
      if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) return 1;
    }
-    if (!ext_tx_used_intra[ext_tx_set][tx_type]) return 1;
  }
 #else   // CONFIG_EXT_TX
  if (tx_size >= TX_32X32 && tx_type != DCT_DCT) return 1;
@@ -2352,8 +2426,7 @@ static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
  return 0;
 }
 
-#if CONFIG_EXT_INTER && \
    (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA)
+#if (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA)
 static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
                                   MACROBLOCK *x, int *r, int64_t *d, int *s,
                                   int64_t *sse, int64_t ref_best_rd) {
@@ -2366,7 +2439,7 @@ static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
  *sse = rd_stats.sse;
  return rd;
 }
-#endif  // CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
+#endif  // (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
 
 static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
                                   RD_STATS *rd_stats, int64_t ref_best_rd,
@@ -2382,9 +2455,14 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
  const int is_inter = is_inter_block(mbmi);
  int prune = 0;
  const int plane = 0;
-#if CONFIG_EXT_TX
-  int ext_tx_set;
-#endif  // CONFIG_EXT_TX
+#if CONFIG_LGT_FROM_PRED
+  int is_lgt_best = 0;
+  int search_lgt = is_inter
                       ? LGT_FROM_PRED_INTER && !x->use_default_inter_tx_type &&
                             !cpi->sf.tx_type_search.prune_mode > NO_PRUNE
                       : LGT_FROM_PRED_INTRA && !x->use_default_intra_tx_type &&
                             ALLOW_INTRA_EXT_TX;
+#endif  // CONFIG_LGT_FROM_PRED
  av1_invalid_rd_stats(rd_stats);
 
  mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
@@ -2392,8 +2470,10 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
  mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
 #endif  // CONFIG_VAR_TX
 #if CONFIG_EXT_TX
-  ext_tx_set =
+  int ext_tx_set =
      get_ext_tx_set(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used);
+  const TxSetType tx_set_type =
      get_ext_tx_set_type(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used);
 #endif  // CONFIG_EXT_TX
 
  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
@@ -2414,12 +2494,12 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
 #endif  // CONFIG_PVQ
 
    for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+      if (!av1_ext_tx_used[tx_set_type][tx_type]) continue;
      RD_STATS this_rd_stats;
      if (is_inter) {
        if (x->use_default_inter_tx_type &&
            tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
          continue;
-        if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
        if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
          if (!do_tx_type_search(tx_type, prune)) continue;
        }
@@ -2430,7 +2510,6 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
        if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
          if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
        }
-        if (!ext_tx_used_intra[ext_tx_set][tx_type]) continue;
      }
 
      mbmi->tx_type = tx_type;
@@ -2441,7 +2520,7 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
      od_encode_rollback(&x->daala_enc, &pre_buf);
 #endif  // CONFIG_PVQ
      if (this_rd_stats.rate == INT_MAX) continue;
-      av1_tx_type_cost(cpi, xd, bs, plane, mbmi->tx_size, tx_type);
+      av1_tx_type_cost(cm, x, xd, bs, plane, mbmi->tx_size, tx_type);
 
      if (this_rd_stats.skip)
        this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse);
@@ -2464,6 +2543,33 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
 #if CONFIG_PVQ
    od_encode_rollback(&x->daala_enc, &post_buf);
 #endif  // CONFIG_PVQ
+#if CONFIG_LGT_FROM_PRED
+    // search LGT
+    if (search_lgt && is_lgt_allowed(mbmi->mode, mbmi->tx_size) &&
        !cm->reduced_tx_set_used) {
+      RD_STATS this_rd_stats;
+      mbmi->use_lgt = 1;
+      txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs,
                       mbmi->tx_size, cpi->sf.use_fast_coef_costing);
+      if (this_rd_stats.rate != INT_MAX) {
+        av1_lgt_cost(cm, x, xd, bs, plane, mbmi->tx_size, 1);
+        if (this_rd_stats.skip)
+          this_rd = RDCOST(x->rdmult, s1, this_rd_stats.sse);
+        else
+          this_rd =
              RDCOST(x->rdmult, this_rd_stats.rate + s0, this_rd_stats.dist);
+        if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] &&
            !this_rd_stats.skip)
+          this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, s1, this_rd_stats.sse));
+        if (this_rd < best_rd) {
+          best_rd = this_rd;
+          is_lgt_best = 1;
+          *rd_stats = this_rd_stats;
+        }
+      }
+      mbmi->use_lgt = 0;
+    }
+#endif  // CONFIG_LGT_FROM_PRED
  } else {
    mbmi->tx_type = DCT_DCT;
    txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
@@ -2484,7 +2590,7 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
                       mbmi->tx_size, cpi->sf.use_fast_coef_costing);
      if (this_rd_stats.rate == INT_MAX) continue;
 
-      av1_tx_type_cost(cpi, xd, bs, plane, mbmi->tx_size, tx_type);
+      av1_tx_type_cost(cm, x, xd, bs, plane, mbmi->tx_size, tx_type);
      if (is_inter) {
        if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
            !do_tx_type_search(tx_type, prune))
@@ -2511,6 +2617,9 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
  }
 #endif  // CONFIG_EXT_TX
  mbmi->tx_type = best_tx_type;
+#if CONFIG_LGT_FROM_PRED
+  mbmi->use_lgt = is_lgt_best;
+#endif  // CONFIG_LGT_FROM_PRED
 }
 
 static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -2549,6 +2658,11 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
  TX_SIZE best_tx_size = max_tx_size;
  TX_TYPE best_tx_type = DCT_DCT;
+#if CONFIG_LGT_FROM_PRED
+  int breakout = 0;
+  int is_lgt_best = 0;
+  mbmi->use_lgt = 0;
+#endif  // CONFIG_LGT_FROM_PRED
 #if CONFIG_TXK_SEL
  TX_TYPE best_txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
 #endif  // CONFIG_TXK_SEL
@@ -2584,12 +2698,12 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
      if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue;
      const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs];
      RD_STATS this_rd_stats;
-      int ext_tx_set =
          get_ext_tx_set(rect_tx_size, bs, is_inter, cm->reduced_tx_set_used);
-      if ((is_inter && ext_tx_used_inter[ext_tx_set][tx_type]) ||
          (!is_inter && ext_tx_used_intra[ext_tx_set][tx_type])) {
+      const TxSetType tx_set_type = get_ext_tx_set_type(
          rect_tx_size, bs, is_inter, cm->reduced_tx_set_used);
+      if (av1_ext_tx_used[tx_set_type][tx_type]) {
        rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type,
                      rect_tx_size);
+        ref_best_rd = AOMMIN(rd, ref_best_rd);
        if (rd < best_rd) {
 #if CONFIG_TXK_SEL
          memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type[0]) * 256);
@@ -2605,6 +2719,21 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
    if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
 #endif  // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
  }
+#if CONFIG_LGT_FROM_PRED
+    const TX_SIZE rect_tx_size
= max_txsize_rect_lookup[bs]; + if (is_lgt_allowed(mbmi->mode, rect_tx_size) && !cm->reduced_tx_set_used) { + RD_STATS this_rd_stats; + mbmi->use_lgt = 1; + rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, rect_tx_size); + if (rd < best_rd) { + is_lgt_best = 1; + best_tx_size = rect_tx_size; + best_rd = rd; + *rd_stats = this_rd_stats; + } + mbmi->use_lgt = 0; + } +#endif // CONFIG_LGT_FROM_PRED } #if CONFIG_RECT_TX_EXT @@ -2632,10 +2761,9 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue; const TX_SIZE tx_size = quarter_txsize_lookup[bs]; RD_STATS this_rd_stats; - int ext_tx_set = - get_ext_tx_set(tx_size, bs, is_inter, cm->reduced_tx_set_used); - if ((is_inter && ext_tx_used_inter[ext_tx_set][tx_type]) || - (!is_inter && ext_tx_used_intra[ext_tx_set][tx_type])) { + const TxSetType tx_set_type = + get_ext_tx_set_type(tx_size, bs, is_inter, cm->reduced_tx_set_used); + if (av1_ext_tx_used[tx_set_type][tx_type]) { rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, tx_size); if (rd < best_rd) { @@ -2644,6 +2772,9 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, sizeof(best_txk_type[0]) * num_blk); #endif best_tx_type = tx_type; +#if CONFIG_LGT_FROM_PRED + is_lgt_best = 0; +#endif best_tx_size = tx_size; best_rd = rd; *rd_stats = this_rd_stats; @@ -2654,6 +2785,21 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, if (mbmi->sb_type < BLOCK_8X8 && is_inter) break; #endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 } +#if CONFIG_LGT_FROM_PRED + if (is_lgt_allowed(mbmi->mode, tx_size) && !cm->reduced_tx_set_used) { + const TX_SIZE tx_size = quarter_txsize_lookup[bs]; + RD_STATS this_rd_stats; + mbmi->use_lgt = 1; + rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, tx_size); + if (rd < best_rd) { + is_lgt_best = 1; + best_tx_size = tx_size; + best_rd = rd; + *rd_stats = this_rd_stats; + } + mbmi->use_lgt = 0; + } +#endif // CONFIG_LGT_FROM_PRED } #endif // CONFIG_RECT_TX_EXT #endif // CONFIG_EXT_TX && CONFIG_RECT_TX @@ -2692,15 +2838,23 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, if (cpi->sf.tx_size_search_breakout && (rd == INT64_MAX || (this_rd_stats.skip == 1 && tx_type != DCT_DCT && n < start_tx) || - (n < (int)max_tx_size && rd > last_rd))) + (n < (int)max_tx_size && rd > last_rd))) { +#if CONFIG_LGT_FROM_PRED + breakout = 1; +#endif break; + } last_rd = rd; + ref_best_rd = AOMMIN(rd, ref_best_rd); if (rd < best_rd) { #if CONFIG_TXK_SEL memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type[0]) * 256); #endif best_tx_type = tx_type; +#if CONFIG_LGT_FROM_PRED + is_lgt_best = 0; +#endif best_tx_size = n; best_rd = rd; *rd_stats = this_rd_stats; @@ -2710,9 +2864,28 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, if (mbmi->sb_type < BLOCK_8X8 && is_inter) break; #endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4 } +#if CONFIG_LGT_FROM_PRED + mbmi->use_lgt = 1; + if (is_lgt_allowed(mbmi->mode, n) && !skip_txfm_search(cpi, x, bs, 0, n) && + !breakout) { + RD_STATS this_rd_stats; + rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, 0, n); + if (rd < best_rd) { + is_lgt_best = 1; + best_tx_size = n; + best_rd = rd; + *rd_stats = this_rd_stats; + } + } + mbmi->use_lgt = 0; +#endif // CONFIG_LGT_FROM_PRED } mbmi->tx_size = best_tx_size; mbmi->tx_type = best_tx_type; +#if CONFIG_LGT_FROM_PRED + mbmi->use_lgt = is_lgt_best; + assert(!is_lgt_best || is_lgt_allowed(mbmi->mode, 
mbmi->tx_size)); +#endif // CONFIG_LGT_FROM_PRED #if CONFIG_TXK_SEL memcpy(mbmi->txk_type, best_txk_type, sizeof(best_txk_type[0]) * 256); #endif @@ -2768,6 +2941,7 @@ static int conditional_skipintra(PREDICTION_MODE mode, // Model based RD estimation for luma intra blocks. static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mode_cost) { + const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; assert(!is_inter_block(mbmi)); @@ -2785,7 +2959,7 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, int block = 0; for (row = 0; row < max_blocks_high; row += stepr) { for (col = 0; col < max_blocks_wide; col += stepc) { - av1_predict_intra_block_facade(xd, 0, block, col, row, tx_size); + av1_predict_intra_block_facade(cm, xd, 0, block, col, row, tx_size); block += step; } } @@ -2816,7 +2990,6 @@ static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, return this_rd; } -#if CONFIG_PALETTE // Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x // new_height'. Extra rows and columns are filled in by copying last valid // row/column. @@ -2875,6 +3048,7 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, MODE_INFO *const mic = xd->mi[0]; MB_MODE_INFO *const mbmi = &mic->mbmi; assert(!is_inter_block(mbmi)); + assert(bsize >= BLOCK_8X8); int this_rate, colors, n; const int src_stride = x->plane[0].src.stride; const uint8_t *const src = x->plane[0].src.buf; @@ -2897,9 +3071,8 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, #endif // CONFIG_FILTER_INTRA if (colors > 1 && colors <= 64) { - int r, c, i, j, k, palette_mode_cost; + int r, c, i, k, palette_mode_cost; const int max_itr = 50; - uint8_t color_order[PALETTE_MAX_SIZE]; float *const data = x->palette_buffer->kmeans_data_buf; float centroids[PALETTE_MAX_SIZE]; float lb, ub, val; @@ -2950,11 +3123,8 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return 0; #if CONFIG_PALETTE_DELTA_ENCODING - const MODE_INFO *above_mi = xd->above_mi; - const MODE_INFO *left_mi = xd->left_mi; uint16_t color_cache[2 * PALETTE_MAX_SIZE]; - const int n_cache = - av1_get_palette_cache(above_mi, left_mi, 0, color_cache); + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); #endif // CONFIG_PALETTE_DELTA_ENCODING for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2; @@ -2998,7 +3168,7 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, block_height); palette_mode_cost = dc_mode_cost + - cpi->palette_y_size_cost[bsize - BLOCK_8X8][k - PALETTE_MIN_SIZE] + + x->palette_y_size_cost[bsize - BLOCK_8X8][k - PALETTE_MIN_SIZE] + write_uniform_cost(k, color_map[0]) + av1_cost_bit( av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], @@ -3008,16 +3178,8 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, color_cache, n_cache, #endif // CONFIG_PALETTE_DELTA_ENCODING cpi->common.bit_depth); - for (i = 0; i < rows; ++i) { - for (j = (i == 0 ? 
1 : 0); j < cols; ++j) { - int color_idx; - const int color_ctx = av1_get_palette_color_index_context( - color_map, block_width, i, j, k, color_order, &color_idx); - assert(color_idx >= 0 && color_idx < k); - palette_mode_cost += cpi->palette_y_color_cost[k - PALETTE_MIN_SIZE] - [color_ctx][color_idx]; - } - } + palette_mode_cost += + av1_cost_color_map(x, 0, 0, bsize, mbmi->tx_size, PALETTE_MAP); this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 1)) @@ -3027,7 +3189,8 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, if (tokenonly_rd_stats.rate == INT_MAX) continue; this_rate = tokenonly_rd_stats.rate + palette_mode_cost; this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); - if (!xd->lossless[mbmi->segment_id] && mbmi->sb_type >= BLOCK_8X8) { + if (!xd->lossless[mbmi->segment_id] && + block_signals_txsize(mbmi->sb_type)) { tokenonly_rd_stats.rate -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); } if (this_rd < *best_rd) { @@ -3046,12 +3209,11 @@ static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, if (best_mbmi->palette_mode_info.palette_size[0] > 0) { memcpy(color_map, best_palette_color_map, - rows * cols * sizeof(best_palette_color_map[0])); + block_width * block_height * sizeof(best_palette_color_map[0])); } *mbmi = *best_mbmi; return rate_overhead; } -#endif // CONFIG_PALETTE static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( const AV1_COMP *const cpi, MACROBLOCK *x, int row, int col, @@ -3124,9 +3286,7 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( xd->mi[0]->mbmi.tx_size = tx_size; -#if CONFIG_PALETTE xd->mi[0]->mbmi.palette_mode_info.palette_size[0] = 0; -#endif // CONFIG_PALETTE #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -3172,8 +3332,8 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( block == 0 || block == 2)); xd->mi[0]->bmi[block_raster_idx].as_mode = mode; av1_predict_intra_block( - xd, pd->width, pd->height, txsize_to_bsize[tx_size], mode, dst, - dst_stride, dst, dst_stride, col + idx, row + idy, 0); + cm, xd, pd->width, pd->height, txsize_to_bsize[tx_size], mode, + dst, dst_stride, dst, dst_stride, col + idx, row + idy, 0); #if !CONFIG_PVQ aom_highbd_subtract_block(tx_height, tx_width, src_diff, 8, src, src_stride, dst, dst_stride, xd->bd); @@ -3220,9 +3380,12 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( if (!skip) #endif av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), -#if CONFIG_LGT +#if CONFIG_LGT_FROM_PRED mode, #endif +#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK + BLOCK_OFFSET(xd->mrc_mask, block), +#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK DCT_DCT, tx_size, dst, dst_stride, p->eobs[block]); } else { @@ -3242,7 +3405,7 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP); av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size, - tempa + idx, templ + idy); + tempa + idx, templ + idy, 1); #endif // DISABLE_TRELLISQ_SEARCH ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, scan_order, tempa + idx, templ + idy, @@ -3273,9 +3436,12 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( if (!skip) #endif av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), -#if CONFIG_LGT +#if CONFIG_LGT_FROM_PRED mode, #endif +#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK + BLOCK_OFFSET(xd->mrc_mask, block), 
+#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK tx_type, tx_size, dst, dst_stride, p->eobs[block]); cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp); @@ -3374,7 +3540,7 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4, block == 0 || block == 2)); xd->mi[0]->bmi[block_raster_idx].as_mode = mode; - av1_predict_intra_block(xd, pd->width, pd->height, + av1_predict_intra_block(cm, xd, pd->width, pd->height, txsize_to_bsize[tx_size], mode, dst, dst_stride, dst, dst_stride, #if CONFIG_CB4X4 @@ -3416,7 +3582,7 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( BLOCK_8X8, tx_size, coeff_ctx, xform_quant); av1_optimize_b(cm, x, 0, 0, 0, block, BLOCK_8X8, tx_size, tempa + idx, - templ + idy); + templ + idy, 1); #endif // DISABLE_TRELLISQ_SEARCH ratey += av1_cost_coeffs(cpi, x, 0, 0, 0, block, tx_size, scan_order, tempa + idx, templ + idy, @@ -3459,9 +3625,12 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( if (!skip) #endif // CONFIG_PVQ av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), -#if CONFIG_LGT +#if CONFIG_LGT_FROM_PRED mode, #endif +#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK + BLOCK_OFFSET(xd->mrc_mask, block), +#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK tx_type, tx_size, dst, dst_stride, p->eobs[block]); unsigned int tmp; @@ -3477,9 +3646,12 @@ static int64_t rd_pick_intra_sub_8x8_y_subblock_mode( if (!skip) #endif // CONFIG_PVQ av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), -#if CONFIG_LGT +#if CONFIG_LGT_FROM_PRED mode, #endif +#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK + BLOCK_OFFSET(xd->mrc_mask, block), +#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK DCT_DCT, tx_size, dst, dst_stride, p->eobs[block]); } @@ -3544,7 +3716,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, int64_t total_distortion = 0; int tot_rate_y = 0; int64_t total_rd = 0; - const int *bmode_costs = cpi->mbmode_cost[0]; + const int *bmode_costs = mb->mbmode_cost[0]; const int is_lossless = xd->lossless[mbmi->segment_id]; #if CONFIG_EXT_TX && CONFIG_RECT_TX const TX_SIZE tx_size = is_lossless ? TX_4X4 : max_txsize_rect_lookup[bsize]; @@ -3565,6 +3737,9 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, // expense of speed. 
mbmi->tx_type = DCT_DCT; mbmi->tx_size = tx_size; +#if CONFIG_LGT_FROM_PRED + mbmi->use_lgt = 0; +#endif if (y_skip) *y_skip = 1; @@ -3583,15 +3758,23 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, const PREDICTION_MODE L = av1_left_block_mode(mic, left_mi, pred_block_idx); - bmode_costs = cpi->y_mode_costs[A][L]; +#if CONFIG_KF_CTX + const int above_ctx = intra_mode_context[A]; + const int left_ctx = intra_mode_context[L]; + bmode_costs = mb->y_mode_costs[above_ctx][left_ctx]; +#else + bmode_costs = mb->y_mode_costs[A][L]; +#endif } this_rd = rd_pick_intra_sub_8x8_y_subblock_mode( cpi, mb, idy, idx, &best_mode, bmode_costs, xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r, &ry, &d, bsize, tx_size, y_skip, best_rd - total_rd); -#if !CONFIG_DIST_8X8 - if (this_rd >= best_rd - total_rd) return INT64_MAX; -#endif // !CONFIG_DIST_8X8 +#if CONFIG_DIST_8X8 + if (!cpi->oxcf.using_dist_8x8) +#endif + if (this_rd >= best_rd - total_rd) return INT64_MAX; + total_rd += this_rd; cost += r; total_distortion += d; @@ -3609,7 +3792,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, mbmi->mode = mic->bmi[3].as_mode; #if CONFIG_DIST_8X8 - { + if (cpi->oxcf.using_dist_8x8) { const struct macroblock_plane *p = &mb->plane[0]; const struct macroblockd_plane *pd = &xd->plane[0]; const int src_stride = p->src.stride; @@ -3617,11 +3800,8 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, uint8_t *src = p->src.buf; uint8_t *dst = pd->dst.buf; -#if CONFIG_PVQ - use_activity_masking = mb->daala_enc.use_activity_masking; -#endif // CONFIG_PVQ // Daala-defined distortion computed for the block of 8x8 pixels - total_distortion = av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride, + total_distortion = av1_dist_8x8(cpi, mb, src, src_stride, dst, dst_stride, BLOCK_8X8, 8, 8, 8, 8, mb->qindex) << 4; } @@ -3634,14 +3814,20 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi, 1) { const int eset = get_ext_tx_set(tx_size, bsize, 0, cpi->common.reduced_tx_set_used); - rate_tx_type = cpi->intra_tx_type_costs[eset][txsize_sqr_map[tx_size]] - [mbmi->mode][mbmi->tx_type]; +#if CONFIG_LGT_FROM_PRED + if (LGT_FROM_PRED_INTRA && is_lgt_allowed(mbmi->mode, tx_size)) + rate_tx_type += mb->intra_lgt_cost[txsize_sqr_map[tx_size]][mbmi->mode] + [mbmi->use_lgt]; + if (!LGT_FROM_PRED_INTRA || !mbmi->use_lgt) +#endif // CONFIG_LGT_FROM_PRED + rate_tx_type += mb->intra_tx_type_costs[eset][txsize_sqr_map[tx_size]] + [mbmi->mode][mbmi->tx_type]; } #else rate_tx_type = - cpi->intra_tx_type_costs[txsize_sqr_map[tx_size]] - [intra_mode_to_tx_type_context[mbmi->mode]] - [mbmi->tx_type]; + mb->intra_tx_type_costs[txsize_sqr_map[tx_size]] + [intra_mode_to_tx_type_context[mbmi->mode]] + [mbmi->tx_type]; #endif // CONFIG_EXT_TX assert(mbmi->tx_size == tx_size); cost += rate_tx_type; @@ -3671,13 +3857,14 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, TX_SIZE best_tx_size = TX_4X4; FILTER_INTRA_MODE_INFO filter_intra_mode_info; TX_TYPE best_tx_type; +#if CONFIG_LGT_FROM_PRED + int use_lgt_when_selected; +#endif av1_zero(filter_intra_mode_info); mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 1; mbmi->mode = DC_PRED; -#if CONFIG_PALETTE mbmi->palette_mode_info.palette_size[0] = 0; -#endif // CONFIG_PALETTE for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) { int this_rate; @@ -3702,6 +3889,9 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, best_tx_size = 
mic->mbmi.tx_size; filter_intra_mode_info = mbmi->filter_intra_mode_info; best_tx_type = mic->mbmi.tx_type; +#if CONFIG_LGT_FROM_PRED + use_lgt_when_selected = mic->mbmi.use_lgt; +#endif *rate = this_rate; *rate_tokenonly = tokenonly_rd_stats.rate; *distortion = tokenonly_rd_stats.dist; @@ -3713,6 +3903,9 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, if (filter_intra_selected_flag) { mbmi->mode = DC_PRED; mbmi->tx_size = best_tx_size; +#if CONFIG_LGT_FROM_PRED + mbmi->use_lgt = use_lgt_when_selected; +#endif mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = filter_intra_mode_info.use_filter_intra_mode[0]; mbmi->filter_intra_mode_info.filter_intra_mode[0] = @@ -3733,6 +3926,9 @@ static int64_t calc_rd_given_intra_angle( int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate, RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size, TX_TYPE *best_tx_type, +#if CONFIG_LGT_FROM_PRED + int *use_lgt_when_selected, +#endif #if CONFIG_INTRA_INTERP INTRA_FILTER *best_filter, #endif // CONFIG_INTRA_INTERP @@ -3765,6 +3961,9 @@ static int64_t calc_rd_given_intra_angle( *best_filter = mbmi->intra_filter; #endif // CONFIG_INTRA_INTERP *best_tx_type = mbmi->tx_type; +#if CONFIG_LGT_FROM_PRED + *use_lgt_when_selected = mbmi->use_lgt; +#endif *rate = this_rate; rd_stats->rate = tokenonly_rd_stats.rate; rd_stats->dist = tokenonly_rd_stats.dist; @@ -3794,6 +3993,9 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; TX_SIZE best_tx_size = mic->mbmi.tx_size; TX_TYPE best_tx_type = mbmi->tx_type; +#if CONFIG_LGT_FROM_PRED + int use_lgt_when_selected = mbmi->use_lgt; +#endif for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; @@ -3810,12 +4012,15 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, this_rd = calc_rd_given_intra_angle( cpi, x, bsize, #if CONFIG_INTRA_INTERP - mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter], + mode_cost + x->intra_filter_cost[intra_filter_ctx][filter], #else mode_cost, #endif // CONFIG_INTRA_INTERP best_rd_in, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type, +#if CONFIG_LGT_FROM_PRED + &use_lgt_when_selected, +#endif #if CONFIG_INTRA_INTERP &best_filter, #endif // CONFIG_INTRA_INTERP @@ -3851,12 +4056,15 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, calc_rd_given_intra_angle( cpi, x, bsize, #if CONFIG_INTRA_INTERP - mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter], + mode_cost + x->intra_filter_cost[intra_filter_ctx][filter], #else mode_cost, #endif // CONFIG_INTRA_INTERP best_rd, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type, +#if CONFIG_LGT_FROM_PRED + &use_lgt_when_selected, +#endif #if CONFIG_INTRA_INTERP &best_filter, #endif // CONFIG_INTRA_INTERP @@ -3876,10 +4084,13 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, mic->mbmi.intra_filter = filter; this_rd = calc_rd_given_intra_angle( cpi, x, bsize, - mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter], - best_rd, best_angle_delta, MAX_ANGLE_DELTA, rate, rd_stats, - &best_angle_delta, &best_tx_size, &best_tx_type, &best_filter, - &best_rd, best_model_rd); + mode_cost + x->intra_filter_cost[intra_filter_ctx][filter], best_rd, + best_angle_delta, MAX_ANGLE_DELTA, rate, rd_stats, + 
&best_angle_delta, &best_tx_size, &best_tx_type, +#if CONFIG_LGT_FROM_PRED + &use_lgt_when_selected, +#endif + &best_filter, &best_rd, best_model_rd); } } } @@ -3891,6 +4102,9 @@ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, mic->mbmi.intra_filter = best_filter; #endif // CONFIG_INTRA_INTERP mbmi->tx_type = best_tx_type; +#if CONFIG_LGT_FROM_PRED + mbmi->use_lgt = use_lgt_when_selected; +#endif return best_rd; } @@ -3919,9 +4133,7 @@ static const uint8_t gradient_to_angle_bin[2][7][16] = { /* clang-format off */ static const uint8_t mode_to_angle_bin[INTRA_MODES] = { 0, 2, 6, 0, 4, 3, 5, 7, 1, 0, -#if CONFIG_ALT_INTRA 0, -#endif // CONFIG_ALT_INTRA }; /* clang-format on */ @@ -4064,16 +4276,12 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, uint16_t filter_intra_mode_skip_mask = (1 << FILTER_INTRA_MODES) - 1; #endif // CONFIG_FILTER_INTRA const int *bmode_costs; -#if CONFIG_PALETTE PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - uint8_t *best_palette_color_map = - cpi->common.allow_screen_content_tools - ? x->palette_buffer->best_palette_color_map - : NULL; int palette_y_mode_ctx = 0; const int try_palette = - cpi->common.allow_screen_content_tools && bsize >= BLOCK_8X8; -#endif // CONFIG_PALETTE + av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type); + uint8_t *best_palette_color_map = + try_palette ? x->palette_buffer->best_palette_color_map : NULL; const MODE_INFO *above_mi = xd->above_mi; const MODE_INFO *left_mi = xd->left_mi; const PREDICTION_MODE A = av1_above_block_mode(mic, above_mi, 0); @@ -4085,7 +4293,14 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, od_encode_checkpoint(&x->daala_enc, &pre_buf); od_encode_checkpoint(&x->daala_enc, &post_buf); #endif // CONFIG_PVQ - bmode_costs = cpi->y_mode_costs[A][L]; + +#if CONFIG_KF_CTX + const int above_ctx = intra_mode_context[A]; + const int left_ctx = intra_mode_context[L]; + bmode_costs = x->y_mode_costs[above_ctx][left_ctx]; +#else + bmode_costs = x->y_mode_costs[A][L]; +#endif #if CONFIG_EXT_INTRA mbmi->angle_delta[0] = 0; @@ -4101,14 +4316,17 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, #if CONFIG_FILTER_INTRA mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; #endif // CONFIG_FILTER_INTRA -#if CONFIG_PALETTE pmi->palette_size[0] = 0; - if (above_mi) - palette_y_mode_ctx += - (above_mi->mbmi.palette_mode_info.palette_size[0] > 0); - if (left_mi) - palette_y_mode_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0); -#endif // CONFIG_PALETTE + if (try_palette) { + if (above_mi) { + palette_y_mode_ctx += + (above_mi->mbmi.palette_mode_info.palette_size[0] > 0); + } + if (left_mi) { + palette_y_mode_ctx += + (left_mi->mbmi.palette_mode_info.palette_size[0] > 0); + } + } if (cpi->sf.tx_type_search.fast_intra_tx_type_search) x->use_default_intra_tx_type = 1; @@ -4160,21 +4378,20 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, this_rate = this_rate_tokenonly + bmode_costs[mbmi->mode]; - if (!xd->lossless[mbmi->segment_id] && mbmi->sb_type >= BLOCK_8X8) { + if (!xd->lossless[mbmi->segment_id] && + block_signals_txsize(mbmi->sb_type)) { // super_block_yrd above includes the cost of the tx_size in the // tokenonly rate, but for intra blocks, tx_size is always coded // (prediction granularity), so we account for it in the full rate, // not the tokenonly rate. 
this_rate_tokenonly -= tx_size_cost(cpi, x, bsize, mbmi->tx_size); } -#if CONFIG_PALETTE if (try_palette && mbmi->mode == DC_PRED) { this_rate += av1_cost_bit(av1_default_palette_y_mode_prob[bsize - BLOCK_8X8] [palette_y_mode_ctx], 0); } -#endif // CONFIG_PALETTE #if CONFIG_FILTER_INTRA if (mbmi->mode == DC_PRED) this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 0); @@ -4185,8 +4402,7 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, const int p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; if (av1_is_intra_filter_switchable(p_angle)) - this_rate += - cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter]; + this_rate += x->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter]; #endif // CONFIG_INTRA_INTERP if (av1_use_angle_delta(bsize)) { this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, @@ -4194,6 +4410,10 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } } #endif // CONFIG_EXT_INTRA +#if CONFIG_INTRABC + if (bsize >= BLOCK_8X8 && cpi->common.allow_screen_content_tools) + this_rate += x->intrabc_cost[0]; +#endif // CONFIG_INTRABC this_rd = RDCOST(x->rdmult, this_rate, this_distortion); #if CONFIG_FILTER_INTRA if (best_rd == INT64_MAX || this_rd - best_rd < (best_rd >> 4)) { @@ -4221,14 +4441,12 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, od_encode_rollback(&x->daala_enc, &post_buf); #endif // CONFIG_PVQ -#if CONFIG_PALETTE if (try_palette) { rd_pick_palette_intra_sby(cpi, x, bsize, palette_y_mode_ctx, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map, &best_rd, &best_model_rd, rate, rate_tokenonly, distortion, skippable); } -#endif // CONFIG_PALETTE #if CONFIG_FILTER_INTRA if (beat_best_rd) { @@ -4317,6 +4535,9 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, int64_t tmp; tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); +#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK + uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block); +#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK PLANE_TYPE plane_type = get_plane_type(plane); TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size); @@ -4346,6 +4567,22 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, int coeff_ctx = get_entropy_context(tx_size, a, l); + tmp = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col, + plane_bsize, txm_bsize); + +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2); +#endif // CONFIG_HIGHBITDEPTH + rd_stats->sse += tmp << 4; + + if (rd_stats->invalid_rate) { + rd_stats->dist += tmp << 4; + rd_stats->rate += rd_stats->zero_rate; + rd_stats->skip = 1; + return; + } + // TODO(any): Use av1_dist_block to compute distortion #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -4373,43 +4610,59 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); const int buffer_length = tx_size_2d[tx_size]; - int64_t tmp_dist; + int64_t tmp_dist, tmp_sse; +#if CONFIG_DIST_8X8 + int disable_early_skip = + x->using_dist_8x8 && plane == 0 && plane_bsize >= BLOCK_8X8 && + (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) && + x->tune_metric != AOM_TUNE_PSNR; +#endif // CONFIG_DIST_8X8 + #if CONFIG_HIGHBITDEPTH if 
(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) tmp_dist = - av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd) >> - shift; + av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp_sse, xd->bd); else #endif - tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp) >> shift; + tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp_sse); + + tmp_dist = RIGHT_SIGNED_SHIFT(tmp_dist, shift); - if (RDCOST(x->rdmult, 0, tmp_dist) < rd_stats->ref_rdcost) { +#if CONFIG_MRC_TX + if (tx_type == MRC_DCT && !xd->mi[0]->mbmi.valid_mrc_mask) { + av1_invalid_rd_stats(rd_stats); + return; + } +#endif // CONFIG_MRC_TX + if ( +#if CONFIG_DIST_8X8 + disable_early_skip || +#endif + RDCOST(x->rdmult, 0, tmp_dist) < rd_stats->ref_rdcost) { av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, - a, l); + a, l, 1); + } else { + rd_stats->rate += rd_stats->zero_rate; + rd_stats->dist += tmp << 4; + rd_stats->skip = 1; + rd_stats->invalid_rate = 1; + return; } #endif // DISABLE_TRELLISQ_SEARCH - tmp = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col, - plane_bsize, txm_bsize); - -#if CONFIG_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2); -#endif // CONFIG_HIGHBITDEPTH - rd_stats->sse += tmp * 16; const int eob = p->eobs[block]; -#if CONFIG_LGT - PREDICTION_MODE mode = get_prediction_mode(xd->mi[0], plane, tx_size, block); - av1_inverse_transform_block(xd, dqcoeff, mode, tx_type, tx_size, rec_buffer, - MAX_TX_SIZE, eob); -#else - av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, rec_buffer, - MAX_TX_SIZE, eob); + av1_inverse_transform_block(xd, dqcoeff, +#if CONFIG_LGT_FROM_PRED + xd->mi[0]->mbmi.mode, #endif +#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK + mrc_mask, +#endif // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK + tx_type, tx_size, rec_buffer, MAX_TX_SIZE, eob); if (eob > 0) { #if CONFIG_DIST_8X8 - if (plane == 0 && (bw < 8 && bh < 8)) { + if (x->using_dist_8x8 && plane == 0 && (bw < 8 && bh < 8)) { // Save sub8x8 luma decoded pixels // since 8x8 luma decoded pixels are not available for daala-dist // after recursive split of BLOCK_8x8 is done. 
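Note: the av1_tx_block_rd_b() hunk above moves the cheap pixel-domain SSE estimate ahead of trellis optimization and gates av1_optimize_b() on whether the block can still beat the reference RD cost (with CONFIG_DIST_8X8, the early exit is additionally disabled for small luma blocks when tuning for a non-PSNR metric). A condensed sketch of that gate, assuming the codebase's RDCOST macro and RD_STATS fields; est_dist stands for the shifted coefficient-domain error (tmp_dist in the patch) and sse for the pixel-domain estimate (tmp << 4):

static int gate_trellis(const MACROBLOCK *x, RD_STATS *rd_stats,
                        int64_t est_dist, int64_t sse) {
  /* If even a zero-rate coding of this block cannot beat the reference
     RD cost, skip the expensive trellis search and mark the path dead. */
  if (RDCOST(x->rdmult, 0, est_dist) < rd_stats->ref_rdcost)
    return 1; /* worth running av1_optimize_b() */
  rd_stats->rate += rd_stats->zero_rate; /* rate of signalling skip */
  rd_stats->dist += sse;                 /* distortion of coding nothing */
  rd_stats->skip = 1;
  rd_stats->invalid_rate = 1;            /* callers treat this as pruned */
  return 0;
}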
@@ -4451,12 +4704,12 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, } static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, - int blk_col, int plane, int block, int block32, - TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, + int blk_col, int plane, int block, TX_SIZE tx_size, + int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, RD_STATS *rd_stats, int64_t ref_best_rd, - int *is_cost_valid, RD_STATS *rd_stats_stack) { + int *is_cost_valid) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; struct macroblock_plane *const p = &x->plane[plane]; @@ -4519,32 +4772,28 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, TX_SIZE txs_ctx = get_txsize_context(tx_size); TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, plane, pta, ptl, &txb_ctx); + +#if LV_MAP_PROB + zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(plane)] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; +#else zero_blk_rate = av1_cost_bit(xd->fc->txb_skip[txs_ctx][txb_ctx.txb_skip_ctx], 1); +#endif // LV_MAP_PROB #else - int tx_size_ctx = txsize_sqr_map[tx_size]; + TX_SIZE tx_size_ctx = txsize_sqr_map[tx_size]; int coeff_ctx = get_entropy_context(tx_size, pta, ptl); - zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0] - [coeff_ctx][EOB_TOKEN]; + zero_blk_rate = + x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0]; #endif rd_stats->ref_rdcost = ref_best_rd; rd_stats->zero_rate = zero_blk_rate; if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) { inter_tx_size[0][0] = tx_size; - - if (tx_size == TX_32X32 && mbmi->tx_type != DCT_DCT && - rd_stats_stack[block32].rate != INT_MAX) { - *rd_stats = rd_stats_stack[block32]; - p->eobs[block] = !rd_stats->skip; - x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip; - } else { - av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, - plane_bsize, pta, ptl, rd_stats); - if (tx_size == TX_32X32) { - rd_stats_stack[block32] = *rd_stats; - } - } + av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, + plane_bsize, pta, ptl, rd_stats); + if (rd_stats->rate == INT_MAX) return; if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || @@ -4599,11 +4848,12 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, av1_tx_block_rd_b(cpi, x, quarter_txsize, 0, 0, plane, 0, plane_bsize, pta, ptl, &rd_stats_qttx); + if (rd_stats->rate == INT_MAX) return; tx_size_ctx = txsize_sqr_map[quarter_txsize]; coeff_ctx = get_entropy_context(quarter_txsize, pta, ptl); - zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0] - [coeff_ctx][EOB_TOKEN]; + zero_blk_rate = + x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0]; if ((RDCOST(x->rdmult, rd_stats_qttx.rate, rd_stats_qttx.dist) >= RDCOST(x->rdmult, zero_blk_rate, rd_stats_qttx.sse) || rd_stats_qttx.skip == 1) && @@ -4629,11 +4879,15 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, plane, block_offset_qttx, plane_bsize, pta, ptl, &rd_stats_tmp); + if (rd_stats->rate == INT_MAX) return; + +#if !CONFIG_PVQ av1_set_txb_context(x, plane, 0, quarter_txsize, pta, ptl); +#endif // !CONFIG_PVQ coeff_ctx = get_entropy_context(quarter_txsize, pta + blk_col_offset, ptl + blk_row_offset); - zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0] - 
[coeff_ctx][EOB_TOKEN]; + zero_blk_rate = + x->token_head_costs[tx_size_ctx][pd->plane_type][1][0][coeff_ctx][0]; if ((RDCOST(x->rdmult, rd_stats_tmp.rate, rd_stats_tmp.dist) >= RDCOST(x->rdmult, zero_blk_rate, rd_stats_tmp.sse) || rd_stats_tmp.skip == 1) && @@ -4684,13 +4938,13 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, #endif } + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH #if CONFIG_MRC_TX - // If the tx type we are trying is MRC_DCT, we cannot partition the transform - // into anything smaller than TX_32X32 - if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH && mbmi->tx_type != MRC_DCT) { -#else - if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) { -#endif + // If the tx type we are trying is MRC_DCT, we cannot partition the + // transform into anything smaller than TX_32X32 + && mbmi->tx_type != MRC_DCT +#endif // CONFIG_MRC_TX + ) { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsl = tx_size_wide_unit[sub_txs]; int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]; @@ -4713,25 +4967,26 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - select_tx_block(cpi, x, offsetr, offsetc, plane, block, block32, sub_txs, + select_tx_block(cpi, x, offsetr, offsetc, plane, block, sub_txs, depth + 1, plane_bsize, ta, tl, tx_above, tx_left, - &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid, - rd_stats_stack); + &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid); #if CONFIG_DIST_8X8 - if (plane == 0 && tx_size == TX_8X8) { + if (x->using_dist_8x8 && plane == 0 && tx_size == TX_8X8) { sub8x8_eob[i] = p->eobs[block]; } #endif // CONFIG_DIST_8X8 av1_merge_rd_stats(&sum_rd_stats, &this_rd_stats); tmp_rd = RDCOST(x->rdmult, sum_rd_stats.rate, sum_rd_stats.dist); -#if !CONFIG_DIST_8X8 - if (this_rd < tmp_rd) break; +#if CONFIG_DIST_8X8 + if (!x->using_dist_8x8) #endif + if (this_rd < tmp_rd) break; block += sub_step; } #if CONFIG_DIST_8X8 - if (this_cost_valid && plane == 0 && tx_size == TX_8X8) { + if (x->using_dist_8x8 && this_cost_valid && plane == 0 && + tx_size == TX_8X8) { const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; @@ -4757,7 +5012,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, DECLARE_ALIGNED(16, uint8_t, pred8[8 * 8]); #endif // CONFIG_HIGHBITDEPTH - dist_8x8 = av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride, + dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, BLOCK_8X8, 8, 8, 8, 8, qindex) * 16; sum_rd_stats.sse = dist_8x8; @@ -4802,7 +5057,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, for (j = 0; j < 4; j++) for (i = 0; i < 4; i++) pred8[(row * 4 + j) * 8 + 4 * col + i] = - pred[(row * 4 + j) * pred_stride + 4 * col + i]; + (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i]; } else { for (j = 0; j < 4; j++) for (i = 0; i < 4; i++) @@ -4814,7 +5069,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, #if CONFIG_HIGHBITDEPTH } #endif // CONFIG_HIGHBITDEPTH - dist_8x8 = av1_dist_8x8(cpi, xd, src, src_stride, pred8, 8, BLOCK_8X8, 8, + dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8, 8, 8, 8, qindex) * 16; sum_rd_stats.dist = dist_8x8; @@ -4853,12 +5108,14 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, } #endif +#if !CONFIG_PVQ av1_set_txb_context(x, plane, block, tx_size_selected, pta, ptl); #if 
CONFIG_RECT_TX_EXT if (is_qttx_picked) av1_set_txb_context(x, plane, block_offset_qttx, tx_size_selected, pta + blk_col_offset, ptl + blk_row_offset); -#endif +#endif // CONFIG_RECT_TX_EXT +#endif // !CONFIG_PVQ txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, tx_size); @@ -4889,7 +5146,7 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, - int64_t ref_best_rd, RD_STATS *rd_stats_stack) { + int64_t ref_best_rd) { MACROBLOCKD *const xd = &x->e_mbd; int is_cost_valid = 1; int64_t this_rd = 0; @@ -4908,7 +5165,8 @@ static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, const int bw = tx_size_wide_unit[max_tx_size]; int idx, idy; int block = 0; - int block32 = 0; + int init_depth = + (mi_height != mi_width) ? RECT_VARTX_DEPTH_INIT : SQR_VARTX_DEPTH_INIT; int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE]; ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE]; @@ -4924,15 +5182,17 @@ static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, for (idy = 0; idy < mi_height; idy += bh) { for (idx = 0; idx < mi_width; idx += bw) { - select_tx_block(cpi, x, idy, idx, 0, block, block32, max_tx_size, - mi_height != mi_width, plane_bsize, ctxa, ctxl, - tx_above, tx_left, &pn_rd_stats, ref_best_rd - this_rd, - &is_cost_valid, rd_stats_stack); + select_tx_block(cpi, x, idy, idx, 0, block, max_tx_size, init_depth, + plane_bsize, ctxa, ctxl, tx_above, tx_left, + &pn_rd_stats, ref_best_rd - this_rd, &is_cost_valid); + if (pn_rd_stats.rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + return; + } av1_merge_rd_stats(rd_stats, &pn_rd_stats); this_rd += AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist), RDCOST(x->rdmult, 0, pn_rd_stats.sse)); block += step; - ++block32; } } } @@ -4949,8 +5209,7 @@ static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, - int64_t ref_best_rd, TX_TYPE tx_type, - RD_STATS *rd_stats_stack) { + int64_t ref_best_rd, TX_TYPE tx_type) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; @@ -4964,7 +5223,7 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x, const int max_blocks_wide = max_block_wide(xd, bsize, 0); mbmi->tx_type = tx_type; - inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, rd_stats_stack); + inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd); mbmi->min_tx_size = get_min_tx_size(mbmi->inter_tx_size[0][0]); if (rd_stats->rate == INT_MAX) return INT64_MAX; @@ -4981,23 +5240,37 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x, !xd->lossless[xd->mi[0]->mbmi.segment_id]) { const int ext_tx_set = get_ext_tx_set(mbmi->min_tx_size, bsize, is_inter, cm->reduced_tx_set_used); - if (is_inter) { - if (ext_tx_set > 0) +#if CONFIG_LGT_FROM_PRED + if (is_lgt_allowed(mbmi->mode, mbmi->min_tx_size)) { + if (LGT_FROM_PRED_INTRA && !is_inter && ext_tx_set > 0 && + ALLOW_INTRA_EXT_TX) + rd_stats->rate += x->intra_lgt_cost[txsize_sqr_map[mbmi->min_tx_size]] + [mbmi->mode][mbmi->use_lgt]; + if (LGT_FROM_PRED_INTER && is_inter && ext_tx_set > 0) rd_stats->rate += - cpi->inter_tx_type_costs[ext_tx_set] + x->inter_lgt_cost[txsize_sqr_map[mbmi->min_tx_size]][mbmi->use_lgt]; + } + if (!mbmi->use_lgt) { +#endif // 
CONFIG_LGT_FROM_PRED + if (is_inter) { + if (ext_tx_set > 0) + rd_stats->rate += + x->inter_tx_type_costs[ext_tx_set] [txsize_sqr_map[mbmi->min_tx_size]] [mbmi->tx_type]; - } else { - if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX) - rd_stats->rate += - cpi->intra_tx_type_costs[ext_tx_set][mbmi->min_tx_size][mbmi->mode] + } else { + if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX) + rd_stats->rate += + x->intra_tx_type_costs[ext_tx_set][mbmi->min_tx_size][mbmi->mode] [mbmi->tx_type]; + } } +#if CONFIG_LGT_FROM_PRED } +#endif #else if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id]) - rd_stats->rate += - cpi->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type]; + rd_stats->rate += x->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type]; #endif // CONFIG_EXT_TX #endif // CONFIG_TXK_SEL @@ -5013,6 +5286,162 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x, return rd; } +static uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) { + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + const int diff_stride = cols; + const struct macroblock_plane *const p = &x->plane[0]; + const int16_t *diff = &p->src_diff[0]; + uint8_t hash_data[MAX_SB_SQUARE]; + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + hash_data[cols * r + c] = clip_pixel(diff[c] + 128); + } + diff += diff_stride; + } + return (av1_get_crc_value(&x->tx_rd_record.crc_calculator, hash_data, + rows * cols) + << 7) + + bsize; +} + +static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + TX_RD_INFO *const tx_rd_info) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + tx_rd_info->hash_value = hash; + tx_rd_info->tx_type = mbmi->tx_type; + tx_rd_info->tx_size = mbmi->tx_size; +#if CONFIG_VAR_TX + tx_rd_info->min_tx_size = mbmi->min_tx_size; + memcpy(tx_rd_info->blk_skip, x->blk_skip[0], + sizeof(tx_rd_info->blk_skip[0]) * n4); + for (int idy = 0; idy < xd->n8_h; ++idy) + for (int idx = 0; idx < xd->n8_w; ++idx) + tx_rd_info->inter_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx]; +#endif // CONFIG_VAR_TX +#if CONFIG_TXK_SEL + av1_copy(tx_rd_info->txk_type, mbmi->txk_type); +#endif // CONFIG_TXK_SEL + tx_rd_info->rd_stats = *rd_stats; +} + +static void fetch_tx_rd_info(int n4, const TX_RD_INFO *const tx_rd_info, + RD_STATS *const rd_stats, MACROBLOCK *const x) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + mbmi->tx_type = tx_rd_info->tx_type; + mbmi->tx_size = tx_rd_info->tx_size; +#if CONFIG_VAR_TX + mbmi->min_tx_size = tx_rd_info->min_tx_size; + memcpy(x->blk_skip[0], tx_rd_info->blk_skip, + sizeof(tx_rd_info->blk_skip[0]) * n4); + for (int idy = 0; idy < xd->n8_h; ++idy) + for (int idx = 0; idx < xd->n8_w; ++idx) + mbmi->inter_tx_size[idy][idx] = tx_rd_info->inter_tx_size[idy][idx]; +#endif // CONFIG_VAR_TX +#if CONFIG_TXK_SEL + av1_copy(mbmi->txk_type, tx_rd_info->txk_type); +#endif // CONFIG_TXK_SEL + *rd_stats = tx_rd_info->rd_stats; +} + +// Uses simple features on top of DCT coefficients to quickly predict +// whether optimal RD decision is to skip encoding the residual. 
+static int predict_skip_flag_8bit(const MACROBLOCK *x, BLOCK_SIZE bsize) { + if (bsize > BLOCK_16X16) return 0; + // Tuned for target false-positive rate of 5% for all block sizes: + const uint32_t threshold_table[] = { 50, 50, 50, 55, 47, 47, 53, 22, 22, 37 }; + const struct macroblock_plane *const p = &x->plane[0]; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + tran_low_t DCT_coefs[32 * 32]; + TxfmParam param; + param.tx_type = DCT_DCT; +#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) + param.tx_size = max_txsize_rect_lookup[bsize]; +#else + param.tx_size = max_txsize_lookup[bsize]; +#endif + param.bd = 8; + param.lossless = 0; + av1_fwd_txfm(p->src_diff, DCT_coefs, bw, ¶m); + + uint32_t dc = (uint32_t)av1_dc_quant(x->qindex, 0, AOM_BITS_8); + uint32_t ac = (uint32_t)av1_ac_quant(x->qindex, 0, AOM_BITS_8); + uint32_t max_quantized_coef = (100 * (uint32_t)abs(DCT_coefs[0])) / dc; + for (int i = 1; i < bw * bh; i++) { + uint32_t cur_quantized_coef = (100 * (uint32_t)abs(DCT_coefs[i])) / ac; + if (cur_quantized_coef > max_quantized_coef) + max_quantized_coef = cur_quantized_coef; + } + + return max_quantized_coef < threshold_table[AOMMAX(bsize - BLOCK_4X4, 0)]; +} + +// Used to set proper context for early termination with skip = 1. +static void set_skip_flag(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int bsize) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const int n4 = bsize_to_num_blk(bsize); +#if CONFIG_RECT_TX && (CONFIG_EXT_TX || CONFIG_VAR_TX) + const TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; +#else + const TX_SIZE tx_size = max_txsize_lookup[bsize]; +#endif + mbmi->tx_type = DCT_DCT; + for (int idy = 0; idy < xd->n8_h; ++idy) + for (int idx = 0; idx < xd->n8_w; ++idx) + mbmi->inter_tx_size[idy][idx] = tx_size; + mbmi->tx_size = tx_size; + mbmi->min_tx_size = get_min_tx_size(tx_size); + memset(x->blk_skip[0], 1, sizeof(uint8_t) * n4); + rd_stats->skip = 1; + + // Rate. + const int tx_size_ctx = txsize_sqr_map[tx_size]; + ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE]; + av1_get_entropy_contexts(bsize, 0, &xd->plane[0], ctxa, ctxl); + int coeff_ctx = get_entropy_context(tx_size, ctxa, ctxl); + int rate = x->token_head_costs[tx_size_ctx][PLANE_TYPE_Y][1][0][coeff_ctx][0]; + if (tx_size > TX_4X4) { + int ctx = txfm_partition_context( + xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size); + rate += av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0); + } +#if !CONFIG_TXK_SEL +#if CONFIG_EXT_TX + const AV1_COMMON *cm = &cpi->common; + const int ext_tx_set = get_ext_tx_set(max_txsize_lookup[bsize], bsize, 1, + cm->reduced_tx_set_used); + if (get_ext_tx_types(mbmi->min_tx_size, bsize, 1, cm->reduced_tx_set_used) > + 1 && + !xd->lossless[xd->mi[0]->mbmi.segment_id]) { + if (ext_tx_set > 0) + rate += + x->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[mbmi->min_tx_size]] + [mbmi->tx_type]; + } +#else + if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id]) + rd_stats->rate += x->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type]; +#endif // CONFIG_EXT_TX +#endif // CONFIG_TXK_SEL + rd_stats->rate = rate; + + // Distortion. 
+ int64_t tmp = pixel_diff_dist(x, 0, x->plane[0].src_diff, + block_size_wide[bsize], 0, 0, bsize, bsize); +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2); +#endif // CONFIG_HIGHBITDEPTH + rd_stats->dist = rd_stats->sse = (tmp << 4); +} + static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, int64_t ref_best_rd) { @@ -5037,18 +5466,52 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, const int n4 = bsize_to_num_blk(bsize); int idx, idy; int prune = 0; - const int count32 = - 1 << (2 * (cm->mib_size_log2 - mi_width_log2_lookup[BLOCK_32X32])); -#if CONFIG_EXT_PARTITION - RD_STATS rd_stats_stack[16]; -#else - RD_STATS rd_stats_stack[4]; -#endif // CONFIG_EXT_PARTITION #if CONFIG_EXT_TX + const TxSetType tx_set_type = get_ext_tx_set_type( + max_tx_size, bsize, is_inter, cm->reduced_tx_set_used); const int ext_tx_set = get_ext_tx_set(max_tx_size, bsize, is_inter, cm->reduced_tx_set_used); #endif // CONFIG_EXT_TX + av1_invalid_rd_stats(rd_stats); + +#if CONFIG_LGT_FROM_PRED + mbmi->use_lgt = 0; + int search_lgt = is_inter + ? LGT_FROM_PRED_INTER && + (!cpi->sf.tx_type_search.prune_mode > NO_PRUNE) + : LGT_FROM_PRED_INTRA && ALLOW_INTRA_EXT_TX; +#endif // CONFIG_LGT_FROM_PRED + + const uint32_t hash = get_block_residue_hash(x, bsize); + TX_RD_RECORD *tx_rd_record = &x->tx_rd_record; + + if (ref_best_rd != INT64_MAX) { + for (int i = 0; i < tx_rd_record->num; ++i) { + const int index = (tx_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN; + // If there is a match in the tx_rd_record, fetch the RD decision and + // terminate early. + if (tx_rd_record->tx_rd_info[index].hash_value == hash) { + TX_RD_INFO *tx_rd_info = &tx_rd_record->tx_rd_info[index]; + fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x); + return; + } + } + } + +// If we predict that skip is the optimal RD decision - set the respective +// context and terminate early. 
+#if CONFIG_HIGHBITDEPTH + if (!(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)) +#endif // CONFIG_HIGHBITDEPTH + { + if (is_inter && cpi->sf.tx_type_search.use_skip_flag_prediction && + predict_skip_flag_8bit(x, bsize)) { + set_skip_flag(cpi, x, rd_stats, bsize); + return; + } + } + if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE) #if CONFIG_EXT_TX prune = prune_tx_types(cpi, bsize, x, xd, ext_tx_set); @@ -5056,10 +5519,7 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, prune = prune_tx_types(cpi, bsize, x, xd, 0); #endif // CONFIG_EXT_TX - av1_invalid_rd_stats(rd_stats); - - for (idx = 0; idx < count32; ++idx) - av1_invalid_rd_stats(&rd_stats_stack[idx]); + int found = 0; for (tx_type = txk_start; tx_type < txk_end; ++tx_type) { RD_STATS this_rd_stats; @@ -5067,11 +5527,14 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, #if CONFIG_MRC_TX // MRC_DCT only implemented for TX_32X32 so only include this tx in // the search for TX_32X32 - if (tx_type == MRC_DCT && max_tx_size != TX_32X32) continue; + if (tx_type == MRC_DCT && + (max_tx_size != TX_32X32 || (is_inter && !USE_MRC_INTER) || + (!is_inter && !USE_MRC_INTRA))) + continue; #endif // CONFIG_MRC_TX #if CONFIG_EXT_TX + if (!av1_ext_tx_used[tx_set_type][tx_type]) continue; if (is_inter) { - if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue; if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) { if (!do_tx_type_search(tx_type, prune)) continue; } @@ -5079,7 +5542,6 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8) { if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue; } - if (!ext_tx_used_intra[ext_tx_set][tx_type]) continue; } #else // CONFIG_EXT_TX if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE && @@ -5094,8 +5556,8 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, if (tx_type != DCT_DCT) continue; rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd, - tx_type, rd_stats_stack); - + tx_type); + ref_best_rd = AOMMIN(rd, ref_best_rd); if (rd < best_rd) { best_rd = rd; *rd_stats = this_rd_stats; @@ -5103,12 +5565,41 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, best_tx = mbmi->tx_size; best_min_tx_size = mbmi->min_tx_size; memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4); + found = 1; for (idy = 0; idy < xd->n8_h; ++idy) for (idx = 0; idx < xd->n8_w; ++idx) best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx]; } } + // We should always find at least one candidate unless ref_best_rd is less + // than INT64_MAX (in which case, all the calls to select_tx_size_fix_type + // might have failed to find something better) + assert(IMPLIES(!found, ref_best_rd != INT64_MAX)); + if (!found) return; + +#if CONFIG_LGT_FROM_PRED + if (search_lgt && is_lgt_allowed(mbmi->mode, max_tx_size) && + !cm->reduced_tx_set_used) { + RD_STATS this_rd_stats; + mbmi->use_lgt = 1; + rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd, 0); + if (rd < best_rd) { + best_rd = rd; + *rd_stats = this_rd_stats; + best_tx = mbmi->tx_size; + best_min_tx_size = mbmi->min_tx_size; + memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4); + for (idy = 0; idy < xd->n8_h; ++idy) + for (idx = 0; idx < xd->n8_w; ++idx) + best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx]; + } else { + mbmi->use_lgt = 0; + } + } +#endif // CONFIG_LGT_FROM_PRED + // We found a candidate transform to use. 
Copy our results from the "best" + // array into mbmi. mbmi->tx_type = best_tx_type; for (idy = 0; idy < xd->n8_h; ++idy) for (idx = 0; idx < xd->n8_w; ++idx) @@ -5116,6 +5607,19 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, mbmi->tx_size = best_tx; mbmi->min_tx_size = best_min_tx_size; memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4); + + // Save the RD search results into tx_rd_record. + int index; + if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) { + index = + (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN; + ++tx_rd_record->num; + } else { + index = tx_rd_record->index_start; + tx_rd_record->index_start = + (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN; + } + save_tx_rd_info(n4, hash, x, rd_stats, &tx_rd_record->tx_rd_info[index]); } static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, @@ -5145,7 +5649,9 @@ static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, ENTROPY_CONTEXT *tl = left_ctx + blk_row; av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize, ta, tl, rd_stats); +#if !CONFIG_PVQ av1_set_txb_context(x, plane, block, tx_size, ta, tl); +#endif // !CONFIG_PVQ } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsl = tx_size_wide_unit[sub_txs]; @@ -5250,7 +5756,6 @@ static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, } #endif // CONFIG_VAR_TX -#if CONFIG_PALETTE static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, int dc_mode_cost, uint8_t *best_palette_color_map, @@ -5263,6 +5768,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, assert(!is_inter_block(mbmi)); PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const BLOCK_SIZE bsize = mbmi->sb_type; + assert(bsize >= BLOCK_8X8); int this_rate; int64_t this_rd; int colors_u, colors_v, colors; @@ -5296,17 +5802,14 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, #endif // CONFIG_HIGHBITDEPTH #if CONFIG_PALETTE_DELTA_ENCODING - const MODE_INFO *above_mi = xd->above_mi; - const MODE_INFO *left_mi = xd->left_mi; uint16_t color_cache[2 * PALETTE_MAX_SIZE]; - const int n_cache = av1_get_palette_cache(above_mi, left_mi, 1, color_cache); + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); #endif // CONFIG_PALETTE_DELTA_ENCODING colors = colors_u > colors_v ? colors_u : colors_v; if (colors > 1 && colors <= 64) { int r, c, n, i, j; const int max_itr = 50; - uint8_t color_order[PALETTE_MAX_SIZE]; float lb_u, ub_u, val_u; float lb_v, ub_v, val_v; float *const data = x->palette_buffer->kmeans_data_buf; @@ -5402,7 +5905,7 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, if (tokenonly_rd_stats.rate == INT_MAX) continue; this_rate = tokenonly_rd_stats.rate + dc_mode_cost + - cpi->palette_uv_size_cost[bsize - BLOCK_8X8][n - PALETTE_MIN_SIZE] + + x->palette_uv_size_cost[bsize - BLOCK_8X8][n - PALETTE_MIN_SIZE] + write_uniform_cost(n, color_map[0]) + av1_cost_bit( av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 1); @@ -5411,17 +5914,8 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, color_cache, n_cache, #endif // CONFIG_PALETTE_DELTA_ENCODING cpi->common.bit_depth); - for (i = 0; i < rows; ++i) { - for (j = (i == 0 ? 
1 : 0); j < cols; ++j) { - int color_idx; - const int color_ctx = av1_get_palette_color_index_context( - color_map, plane_block_width, i, j, n, color_order, &color_idx); - assert(color_idx >= 0 && color_idx < n); - this_rate += cpi->palette_uv_color_cost[n - PALETTE_MIN_SIZE] - [color_ctx][color_idx]; - } - } - + this_rate += + av1_cost_color_map(x, 1, 0, bsize, mbmi->tx_size, PALETTE_MAP); this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; @@ -5438,10 +5932,10 @@ static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, } if (best_mbmi->palette_mode_info.palette_size[1] > 0) { memcpy(color_map, best_palette_color_map, - rows * cols * sizeof(best_palette_color_map[0])); + plane_block_width * plane_block_height * + sizeof(best_palette_color_map[0])); } } -#endif // CONFIG_PALETTE #if CONFIG_FILTER_INTRA // Return 1 if an filter intra mode is selected; return 0 otherwise. @@ -5461,9 +5955,7 @@ static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, av1_zero(filter_intra_mode_info); mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 1; mbmi->uv_mode = UV_DC_PRED; -#if CONFIG_PALETTE mbmi->palette_mode_info.palette_size[1] = 0; -#endif // CONFIG_PALETTE for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) { mbmi->filter_intra_mode_info.filter_intra_mode[1] = mode; @@ -5472,7 +5964,7 @@ static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, this_rate = tokenonly_rd_stats.rate + av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 1) + - cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] + + x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] + write_uniform_cost(FILTER_INTRA_MODES, mode); this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { @@ -5586,11 +6078,10 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, #endif // CONFIG_EXT_INTRA #if CONFIG_CFL -static int64_t cfl_alpha_dist(const uint8_t *y_pix, int y_stride, - const int y_averages_q3[MAX_NUM_TXB], - const uint8_t *src, int src_stride, int width, - int height, TX_SIZE tx_size, int dc_pred, - int alpha_q3, int64_t *dist_neg_out) { +static int64_t cfl_alpha_dist_lbd(const int16_t *pred_buf_q3, + const uint8_t *src, int src_stride, int width, + int height, int dc_pred, int alpha_q3, + int64_t *dist_neg_out) { int64_t dist = 0; int diff; @@ -5609,63 +6100,87 @@ static int64_t cfl_alpha_dist(const uint8_t *y_pix, int y_stride, } int64_t dist_neg = 0; - const int tx_height = tx_size_high[tx_size]; - const int tx_width = tx_size_wide[tx_size]; - const int y_block_row_off = y_stride * tx_height; - const int src_block_row_off = src_stride * tx_height; - const uint8_t *t_y_pix; - const uint8_t *t_src; - int a = 0; - for (int b_j = 0; b_j < height; b_j += tx_height) { - const int h = b_j + tx_height; - for (int b_i = 0; b_i < width; b_i += tx_width) { - const int w = b_i + tx_width; - const int tx_avg_q3 = y_averages_q3[a++]; - t_y_pix = y_pix; - t_src = src; - for (int t_j = b_j; t_j < h; t_j++) { - for (int t_i = b_i; t_i < w; t_i++) { - const int uv = t_src[t_i]; - - const int scaled_luma = - get_scaled_luma_q0(alpha_q3, t_y_pix[t_i], tx_avg_q3); - - // TODO(ltrudeau) add support for HBD. - diff = uv - clamp(scaled_luma + dc_pred, 0, 255); - dist += diff * diff; - - // TODO(ltrudeau) add support for HBD. 
- diff = uv - clamp(-scaled_luma + dc_pred, 0, 255); - dist_neg += diff * diff; - } - t_y_pix += y_stride; - t_src += src_stride; - } + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + const int uv = src[i]; + const int scaled_luma = get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]); + + diff = uv - clip_pixel(scaled_luma + dc_pred); + dist += diff * diff; + + diff = uv - clip_pixel(-scaled_luma + dc_pred); + dist_neg += diff * diff; } - y_pix += y_block_row_off; - src += src_block_row_off; + pred_buf_q3 += MAX_SB_SIZE; + src += src_stride; } if (dist_neg_out) *dist_neg_out = dist_neg; return dist; } +#if CONFIG_HIGHBITDEPTH +static int64_t cfl_alpha_dist_hbd(const int16_t *pred_buf_q3, + const uint16_t *src, int src_stride, + int width, int height, int dc_pred, + int alpha_q3, int bit_depth, + int64_t *dist_neg_out) { + const int shift = 2 * (bit_depth - 8); + const int rounding = shift > 0 ? (1 << shift) >> 1 : 0; + int64_t dist = 0; + int diff; -static inline void cfl_update_costs(CFL_CTX *cfl, FRAME_CONTEXT *ec_ctx) { - assert(ec_ctx->cfl_alpha_cdf[CFL_ALPHABET_SIZE - 1] == - AOM_ICDF(CDF_PROB_TOP)); + if (alpha_q3 == 0) { + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + diff = src[i] - dc_pred; + dist += diff * diff; + } + src += src_stride; + } + dist = (dist + rounding) >> shift; - aom_cdf_prob prev_cdf = 0; + if (dist_neg_out) *dist_neg_out = dist; - for (int c = 0; c < CFL_ALPHABET_SIZE; c++) { - const int sign_bit_cost = (cfl_alpha_codes[c][CFL_PRED_U] != 0) + - (cfl_alpha_codes[c][CFL_PRED_V] != 0); + return dist; + } + + int64_t dist_neg = 0; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + const int uv = src[i]; + const int scaled_luma = get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]); - aom_cdf_prob prob = AOM_ICDF(ec_ctx->cfl_alpha_cdf[c]) - prev_cdf; - prev_cdf = AOM_ICDF(ec_ctx->cfl_alpha_cdf[c]); + diff = uv - clip_pixel_highbd(scaled_luma + dc_pred, bit_depth); + dist += diff * diff; - cfl->costs[c] = av1_cost_symbol(prob) + av1_cost_literal(sign_bit_cost); + diff = uv - clip_pixel_highbd(-scaled_luma + dc_pred, bit_depth); + dist_neg += diff * diff; + } + pred_buf_q3 += MAX_SB_SIZE; + src += src_stride; + } + + if (dist_neg_out) *dist_neg_out = (dist_neg + rounding) >> shift; + + return (dist + rounding) >> shift; +} +#endif // CONFIG_HIGHBITDEPTH +static int64_t cfl_alpha_dist(const int16_t *pred_buf_q3, const uint8_t *src, + int src_stride, int width, int height, + int dc_pred, int alpha_q3, int use_hbd, + int bit_depth, int64_t *dist_neg_out) { +#if CONFIG_HIGHBITDEPTH + if (use_hbd) { + const uint16_t *src_16 = CONVERT_TO_SHORTPTR(src); + return cfl_alpha_dist_hbd(pred_buf_q3, src_16, src_stride, width, height, + dc_pred, alpha_q3, bit_depth, dist_neg_out); } +#endif // CONFIG_HIGHBITDEPTH + (void)use_hbd; + (void)bit_depth; + return cfl_alpha_dist_lbd(pred_buf_q3, src, src_stride, width, height, + dc_pred, alpha_q3, dist_neg_out); } static int cfl_rd_pick_alpha(MACROBLOCK *const x, TX_SIZE tx_size) { @@ -5677,7 +6192,6 @@ static int cfl_rd_pick_alpha(MACROBLOCK *const x, TX_SIZE tx_size) { const int src_stride_v = p_v->src.stride; MACROBLOCKD *const xd = &x->e_mbd; - FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; CFL_CTX *const cfl = xd->cfl; @@ -5686,74 +6200,71 @@ static int cfl_rd_pick_alpha(MACROBLOCK *const x, TX_SIZE tx_size) { const int height = cfl->uv_height; const int dc_pred_u = cfl->dc_pred[CFL_PRED_U]; const int dc_pred_v = cfl->dc_pred[CFL_PRED_V]; 
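The cfl_alpha_dist_* helpers introduced above score a CfL prediction, clip_pixel(dc_pred +/- alpha * luma_ac), against the chroma source, and they compute the distortion for both signs of alpha in a single pass. A simplified 8-bit sketch of that inner loop (free-standing and hypothetical; the q3 * q3 -> q6 rounding is what get_scaled_luma_q0() is assumed to perform, and the real HBD variant additionally rescales the sums by 2 * (bd - 8) bits):

#include <stdint.h>

static int clamp_u8(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

/* alpha (q3 fixed point) times luma AC (q3) gives q6; round to pixels. */
static int scaled_luma_q0(int alpha_q3, int16_t luma_ac_q3) {
  const int q6 = alpha_q3 * luma_ac_q3;
  return (q6 < 0) ? -((-q6 + 32) >> 6) : ((q6 + 32) >> 6);
}

/* Sum of squared errors against both signs of alpha in one pass. */
static void cfl_sse_both_signs(const int16_t *ac_q3, int ac_stride,
                               const uint8_t *src, int src_stride, int w,
                               int h, int dc_pred, int alpha_q3,
                               int64_t *sse_pos, int64_t *sse_neg) {
  int64_t pos = 0, neg = 0;
  for (int j = 0; j < h; ++j) {
    for (int i = 0; i < w; ++i) {
      const int s = scaled_luma_q0(alpha_q3, ac_q3[i]);
      const int dp = src[i] - clamp_u8(dc_pred + s);
      const int dn = src[i] - clamp_u8(dc_pred - s);
      pos += (int64_t)dp * dp;
      neg += (int64_t)dn * dn;
    }
    ac_q3 += ac_stride; /* the real buffer uses a MAX_SB_SIZE stride */
    src += src_stride;
  }
  *sse_pos = pos;
  *sse_neg = neg;
}

Scoring both signs per pass is what lets the search below fill sse[plane][m] and sse[plane][m + 1] with one call per alpha magnitude; the per-transform-block averaging it replaces is the code removed next: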
- const int *y_averages_q3 = cfl->y_averages_q3; - const uint8_t *y_pix = cfl->y_down_pix; - - CFL_SIGN_TYPE *signs = mbmi->cfl_alpha_signs; - - cfl_update_costs(cfl, ec_ctx); + const int16_t *pred_buf_q3 = cfl->pred_buf_q3; + const int use_hbd = get_bitdepth_data_path_index(xd); int64_t sse[CFL_PRED_PLANES][CFL_MAGS_SIZE]; sse[CFL_PRED_U][0] = - cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, - width, height, tx_size, dc_pred_u, 0, NULL); + cfl_alpha_dist(pred_buf_q3, src_u, src_stride_u, width, height, dc_pred_u, + 0, use_hbd, xd->bd, NULL); sse[CFL_PRED_V][0] = - cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, - width, height, tx_size, dc_pred_v, 0, NULL); + cfl_alpha_dist(pred_buf_q3, src_v, src_stride_v, width, height, dc_pred_v, + 0, use_hbd, xd->bd, NULL); - for (int m = 1; m < CFL_MAGS_SIZE; m += 2) { - assert(cfl_alpha_mags_q3[m + 1] == -cfl_alpha_mags_q3[m]); + for (int c = 0; c < CFL_ALPHABET_SIZE; c++) { + const int m = c * 2 + 1; + const int abs_alpha_q3 = c + 1; sse[CFL_PRED_U][m] = cfl_alpha_dist( - y_pix, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, width, height, - tx_size, dc_pred_u, cfl_alpha_mags_q3[m], &sse[CFL_PRED_U][m + 1]); + pred_buf_q3, src_u, src_stride_u, width, height, dc_pred_u, + abs_alpha_q3, use_hbd, xd->bd, &sse[CFL_PRED_U][m + 1]); sse[CFL_PRED_V][m] = cfl_alpha_dist( - y_pix, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, width, height, - tx_size, dc_pred_v, cfl_alpha_mags_q3[m], &sse[CFL_PRED_V][m + 1]); + pred_buf_q3, src_v, src_stride_v, width, height, dc_pred_v, + abs_alpha_q3, use_hbd, xd->bd, &sse[CFL_PRED_V][m + 1]); } int64_t dist; int64_t cost; - int64_t best_cost; + int64_t best_cost = INT64_MAX; + int best_rate = 0; // Compute least squares parameter of the entire block - // IMPORTANT: We assume that the first code is 0,0 int ind = 0; - signs[CFL_PRED_U] = CFL_SIGN_POS; - signs[CFL_PRED_V] = CFL_SIGN_POS; - - dist = sse[CFL_PRED_U][0] + sse[CFL_PRED_V][0]; - dist *= 16; - best_cost = RDCOST(x->rdmult, cfl->costs[0], dist); - - for (int c = 1; c < CFL_ALPHABET_SIZE; c++) { - const int idx_u = cfl_alpha_codes[c][CFL_PRED_U]; - const int idx_v = cfl_alpha_codes[c][CFL_PRED_V]; - for (CFL_SIGN_TYPE sign_u = idx_u == 0; sign_u < CFL_SIGNS; sign_u++) { - for (CFL_SIGN_TYPE sign_v = idx_v == 0; sign_v < CFL_SIGNS; sign_v++) { + int signs = 0; + + for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) { + const int sign_u = CFL_SIGN_U(joint_sign); + const int sign_v = CFL_SIGN_V(joint_sign); + const int size_u = (sign_u == CFL_SIGN_ZERO) ? 1 : CFL_ALPHABET_SIZE; + const int size_v = (sign_v == CFL_SIGN_ZERO) ? 1 : CFL_ALPHABET_SIZE; + for (int u = 0; u < size_u; u++) { + const int idx_u = (sign_u == CFL_SIGN_ZERO) ? 0 : u * 2 + 1; + for (int v = 0; v < size_v; v++) { + const int idx_v = (sign_v == CFL_SIGN_ZERO) ? 
0 : v * 2 + 1; dist = sse[CFL_PRED_U][idx_u + (sign_u == CFL_SIGN_NEG)] + sse[CFL_PRED_V][idx_v + (sign_v == CFL_SIGN_NEG)]; dist *= 16; - cost = RDCOST(x->rdmult, cfl->costs[c], dist); + const int rate = x->cfl_cost[joint_sign][CFL_PRED_U][u] + + x->cfl_cost[joint_sign][CFL_PRED_V][v]; + cost = RDCOST(x->rdmult, rate, dist); if (cost < best_cost) { best_cost = cost; - ind = c; - signs[CFL_PRED_U] = sign_u; - signs[CFL_PRED_V] = sign_v; + best_rate = rate; + ind = (u << CFL_ALPHABET_SIZE_LOG2) + v; + signs = joint_sign; } } } } mbmi->cfl_alpha_idx = ind; - return cfl->costs[ind]; + mbmi->cfl_alpha_signs = signs; + return best_rate; } #endif // CONFIG_CFL static void init_sbuv_mode(MB_MODE_INFO *const mbmi) { mbmi->uv_mode = UV_DC_PRED; -#if CONFIG_PALETTE mbmi->palette_mode_info.palette_size[1] = 0; -#endif // CONFIG_PALETTE #if CONFIG_FILTER_INTRA mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; #endif // CONFIG_FILTER_INTRA @@ -5772,9 +6283,9 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, od_rollback_buffer buf; od_encode_checkpoint(&x->daala_enc, &buf); #endif // CONFIG_PVQ -#if CONFIG_PALETTE PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; -#endif // CONFIG_PALETTE + const int try_palette = + av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type); for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) { int this_rate; @@ -5782,7 +6293,7 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx]; #if CONFIG_EXT_INTRA const int is_directional_mode = - av1_is_directional_mode(mode, mbmi->sb_type); + av1_is_directional_mode(get_uv_mode(mode), mbmi->sb_type); #endif // CONFIG_EXT_INTRA if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] & (1 << mode))) @@ -5791,7 +6302,8 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, mbmi->uv_mode = mode; #if CONFIG_CFL int cfl_alpha_rate = 0; - if (mode == UV_DC_PRED) { + if (mode == UV_CFL_PRED) { + assert(!is_directional_mode); const TX_SIZE uv_tx_size = av1_get_uv_tx_size(mbmi, &xd->plane[1]); cfl_alpha_rate = cfl_rd_pick_alpha(x, uv_tx_size); } @@ -5799,7 +6311,7 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, #if CONFIG_EXT_INTRA mbmi->angle_delta[1] = 0; if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) { - const int rate_overhead = cpi->intra_uv_mode_cost[mbmi->mode][mode] + + const int rate_overhead = x->intra_uv_mode_cost[mbmi->mode][mode] + write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, 0); if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd, &this_rate, &tokenonly_rd_stats)) @@ -5816,10 +6328,10 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } #endif // CONFIG_EXT_INTRA this_rate = - tokenonly_rd_stats.rate + cpi->intra_uv_mode_cost[mbmi->mode][mode]; + tokenonly_rd_stats.rate + x->intra_uv_mode_cost[mbmi->mode][mode]; #if CONFIG_CFL - if (mode == UV_DC_PRED) { + if (mode == UV_CFL_PRED) { this_rate += cfl_alpha_rate; } #endif @@ -5830,15 +6342,12 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } #endif // CONFIG_EXT_INTRA #if CONFIG_FILTER_INTRA - if (mbmi->sb_type >= BLOCK_8X8 && mode == DC_PRED) + if (mbmi->sb_type >= BLOCK_8X8 && mode == UV_DC_PRED) this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 0); #endif // CONFIG_FILTER_INTRA -#if CONFIG_PALETTE - if 
(cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8 && - mode == UV_DC_PRED) + if (try_palette && mode == UV_DC_PRED) this_rate += av1_cost_bit( av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 0); -#endif // CONFIG_PALETTE #if CONFIG_PVQ od_encode_rollback(&x->daala_enc, &buf); @@ -5855,15 +6364,13 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } } -#if CONFIG_PALETTE - if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8) { + if (try_palette) { uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map; rd_pick_palette_intra_sbuv(cpi, x, - cpi->intra_uv_mode_cost[mbmi->mode][UV_DC_PRED], + x->intra_uv_mode_cost[mbmi->mode][UV_DC_PRED], best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly, distortion, skippable); } -#endif // CONFIG_PALETTE #if CONFIG_FILTER_INTRA if (mbmi->sb_type >= BLOCK_8X8) { @@ -5880,19 +6387,17 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, - PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize, - TX_SIZE max_tx_size, int *rate_uv, - int *rate_uv_tokenonly, int64_t *dist_uv, - int *skip_uv, UV_PREDICTION_MODE *mode_uv) { + BLOCK_SIZE bsize, TX_SIZE max_tx_size, + int *rate_uv, int *rate_uv_tokenonly, + int64_t *dist_uv, int *skip_uv, + UV_PREDICTION_MODE *mode_uv) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; // Use an estimated rd for uv_intra based on DC_PRED if the // appropriate speed flag is set. - (void)ctx; - init_sbuv_mode(&x->e_mbd.mi[0]->mbmi); + init_sbuv_mode(mbmi); #if CONFIG_CB4X4 -#if CONFIG_CHROMA_2X2 - rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, - bsize, max_tx_size); -#else +#if !CONFIG_CHROMA_2X2 if (x->skip_chroma_rd) { *rate_uv = 0; *rate_uv_tokenonly = 0; @@ -5901,32 +6406,47 @@ static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, *mode_uv = UV_DC_PRED; return; } - BLOCK_SIZE bs = scale_chroma_bsize(bsize, x->e_mbd.plane[1].subsampling_x, - x->e_mbd.plane[1].subsampling_y); - rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, - bs, max_tx_size); -#endif // CONFIG_CHROMA_2X2 + bsize = scale_chroma_bsize(bsize, xd->plane[AOM_PLANE_U].subsampling_x, + xd->plane[AOM_PLANE_U].subsampling_y); +#endif // !CONFIG_CHROMA_2X2 +#if CONFIG_CFL + // Only store reconstructed luma when there's chroma RDO. When there's no + // chroma RDO, the reconstructed luma will be stored in encode_superblock(). + xd->cfl->store_y = !x->skip_chroma_rd; +#endif // CONFIG_CFL #else - rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, - bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size); + bsize = bsize < BLOCK_8X8 ? 
BLOCK_8X8 : bsize; +#if CONFIG_CFL + xd->cfl->store_y = 1; +#endif // CONFIG_CFL #endif // CONFIG_CB4X4 - *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode; +#if CONFIG_CFL + if (xd->cfl->store_y) { + // Perform one extra call to txfm_rd_in_plane(), with the values chosen + // during luma RDO, so we can store reconstructed luma values + RD_STATS this_rd_stats; + txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y, + mbmi->sb_type, mbmi->tx_size, + cpi->sf.use_fast_coef_costing); + xd->cfl->store_y = 0; + } +#endif // CONFIG_CFL + rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, + bsize, max_tx_size); + *mode_uv = mbmi->uv_mode; } -static int cost_mv_ref(const AV1_COMP *const cpi, PREDICTION_MODE mode, +static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode, int16_t mode_context) { -#if CONFIG_EXT_INTER if (is_inter_compound_mode(mode)) { - return cpi + return x ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)]; #if CONFIG_COMPOUND_SINGLEREF } else if (is_inter_singleref_comp_mode(mode)) { - return cpi - ->inter_singleref_comp_mode_cost[mode_context] - [INTER_SINGLEREF_COMP_OFFSET(mode)]; + return x->inter_singleref_comp_mode_cost[mode_context] + [INTER_SINGLEREF_COMP_OFFSET(mode)]; #endif // CONFIG_COMPOUND_SINGLEREF } -#endif int mode_cost = 0; int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; @@ -5935,32 +6455,32 @@ static int cost_mv_ref(const AV1_COMP *const cpi, PREDICTION_MODE mode, assert(is_inter_mode(mode)); if (mode == NEWMV) { - mode_cost = cpi->newmv_mode_cost[mode_ctx][0]; + mode_cost = x->newmv_mode_cost[mode_ctx][0]; return mode_cost; } else { - mode_cost = cpi->newmv_mode_cost[mode_ctx][1]; + mode_cost = x->newmv_mode_cost[mode_ctx][1]; mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK; if (is_all_zero_mv) return mode_cost; if (mode == ZEROMV) { - mode_cost += cpi->zeromv_mode_cost[mode_ctx][0]; + mode_cost += x->zeromv_mode_cost[mode_ctx][0]; return mode_cost; } else { - mode_cost += cpi->zeromv_mode_cost[mode_ctx][1]; + mode_cost += x->zeromv_mode_cost[mode_ctx][1]; mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6; if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7; if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8; - mode_cost += cpi->refmv_mode_cost[mode_ctx][mode != NEARESTMV]; + mode_cost += x->refmv_mode_cost[mode_ctx][mode != NEARESTMV]; return mode_cost; } } } -#if CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT) +#if (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT) static int get_interinter_compound_type_bits(BLOCK_SIZE bsize, COMPOUND_TYPE comp_type) { (void)bsize; @@ -5975,7 +6495,7 @@ static int get_interinter_compound_type_bits(BLOCK_SIZE bsize, default: assert(0); return 0; } } -#endif // CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT) +#endif // (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT) typedef struct { int eobs; @@ -5986,9 +6506,7 @@ typedef struct { int64_t brdcost; int_mv mvs[2]; int_mv pred_mv[2]; -#if CONFIG_EXT_INTER int_mv ref_mv[2]; -#endif // CONFIG_EXT_INTER #if CONFIG_CHROMA_2X2 ENTROPY_CONTEXT ta[4]; @@ -6009,16 +6527,12 @@ typedef struct { int64_t sse; int segment_yrate; PREDICTION_MODE modes[4]; -#if CONFIG_EXT_INTER #if CONFIG_COMPOUND_SINGLEREF SEG_RDSTAT rdstat[4][INTER_MODES + INTER_SINGLEREF_COMP_MODES + INTER_COMPOUND_MODES]; #else // !CONFIG_COMPOUND_SINGLEREF SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES]; #endif // 
CONFIG_COMPOUND_SINGLEREF -#else // !CONFIG_EXT_INTER - SEG_RDSTAT rdstat[4][INTER_MODES]; -#endif // CONFIG_EXT_INTER int mvthresh; } BEST_SEG_INFO; @@ -6032,10 +6546,9 @@ static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) { // Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way to encode zero motion. // TODO(aconverse): Find out if this is still productive, then clean up or remove static int check_best_zero_mv( - const AV1_COMP *const cpi, const int16_t mode_context[TOTAL_REFS_PER_FRAME], -#if CONFIG_EXT_INTER + const AV1_COMP *const cpi, const MACROBLOCK *const x, + const int16_t mode_context[TOTAL_REFS_PER_FRAME], const int16_t compound_mode_context[TOTAL_REFS_PER_FRAME], -#endif // CONFIG_EXT_INTER int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], int this_mode, const MV_REFERENCE_FRAME ref_frames[2], const BLOCK_SIZE bsize, int block, int mi_row, int mi_col) { @@ -6045,34 +6558,33 @@ static int check_best_zero_mv( #endif (void)mi_row; (void)mi_col; + (void)cpi; #if CONFIG_GLOBAL_MOTION - if (this_mode == ZEROMV -#if CONFIG_EXT_INTER - || this_mode == ZERO_ZEROMV -#endif // CONFIG_EXT_INTER - ) { + if (this_mode == ZEROMV || this_mode == ZERO_ZEROMV) { for (int cur_frm = 0; cur_frm < 1 + comp_pred_mode; cur_frm++) { zeromv[cur_frm].as_int = gm_get_motion_vector(&cpi->common.global_motion[ref_frames[cur_frm]], cpi->common.allow_high_precision_mv, bsize, - mi_col, mi_row, block) + mi_col, mi_row, block +#if CONFIG_AMVR + , + cpi->common.cur_frame_mv_precision_level +#endif + ) .as_int; } } #endif // CONFIG_GLOBAL_MOTION -#if !CONFIG_EXT_INTER - assert(ref_frames[1] != INTRA_FRAME); // Just sanity check -#endif // !CONFIG_EXT_INTER if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int && (ref_frames[1] <= INTRA_FRAME || frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int)) { int16_t rfc = av1_mode_context_analyzer(mode_context, ref_frames, bsize, block); - int c1 = cost_mv_ref(cpi, NEARMV, rfc); - int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); - int c3 = cost_mv_ref(cpi, ZEROMV, rfc); + int c1 = cost_mv_ref(x, NEARMV, rfc); + int c2 = cost_mv_ref(x, NEARESTMV, rfc); + int c3 = cost_mv_ref(x, ZEROMV, rfc); if (this_mode == NEARMV) { if (c1 > c3) return 0; @@ -6092,16 +6604,14 @@ static int check_best_zero_mv( return 0; } } - } -#if CONFIG_EXT_INTER - else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV || - this_mode == ZERO_ZEROMV) && - frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int && - frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int) { + } else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV || + this_mode == ZERO_ZEROMV) && + frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int && + frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int) { int16_t rfc = compound_mode_context[ref_frames[0]]; - int c2 = cost_mv_ref(cpi, NEAREST_NEARESTMV, rfc); - int c3 = cost_mv_ref(cpi, ZERO_ZEROMV, rfc); - int c5 = cost_mv_ref(cpi, NEAR_NEARMV, rfc); + int c2 = cost_mv_ref(x, NEAREST_NEARESTMV, rfc); + int c3 = cost_mv_ref(x, ZERO_ZEROMV, rfc); + int c5 = cost_mv_ref(x, NEAR_NEARMV, rfc); if (this_mode == NEAREST_NEARESTMV) { if (c2 > c3) return 0; @@ -6116,45 +6626,42 @@ static int check_best_zero_mv( return 0; } } -#endif // CONFIG_EXT_INTER return 1; } static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv, -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF 
+#if CONFIG_COMPOUND_SINGLEREF int_mv *frame_comp_mv, -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF int mi_row, int mi_col, -#if CONFIG_EXT_INTER int_mv *ref_mv_sub8x8[2], const uint8_t *mask, - int mask_stride, -#endif // CONFIG_EXT_INTER - int *rate_mv, const int block) { + int mask_stride, int *rate_mv, + const int block) { const AV1_COMMON *const cm = &cpi->common; const int pw = block_size_wide[bsize]; const int ph = block_size_high[bsize]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; // This function should only ever be called for compound modes -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF if (!has_second_ref(mbmi)) { assert(is_inter_singleref_comp_mode(mbmi->mode)); assert(frame_comp_mv); } assert(has_second_ref(mbmi) || is_inter_singleref_comp_mode(mbmi->mode)); - const int refs[2] = { mbmi->ref_frame[0], has_second_ref(mbmi) - ? mbmi->ref_frame[1] - : mbmi->ref_frame[0] }; + const int refs[2] = { mbmi->ref_frame[0], + has_second_ref(mbmi) ? mbmi->ref_frame[1] + : mbmi->ref_frame[0] }; #else assert(has_second_ref(mbmi)); const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] }; -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF int_mv ref_mv[2]; int ite, ref; struct scale_factors sf; #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION - // ic and ir are the 4x4 coordiantes of the sub8x8 at index "block" + // ic and ir are the 4x4 coordinates of the sub8x8 at index "block" const int ic = block & 1; const int ir = (block - ic) >> 1; struct macroblockd_plane *const pd = &xd->plane[0]; @@ -6162,18 +6669,19 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir; #if CONFIG_GLOBAL_MOTION int is_global[2]; -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { +#if CONFIG_COMPOUND_SINGLEREF + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) #else - for (ref = 0; ref < 2; ++ref) { -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + for (ref = 0; ref < 2; ++ref) +#endif // CONFIG_COMPOUND_SINGLEREF + { WarpedMotionParams *const wm = &xd->global_motion[xd->mi[0]->mbmi.ref_frame[ref]]; is_global[ref] = is_global_mv_block(xd->mi[0], block, wm->wmtype); } -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF if (!has_second_ref(mbmi)) is_global[1] = is_global[0]; -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF #endif // CONFIG_GLOBAL_MOTION #else // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION (void)block; @@ -6195,20 +6703,21 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]); #endif // CONFIG_HIGHBITDEPTH -#if CONFIG_EXT_INTER && CONFIG_CB4X4 +#if CONFIG_CB4X4 (void)ref_mv_sub8x8; -#endif // CONFIG_EXT_INTER && CONFIG_CB4X4 +#endif // CONFIG_CB4X4 -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { +#if CONFIG_COMPOUND_SINGLEREF + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) #else - for (ref = 0; ref < 2; ++ref) { -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF -#if CONFIG_EXT_INTER && !CONFIG_CB4X4 + for (ref = 0; ref < 2; ++ref) +#endif // CONFIG_COMPOUND_SINGLEREF + { +#if !CONFIG_CB4X4 if (bsize < BLOCK_8X8 && ref_mv_sub8x8 != NULL) ref_mv[ref].as_int = 
ref_mv_sub8x8[ref]->as_int; else -#endif // CONFIG_EXT_INTER && !CONFIG_CB4X4 +#endif // !CONFIG_CB4X4 ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0]; if (scaled_ref_frame[ref]) { @@ -6223,7 +6732,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, } } -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF if (!has_second_ref(mbmi)) { assert(is_inter_singleref_comp_mode(mbmi->mode)); // NOTE: For single ref comp mode, set up the 2nd set of ref_mv/pre_planes @@ -6239,7 +6748,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, av1_setup_pre_planes(xd, 1, scaled_ref_frame[0], mi_row, mi_col, NULL); } } -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF // Since we have scaled the reference frames to match the size of the current // frame we must use a unit scaling factor during mode selection. @@ -6253,14 +6762,15 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, // Allow joint search multiple times iteratively for each reference frame // and break out of the search loop if it couldn't find a better mv. -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF const int num_ites = (has_second_ref(mbmi) || mbmi->mode == SR_NEW_NEWMV) ? 4 : 1; const int start_ite = has_second_ref(mbmi) ? 0 : 1; - for (ite = start_ite; ite < (start_ite + num_ites); ite++) { + for (ite = start_ite; ite < (start_ite + num_ites); ite++) #else - for (ite = 0; ite < 4; ite++) { -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + for (ite = 0; ite < 4; ite++) +#endif // CONFIG_COMPOUND_SINGLEREF + { struct buf_2d ref_yv12[2]; int bestsme = INT_MAX; int sadpb = x->sadperbit16; @@ -6288,23 +6798,23 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, ref_yv12[1] = xd->plane[plane].pre[1]; // Get the prediction block from the 'other' reference frame. -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF MV *const the_other_mv = (has_second_ref(mbmi) || id) ? 
&frame_mv[refs[!id]].as_mv : &frame_comp_mv[refs[0]].as_mv; -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF #if CONFIG_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); av1_highbd_build_inter_predictor( ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF the_other_mv, -#else // !(CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF) +#else // !(CONFIG_COMPOUND_SINGLEREF) &frame_mv[refs[!id]].as_mv, -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF - &sf, pw, ph, 0, mbmi->interp_filter, +#endif // CONFIG_COMPOUND_SINGLEREF + &sf, pw, ph, 0, mbmi->interp_filters, #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION &warp_types, p_col, p_row, #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION @@ -6314,12 +6824,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, #endif // CONFIG_HIGHBITDEPTH av1_build_inter_predictor( ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF the_other_mv, -#else // !(CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF) +#else // !(CONFIG_COMPOUND_SINGLEREF) &frame_mv[refs[!id]].as_mv, -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF - &sf, pw, ph, &conv_params, mbmi->interp_filter, +#endif // CONFIG_COMPOUND_SINGLEREF + &sf, pw, ph, &conv_params, mbmi->interp_filters, #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION &warp_types, p_col, p_row, plane, !id, #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION @@ -6334,74 +6844,75 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, // Use the mv result from the single mode as mv predictor. // Use the mv result from the single mode as mv predictor. -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF if (!has_second_ref(mbmi) && id) *best_mv = frame_comp_mv[refs[0]].as_mv; else -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF *best_mv = frame_mv[refs[id]].as_mv; best_mv->col >>= 3; best_mv->row >>= 3; -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF if (!has_second_ref(mbmi)) av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); else -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF av1_set_mvcost(x, refs[id], id, mbmi->ref_mv_idx); // Small-range full-pixel motion search. - bestsme = - av1_refining_search_8p_c(x, sadpb, search_range, &cpi->fn_ptr[bsize], -#if CONFIG_EXT_INTER - mask, mask_stride, id, -#endif - &ref_mv[id].as_mv, second_pred); + bestsme = av1_refining_search_8p_c(x, sadpb, search_range, + &cpi->fn_ptr[bsize], mask, mask_stride, + id, &ref_mv[id].as_mv, second_pred); if (bestsme < INT_MAX) { -#if CONFIG_EXT_INTER if (mask) bestsme = av1_get_mvpred_mask_var(x, best_mv, &ref_mv[id].as_mv, second_pred, mask, mask_stride, id, &cpi->fn_ptr[bsize], 1); else -#endif bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv, second_pred, &cpi->fn_ptr[bsize], 1); } x->mv_limits = tmp_mv_limits; - if (bestsme < INT_MAX) { +#if CONFIG_AMVR + if (cpi->common.cur_frame_mv_precision_level) { + x->best_mv.as_mv.row *= 8; + x->best_mv.as_mv.col *= 8; + } + if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0) +#else + if (bestsme < INT_MAX) +#endif + { int dis; /* TODO: use dis in distortion calculation later. 
*/ unsigned int sse; bestsme = cpi->find_fractional_mv_step( x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, -#if CONFIG_EXT_INTER - mask, mask_stride, id, -#endif - pw, ph, cpi->sf.use_upsampled_references); + &dis, &sse, second_pred, mask, mask_stride, id, pw, ph, + cpi->sf.use_upsampled_references); } // Restore the pointer to the first (possibly scaled) prediction buffer. if (id) xd->plane[plane].pre[0] = ref_yv12[0]; if (bestsme < last_besterr[id]) { -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF // NOTE: For single ref comp mode, frame_mv stores the first mv and // frame_comp_mv stores the second mv. if (!has_second_ref(mbmi) && id) frame_comp_mv[refs[0]].as_mv = *best_mv; else -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF frame_mv[refs[id]].as_mv = *best_mv; last_besterr[id] = bestsme; -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF if (!has_second_ref(mbmi)) last_besterr[!id] = last_besterr[id]; -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF } else { break; } @@ -6409,11 +6920,12 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, *rate_mv = 0; -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF - for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { +#if CONFIG_COMPOUND_SINGLEREF + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) #else - for (ref = 0; ref < 2; ++ref) { -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF + for (ref = 0; ref < 2; ++ref) +#endif // CONFIG_COMPOUND_SINGLEREF + { if (scaled_ref_frame[ref]) { // Restore the prediction frame pointers to their unscaled versions. int i; @@ -6421,14 +6933,14 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, xd->plane[i].pre[ref] = backup_yv12[ref][i]; } -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF if (!has_second_ref(mbmi)) av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx); else -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx); -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF if (!has_second_ref(mbmi)) { // NOTE: For single ref comp mode, i.e. 
!has_second_ref(mbmi) is true, the // first mv is stored in frame_mv[] and the second mv is stored in @@ -6442,25 +6954,25 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } else { -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF -#if CONFIG_EXT_INTER && !CONFIG_CB4X4 +#endif // CONFIG_COMPOUND_SINGLEREF +#if !CONFIG_CB4X4 if (bsize >= BLOCK_8X8) -#endif // CONFIG_EXT_INTER && !CONFIG_CB4X4 +#endif // !CONFIG_CB4X4 *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv, &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); -#if CONFIG_EXT_INTER && !CONFIG_CB4X4 +#if !CONFIG_CB4X4 else *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv, &ref_mv_sub8x8[ref]->as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); -#endif // CONFIG_EXT_INTER && !CONFIG_CB4X4 -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // !CONFIG_CB4X4 +#if CONFIG_COMPOUND_SINGLEREF } -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF } -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF if (!has_second_ref(mbmi)) { if (scaled_ref_frame[0]) { // Restore the prediction frame pointers to their unscaled versions. @@ -6469,7 +6981,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, xd->plane[i].pre[1] = backup_yv12[1][i]; } } -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF } static void estimate_ref_frame_costs( @@ -6516,6 +7028,7 @@ static void estimate_ref_frame_costs( aom_prob ref_single_p3 = av1_get_pred_prob_single_ref_p3(cm, xd); aom_prob ref_single_p4 = av1_get_pred_prob_single_ref_p4(cm, xd); aom_prob ref_single_p5 = av1_get_pred_prob_single_ref_p5(cm, xd); + aom_prob ref_single_p6 = av1_get_pred_prob_single_ref_p6(cm, xd); #endif // CONFIG_EXT_REFS unsigned int base_cost = av1_cost_bit(intra_inter_p, 1); @@ -6523,7 +7036,7 @@ static void estimate_ref_frame_costs( ref_costs_single[LAST_FRAME] = #if CONFIG_EXT_REFS ref_costs_single[LAST2_FRAME] = ref_costs_single[LAST3_FRAME] = - ref_costs_single[BWDREF_FRAME] = + ref_costs_single[BWDREF_FRAME] = ref_costs_single[ALTREF2_FRAME] = #endif // CONFIG_EXT_REFS ref_costs_single[GOLDEN_FRAME] = ref_costs_single[ALTREF_FRAME] = base_cost; @@ -6534,6 +7047,7 @@ static void estimate_ref_frame_costs( ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p1, 0); ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 0); ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p1, 1); + ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p1, 1); ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1); ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p3, 0); @@ -6542,6 +7056,7 @@ static void estimate_ref_frame_costs( ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p3, 1); ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p2, 0); + ref_costs_single[ALTREF2_FRAME] += av1_cost_bit(ref_single_p2, 0); ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1); ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p4, 0); @@ -6549,6 +7064,9 @@ static void estimate_ref_frame_costs( ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p5, 0); ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p5, 1); + + ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p6, 0); + ref_costs_single[ALTREF2_FRAME] += 
av1_cost_bit(ref_single_p6, 1); #else // !CONFIG_EXT_REFS ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0); ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 1); @@ -6563,6 +7081,7 @@ static void estimate_ref_frame_costs( ref_costs_single[LAST2_FRAME] = 512; ref_costs_single[LAST3_FRAME] = 512; ref_costs_single[BWDREF_FRAME] = 512; + ref_costs_single[ALTREF2_FRAME] = 512; #endif // CONFIG_EXT_REFS ref_costs_single[GOLDEN_FRAME] = 512; ref_costs_single[ALTREF_FRAME] = 512; @@ -6574,6 +7093,7 @@ static void estimate_ref_frame_costs( aom_prob ref_comp_p1 = av1_get_pred_prob_comp_ref_p1(cm, xd); aom_prob ref_comp_p2 = av1_get_pred_prob_comp_ref_p2(cm, xd); aom_prob bwdref_comp_p = av1_get_pred_prob_comp_bwdref_p(cm, xd); + aom_prob bwdref_comp_p1 = av1_get_pred_prob_comp_bwdref_p1(cm, xd); #endif // CONFIG_EXT_REFS unsigned int base_cost = av1_cost_bit(intra_inter_p, 1); @@ -6589,7 +7109,8 @@ static void estimate_ref_frame_costs( #else base_cost; #endif // USE_UNI_COMP_REFS - ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF_FRAME] = 0; + ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0; + ref_bicomp_costs[ALTREF_FRAME] = 0; ref_bicomp_costs[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0); ref_bicomp_costs[LAST2_FRAME] += av1_cost_bit(ref_comp_p, 0); @@ -6603,14 +7124,18 @@ static void estimate_ref_frame_costs( ref_bicomp_costs[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1); ref_bicomp_costs[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0); + ref_bicomp_costs[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p, 0); ref_bicomp_costs[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1); - int ref0; + ref_bicomp_costs[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p1, 0); + ref_bicomp_costs[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p1, 1); + + int ref0, ref1; for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { - ref_costs_comp[ref0][BWDREF_FRAME] = - ref_bicomp_costs[ref0] + ref_bicomp_costs[BWDREF_FRAME]; - ref_costs_comp[ref0][ALTREF_FRAME] = - ref_bicomp_costs[ref0] + ref_bicomp_costs[ALTREF_FRAME]; + for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) { + ref_costs_comp[ref0][ref1] = + ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1]; + } } aom_prob uni_comp_ref_p = av1_get_pred_prob_uni_comp_ref_p(cm, xd); @@ -6642,7 +7167,8 @@ static void estimate_ref_frame_costs( ref_costs_comp[GOLDEN_FRAME] = base_cost; #if CONFIG_EXT_REFS - ref_costs_comp[BWDREF_FRAME] = ref_costs_comp[ALTREF_FRAME] = 0; + ref_costs_comp[BWDREF_FRAME] = ref_costs_comp[ALTREF2_FRAME] = + ref_costs_comp[ALTREF_FRAME] = 0; #endif // CONFIG_EXT_REFS #if CONFIG_EXT_REFS @@ -6660,7 +7186,11 @@ static void estimate_ref_frame_costs( // NOTE(zoeliu): BWDREF and ALTREF each add an extra cost by coding 1 // more bit. 
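Throughout estimate_ref_frame_costs(), each table entry accumulates per-branch bit costs down a small binary decision tree: av1_cost_bit(p, b) is the entropy cost of taking branch b when the probability of the zero branch is p. A toy, self-contained stand-in shows how such path costs add up (the tree shape and probabilities are invented, and the 512x scale is only meant to echo the shifted-bit units libaom's cost tables use):

#include <math.h>
#include <stdio.h>

/* Cost, in bits scaled by 512, of coding bit b when the zero branch has
   probability p_zero / 256. A floating-point stand-in for av1_cost_bit(). */
static int cost_bit_q9(int p_zero, int b) {
  const double prob = b ? 1.0 - p_zero / 256.0 : p_zero / 256.0;
  return (int)(-log2(prob) * 512.0 + 0.5);
}

int main(void) {
  /* Hypothetical tree: the first bit splits {A} from {B, C}, the second
     splits B from C; a symbol costs the sum of the bits along its path,
     just as the ref_costs_single/ref_costs_comp sums do here. */
  const int p1 = 200, p2 = 128; /* invented branch probabilities */
  printf("A = %d\n", cost_bit_q9(p1, 0));
  printf("B = %d\n", cost_bit_q9(p1, 1) + cost_bit_q9(p2, 0));
  printf("C = %d\n", cost_bit_q9(p1, 1) + cost_bit_q9(p2, 1));
  return 0;
}

Symbols deeper in the tree pay for every branch on the way down, which is the extra bit the NOTE above refers to; the compound-reference accumulation continues: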
ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0); + ref_costs_comp[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p, 0); ref_costs_comp[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1); + + ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p1, 0); + ref_costs_comp[ALTREF2_FRAME] += av1_cost_bit(bwdref_comp_p1, 1); #else // !CONFIG_EXT_REFS ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0); ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1); @@ -6668,10 +7198,10 @@ static void estimate_ref_frame_costs( #endif // CONFIG_EXT_COMP_REFS } else { #if CONFIG_EXT_COMP_REFS - int ref0; + int ref0, ref1; for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { - ref_costs_comp[ref0][BWDREF_FRAME] = 512; - ref_costs_comp[ref0][ALTREF_FRAME] = 512; + for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) + ref_costs_comp[ref0][ref1] = 512; } ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512; ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512; @@ -6683,6 +7213,7 @@ static void estimate_ref_frame_costs( ref_costs_comp[LAST2_FRAME] = 512; ref_costs_comp[LAST3_FRAME] = 512; ref_costs_comp[BWDREF_FRAME] = 512; + ref_costs_comp[ALTREF2_FRAME] = 512; ref_costs_comp[ALTREF_FRAME] = 512; #endif // CONFIG_EXT_REFS ref_costs_comp[GOLDEN_FRAME] = 512; @@ -6732,17 +7263,19 @@ static void setup_buffer_inter( // Gets an initial list of candidate vectors from neighbours and orders them av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame], mbmi_ext->ref_mv_stack[ref_frame], -#if CONFIG_EXT_INTER - mbmi_ext->compound_mode_context, -#endif // CONFIG_EXT_INTER - candidates, mi_row, mi_col, NULL, NULL, - mbmi_ext->mode_context); + mbmi_ext->compound_mode_context, candidates, mi_row, mi_col, + NULL, NULL, mbmi_ext->mode_context); - // Candidate refinement carried out at encoder and decoder +// Candidate refinement carried out at encoder and decoder +#if CONFIG_AMVR + av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates, + &frame_nearest_mv[ref_frame], &frame_near_mv[ref_frame], + cm->cur_frame_mv_precision_level); +#else av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates, &frame_nearest_mv[ref_frame], &frame_near_mv[ref_frame]); - +#endif // Further refinement that is encode side only to test the top few candidates // in full and choose the best as the centre point for subsequent searches. // The current implementation doesn't support scaling. @@ -6758,10 +7291,7 @@ static void setup_buffer_inter( static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, -#if CONFIG_EXT_INTER - int ref_idx, -#endif // CONFIG_EXT_INTER - int *rate_mv) { + int ref_idx, int *rate_mv) { MACROBLOCKD *xd = &x->e_mbd; const AV1_COMMON *cm = &cpi->common; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; @@ -6770,17 +7300,12 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, int step_param; int sadpb = x->sadperbit16; MV mvp_full; -#if CONFIG_EXT_INTER #if CONFIG_COMPOUND_SINGLEREF int ref = has_second_ref(mbmi) ? 
mbmi->ref_frame[ref_idx] : mbmi->ref_frame[0]; #else // !CONFIG_COMPOUND_SINGLEREF int ref = mbmi->ref_frame[ref_idx]; #endif // CONFIG_COMPOUND_SINGLEREF -#else // !CONFIG_EXT_INTER - int ref = mbmi->ref_frame[0]; - int ref_idx = 0; -#endif // CONFIG_EXT_INTER MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv; MvLimits tmp_mv_limits = x->mv_limits; @@ -6812,7 +7337,7 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, // Work out the size of the first step in the mv step search. // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc. if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) { - // Take wtd average of the step_params based on the last frame's + // Take the weighted average of the step_params based on the last frame's // max mv magnitude and that based on the best ref mvs of the current // block for the given reference. step_param = @@ -6834,10 +7359,13 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, int bhl = b_height_log2_lookup[bsize]; int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4); - if (tlevel < 5) step_param += 2; + if (tlevel < 5) { + step_param += 2; + step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 1); + } // prev_mv_sad is not setup for dynamically scaled frames. - if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) { + if (cpi->oxcf.resize_mode != RESIZE_RANDOM) { int i; for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) { if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) { @@ -6874,9 +7402,16 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, switch (mbmi->motion_mode) { case SIMPLE_TRANSLATION: #endif // CONFIG_MOTION_VAR +#if CONFIG_HASH_ME bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, cond_cost_list(cpi, cost_list), - &ref_mv, INT_MAX, 1); + &ref_mv, INT_MAX, 1, (MI_SIZE * mi_col), + (MI_SIZE * mi_row), 0); +#else + bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, + cond_cost_list(cpi, cost_list), &ref_mv, + INT_MAX, 1); +#endif #if CONFIG_MOTION_VAR break; case OBMC_CAUSAL: @@ -6891,7 +7426,15 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, x->mv_limits = tmp_mv_limits; +#if CONFIG_AMVR + if (cpi->common.cur_frame_mv_precision_level) { + x->best_mv.as_mv.row *= 8; + x->best_mv.as_mv.col *= 8; + } + if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0) { +#else if (bestsme < INT_MAX) { +#endif int dis; /* TODO: use dis in distortion calculation later. 
*/ #if CONFIG_MOTION_VAR switch (mbmi->motion_mode) { @@ -6908,11 +7451,8 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, -#if CONFIG_EXT_INTER - NULL, 0, 0, -#endif - pw, ph, 1); + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL, + 0, 0, pw, ph, 1); if (try_second) { const int minc = @@ -6936,11 +7476,7 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, - &dis, &x->pred_sse[ref], NULL, -#if CONFIG_EXT_INTER - NULL, 0, 0, -#endif - pw, ph, 1); + &dis, &x->pred_sse[ref], NULL, NULL, 0, 0, pw, ph, 1); if (this_var < best_mv_var) best_mv = x->best_mv.as_mv; x->best_mv.as_mv = best_mv; } @@ -6950,11 +7486,8 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, -#if CONFIG_EXT_INTER - NULL, 0, 0, -#endif - 0, 0, 0); + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL, + 0, 0, 0, 0, 0); } #if CONFIG_MOTION_VAR break; @@ -6994,7 +7527,6 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst) { } } -#if CONFIG_EXT_INTER static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const MV *other_mv, int mi_row, int mi_col, const int block, @@ -7013,7 +7545,7 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, struct scale_factors sf; #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION struct macroblockd_plane *const pd = &xd->plane[0]; - // ic and ir are the 4x4 coordiantes of the sub8x8 at index "block" + // ic and ir are the 4x4 coordinates of the sub8x8 at index "block" const int ic = block & 1; const int ir = (block - ic) >> 1; const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic; @@ -7079,7 +7611,7 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { av1_highbd_build_inter_predictor( ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, - 0, mbmi->interp_filter, + 0, mbmi->interp_filters, #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION &warp_types, p_col, p_row, #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION @@ -7088,7 +7620,7 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, #endif // CONFIG_HIGHBITDEPTH av1_build_inter_predictor( ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph, - &conv_params, mbmi->interp_filter, + &conv_params, mbmi->interp_filters, #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION &warp_types, p_col, p_row, plane, !ref_idx, #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION @@ -7197,7 +7729,15 @@ static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, x->mv_limits = tmp_mv_limits; +#if CONFIG_AMVR + if (cpi->common.cur_frame_mv_precision_level) { + x->best_mv.as_mv.row *= 8; + x->best_mv.as_mv.col *= 8; + } + if (bestsme < INT_MAX && cpi->common.cur_frame_mv_precision_level == 0) { +#else if (bestsme < INT_MAX) { 
+#endif int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; bestsme = cpi->find_fractional_mv_step( @@ -7339,9 +7879,8 @@ static void do_masked_motion_search_indexed( tmp_mv[1].as_int = frame_mv[rf[1]].as_int; } #endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE -#endif // CONFIG_EXT_INTER -// In some situations we want to discount tha pparent cost of a new motion +// In some situations we want to discount the apparent cost of a new motion // vector. Where there is a subtle motion field and especially where there is // low spatial complexity then it can be hard to cover the cost of a new motion // vector in a single block, even if that motion vector reduces distortion. @@ -7371,7 +7910,6 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); } -#if CONFIG_EXT_INTER #if CONFIG_WEDGE static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x, const BLOCK_SIZE bsize, const uint8_t *pred0, @@ -7416,7 +7954,6 @@ static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x, return (tl + br > 0); } #endif // CONFIG_WEDGE -#endif // CONFIG_EXT_INTER #if !CONFIG_DUAL_FILTER static InterpFilter predict_interp_filter( @@ -7440,19 +7977,17 @@ static InterpFilter predict_interp_filter( (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; if (pred_filter_search) { InterpFilter af = SWITCHABLE, lf = SWITCHABLE; - if (xd->up_available) af = xd->mi[-xd->mi_stride]->mbmi.interp_filter; - if (xd->left_available) lf = xd->mi[-1]->mbmi.interp_filter; + if (xd->up_available) + af = av1_extract_interp_filter( + xd->mi[-xd->mi_stride]->mbmi.interp_filters, 0); + if (xd->left_available) + lf = av1_extract_interp_filter(xd->mi[-1]->mbmi.interp_filters, 0); -#if CONFIG_EXT_INTER if ((this_mode != NEWMV && this_mode != NEW_NEWMV) || (af == lf)) -#else - if ((this_mode != NEWMV) || (af == lf)) -#endif // CONFIG_EXT_INTER best_filter = af; } if (is_comp_pred) { if (cpi->sf.adaptive_mode_search) { -#if CONFIG_EXT_INTER switch (this_mode) { case NEAREST_NEARESTMV: if (single_filter[NEARESTMV][refs[0]] == @@ -7495,11 +8030,6 @@ static InterpFilter predict_interp_filter( best_filter = single_filter[this_mode][refs[0]]; break; } -#else - if (single_filter[this_mode][refs[0]] == - single_filter[this_mode][refs[1]]) - best_filter = single_filter[this_mode][refs[0]]; -#endif // CONFIG_EXT_INTER } } if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) { @@ -7509,7 +8039,6 @@ static InterpFilter predict_interp_filter( } #endif // !CONFIG_DUAL_FILTER -#if CONFIG_EXT_INTER // Choose the best wedge index and sign #if CONFIG_WEDGE static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, @@ -7924,7 +8453,6 @@ static int64_t build_and_cost_compound_type( return best_rd_cur; } #endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE -#endif // CONFIG_EXT_INTER typedef struct { #if CONFIG_MOTION_VAR @@ -7935,23 +8463,21 @@ typedef struct { int left_pred_stride[MAX_MB_PLANE]; #endif // CONFIG_MOTION_VAR int_mv *single_newmv; -#if CONFIG_EXT_INTER // Pointer to array of motion vectors to use for each ref and their rates // Should point to first of 2 arrays in 2D array int *single_newmv_rate; // Pointer to array of predicted rate-distortion // Should point to first of 2 arrays in 2D array int64_t (*modelled_rd)[TOTAL_REFS_PER_FRAME]; -#endif // CONFIG_EXT_INTER InterpFilter single_filter[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; } HandleInterModeArgs; static int64_t handle_newmv(const AV1_COMP *const cpi, 
MACROBLOCK *const x, const BLOCK_SIZE bsize, int_mv (*const mode_mv)[TOTAL_REFS_PER_FRAME], -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF int_mv (*const mode_comp_mv)[TOTAL_REFS_PER_FRAME], -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF const int mi_row, const int mi_col, int *const rate_mv, int_mv *const single_newmv, HandleInterModeArgs *const args) { @@ -7960,13 +8486,11 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const int is_comp_pred = has_second_ref(mbmi); const PREDICTION_MODE this_mode = mbmi->mode; -#if CONFIG_EXT_INTER const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME); -#endif // CONFIG_EXT_INTER int_mv *const frame_mv = mode_mv[this_mode]; -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF int_mv *const frame_comp_mv = mode_comp_mv[this_mode]; -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] }; int i; @@ -7974,7 +8498,6 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, (void)args; if (is_comp_pred) { -#if CONFIG_EXT_INTER for (i = 0; i < 2; ++i) { single_newmv[refs[i]].as_int = args->single_newmv[refs[i]].as_int; } @@ -7985,9 +8508,9 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { joint_motion_search(cpi, x, bsize, frame_mv, -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF NULL, // int_mv *frame_comp_mv -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF mi_row, mi_col, NULL, NULL, 0, rate_mv, 0); } else { *rate_mv = 0; @@ -8034,24 +8557,7 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } } -#else // !CONFIG_EXT_INTER - // Initialize mv using single prediction mode result. 
- frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; - frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; - - if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { - joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, rate_mv, 0); - } else { - *rate_mv = 0; - for (i = 0; i < 2; ++i) { - av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx); - *rate_mv += av1_mv_bit_cost(&frame_mv[refs[i]].as_mv, - &mbmi_ext->ref_mvs[refs[i]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - } - } -#endif // CONFIG_EXT_INTER -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF } else if (is_inter_singleref_comp_mode(this_mode)) { // Single ref comp mode const int mode0 = compound_ref0_mode(this_mode); @@ -8085,9 +8591,8 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, &mbmi_ext->ref_mvs[refs[0]][0].as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF } else { -#if CONFIG_EXT_INTER if (is_comp_interintra_pred) { x->best_mv = args->single_newmv[refs[0]]; *rate_mv = args->single_newmv_rate[refs[0]]; @@ -8096,10 +8601,6 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, args->single_newmv[refs[0]] = x->best_mv; args->single_newmv_rate[refs[0]] = *rate_mv; } -#else - single_motion_search(cpi, x, bsize, mi_row, mi_col, rate_mv); - single_newmv[refs[0]] = x->best_mv; -#endif // CONFIG_EXT_INTER if (x->best_mv.as_int == INVALID_MV) return INT64_MAX; @@ -8149,7 +8650,7 @@ int64_t interpolation_filter_search( set_default_interp_filters(mbmi, assign_filter); - *switchable_rate = av1_get_switchable_rate(cpi, xd); + *switchable_rate = av1_get_switchable_rate(cm, x, xd); av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist, skip_txfm_sb, skip_sse_sb); @@ -8164,12 +8665,7 @@ int64_t interpolation_filter_search( const int filter_set_size = SWITCHABLE_FILTERS; #endif // CONFIG_DUAL_FILTER int best_in_temp = 0; -#if CONFIG_DUAL_FILTER - InterpFilter best_filter[4]; - av1_copy(best_filter, mbmi->interp_filter); -#else - InterpFilter best_filter = mbmi->interp_filter; -#endif // CONFIG_DUAL_FILTER + InterpFilters best_filters = mbmi->interp_filters; restore_dst_buf(xd, *tmp_dst); // EIGHTTAP_REGULAR mode is calculated beforehand for (i = 1; i < filter_set_size; ++i) { @@ -8178,14 +8674,12 @@ int64_t interpolation_filter_search( int tmp_rs; int64_t tmp_rd; #if CONFIG_DUAL_FILTER - mbmi->interp_filter[0] = filter_sets[i][0]; - mbmi->interp_filter[1] = filter_sets[i][1]; - mbmi->interp_filter[2] = filter_sets[i][0]; - mbmi->interp_filter[3] = filter_sets[i][1]; + mbmi->interp_filters = + av1_make_interp_filters(filter_sets[i][0], filter_sets[i][1]); #else - mbmi->interp_filter = (InterpFilter)i; + mbmi->interp_filters = av1_broadcast_interp_filter((InterpFilter)i); #endif // CONFIG_DUAL_FILTER - tmp_rs = av1_get_switchable_rate(cpi, xd); + tmp_rs = av1_get_switchable_rate(cm, x, xd); av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist, &tmp_skip_sb, &tmp_skip_sse); @@ -8193,12 +8687,8 @@ int64_t interpolation_filter_search( if (tmp_rd < *rd) { *rd = tmp_rd; - *switchable_rate = av1_get_switchable_rate(cpi, xd); -#if CONFIG_DUAL_FILTER - av1_copy(best_filter, mbmi->interp_filter); -#else - best_filter = mbmi->interp_filter; -#endif // 
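For readers following interpolation_filter_search() through this hunk: each candidate filter (pair) is installed in mbmi->interp_filters, a predictor is rebuilt, its rate and distortion are modelled, and the cheapest candidate wins. A compact sketch of that argmin loop under an assumed cost callback (not aom API):

#include <stdint.h>

/* try_filter() stands in for: set the filters, rebuild the prediction,
 * model rate + distortion, and return the combined RD cost. */
typedef int64_t (*try_filter_fn)(int filter_idx, void *ctx);

static int pick_best_filter(int num_filters, try_filter_fn try_filter,
                            void *ctx) {
  int best = 0; /* index 0 (EIGHTTAP_REGULAR) is evaluated beforehand */
  int64_t best_rd = try_filter(0, ctx);
  for (int i = 1; i < num_filters; ++i) {
    const int64_t rd = try_filter(i, ctx);
    if (rd < best_rd) {
      best_rd = rd;
      best = i;
    }
  }
  return best;
}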
CONFIG_DUAL_FILTER + *switchable_rate = av1_get_switchable_rate(cm, x, xd); + best_filters = mbmi->interp_filters; *skip_txfm_sb = tmp_skip_sb; *skip_sse_sb = tmp_skip_sse; best_in_temp = !best_in_temp; @@ -8214,24 +8704,29 @@ int64_t interpolation_filter_search( } else { restore_dst_buf(xd, *orig_dst); } -#if CONFIG_DUAL_FILTER - av1_copy(mbmi->interp_filter, best_filter); -#else - mbmi->interp_filter = best_filter; -#endif // CONFIG_DUAL_FILTER + mbmi->interp_filters = best_filters; } else { -#if CONFIG_DUAL_FILTER - for (i = 0; i < 4; ++i) - assert(mbmi->interp_filter[i] == EIGHTTAP_REGULAR); -#else - assert(mbmi->interp_filter == EIGHTTAP_REGULAR); -#endif // CONFIG_DUAL_FILTER + assert(mbmi->interp_filters == + av1_broadcast_interp_filter(EIGHTTAP_REGULAR)); } } return 0; } +#if CONFIG_DUAL_FILTER +static InterpFilters condition_interp_filters_on_mv( + InterpFilters interp_filters, const MACROBLOCKD *xd) { + InterpFilter filters[2]; + for (int i = 0; i < 2; ++i) + filters[i] = (has_subpel_mv_component(xd->mi[0], xd, i)) + ? av1_extract_interp_filter(interp_filters, i) + : EIGHTTAP_REGULAR; + + return av1_make_interp_filters(filters[0], filters[1]); +} +#endif + // TODO(afergs): Refactor the MBMI references in here - there's four // TODO(afergs): Refactor optional args - add them to a struct or remove static int64_t motion_mode_rd( @@ -8242,10 +8737,8 @@ static int64_t motion_mode_rd( const int *refs, int rate_mv, #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION // only used when WARPED_MOTION is on? - int_mv *const single_newmv, -#if CONFIG_EXT_INTER - int rate2_bmc_nocoeff, MB_MODE_INFO *best_bmc_mbmi, int rate_mv_bmc, -#endif // CONFIG_EXT_INTER + int_mv *const single_newmv, int rate2_bmc_nocoeff, + MB_MODE_INFO *best_bmc_mbmi, int rate_mv_bmc, #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION int rs, int *skip_txfm_sb, int64_t *skip_sse_sb, BUFFER_SET *orig_dst) { const AV1_COMMON *const cm = &cpi->common; @@ -8263,6 +8756,9 @@ static int64_t motion_mode_rd( (void)rate_mv; (void)is_comp_pred; (void)this_mode; +#if !CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR + (void)single_newmv; +#endif #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION MOTION_MODE motion_mode, last_motion_mode_allowed; @@ -8298,23 +8794,10 @@ static int64_t motion_mode_rd( #else mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref); #endif // WARPED_MOTION_SORT_SAMPLES -#if CONFIG_EXT_INTER best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; -#endif // CONFIG_EXT_INTER #endif // CONFIG_WARPED_MOTION #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION rate2_nocoeff = rd_stats->rate; -#if CONFIG_NCOBMC_ADAPT_WEIGHT - // We cannot estimate the rd cost for the motion mode NCOBMC_ADAPT_WEIGHT - // right now since it requires mvs from all neighboring blocks. We will - // check if this mode is beneficial after all the mv's in the current - // superblock are selected. - last_motion_mode_allowed = motion_mode_allowed_wrapper(1, -#if CONFIG_GLOBAL_MOTION - 0, xd->global_motion, -#endif // CONFIG_GLOBAL_MOTION - mi); -#else last_motion_mode_allowed = motion_mode_allowed( #if CONFIG_GLOBAL_MOTION 0, xd->global_motion, @@ -8323,7 +8806,6 @@ static int64_t motion_mode_rd( xd, #endif mi); -#endif // CONFIG_NCOBMC_ADAPT_WEIGHT base_mbmi = *mbmi; #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION @@ -8334,54 +8816,44 @@ static int64_t motion_mode_rd( int64_t tmp_rd = INT64_MAX; int tmp_rate; int64_t tmp_dist; -#if CONFIG_EXT_INTER int tmp_rate2 = motion_mode != SIMPLE_TRANSLATION ? 
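condition_interp_filters_on_mv(), added above, resets a direction's filter to EIGHTTAP_REGULAR whenever the MV has no subpel component in that direction: no interpolation happens there, so the cheapest-to-code filter loses nothing. Sketch of the per-component test (MVs are in 1/8-pel units; the real has_subpel_mv_component() additionally walks sub-8x8 blocks):

#include <stdint.h>

static int mv_component_has_subpel(int16_t comp_in_eighth_pel) {
  return (comp_in_eighth_pel & 7) != 0; /* low 3 bits = fractional part */
}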
rate2_bmc_nocoeff : rate2_nocoeff; -#else - int tmp_rate2 = rate2_nocoeff; -#endif // CONFIG_EXT_INTER + +#if CONFIG_NCOBMC_ADAPT_WEIGHT + // We cannot estimate the rd cost for the motion mode NCOBMC_ADAPT_WEIGHT + // right now since it requires mvs from all neighboring blocks. We will + // check if this mode is beneficial after all the mv's in the current + // superblock are selected. + if (motion_mode == NCOBMC_ADAPT_WEIGHT) continue; +#endif *mbmi = base_mbmi; mbmi->motion_mode = motion_mode; #if CONFIG_MOTION_VAR if (mbmi->motion_mode == OBMC_CAUSAL) { -#if CONFIG_EXT_INTER *mbmi = *best_bmc_mbmi; mbmi->motion_mode = OBMC_CAUSAL; -#endif // CONFIG_EXT_INTER if (!is_comp_pred && -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF !is_inter_singleref_comp_mode(this_mode) && -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF have_newmv_in_inter_mode(this_mode)) { int tmp_rate_mv = 0; - single_motion_search(cpi, x, bsize, mi_row, mi_col, -#if CONFIG_EXT_INTER - 0, -#endif // CONFIG_EXT_INTER - &tmp_rate_mv); + single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv); mbmi->mv[0].as_int = x->best_mv.as_int; if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv, refs[0])) { tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } -#if CONFIG_EXT_INTER tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv; -#else - tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; -#endif // CONFIG_EXT_INTER #if CONFIG_DUAL_FILTER - if (!has_subpel_mv_component(xd->mi[0], xd, 0)) - mbmi->interp_filter[0] = EIGHTTAP_REGULAR; - if (!has_subpel_mv_component(xd->mi[0], xd, 1)) - mbmi->interp_filter[1] = EIGHTTAP_REGULAR; + mbmi->interp_filters = + condition_interp_filters_on_mv(mbmi->interp_filters, xd); #endif // CONFIG_DUAL_FILTER av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); -#if CONFIG_EXT_INTER } else { av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); -#endif // CONFIG_EXT_INTER } av1_build_obmc_inter_prediction( cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride, @@ -8396,20 +8868,11 @@ static int64_t motion_mode_rd( #if WARPED_MOTION_SORT_SAMPLES int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; #endif // WARPED_MOTION_SORT_SAMPLES -#if CONFIG_EXT_INTER *mbmi = *best_bmc_mbmi; mbmi->motion_mode = WARPED_CAUSAL; -#endif // CONFIG_EXT_INTER mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE; -#if CONFIG_DUAL_FILTER - for (int dir = 0; dir < 4; ++dir) - mbmi->interp_filter[dir] = cm->interp_filter == SWITCHABLE - ? EIGHTTAP_REGULAR - : cm->interp_filter; -#else - mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? 
EIGHTTAP_REGULAR - : cm->interp_filter; -#endif // CONFIG_DUAL_FILTER + mbmi->interp_filters = av1_broadcast_interp_filter( + av1_unswitchable_filter(cm->interp_filter)); #if WARPED_MOTION_SORT_SAMPLES memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); @@ -8418,9 +8881,7 @@ static int64_t motion_mode_rd( if (mbmi->num_proj_ref[0] > 1) { mbmi->num_proj_ref[0] = sortSamples(pts_mv0, &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref[0]); -#if CONFIG_EXT_INTER best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; -#endif // CONFIG_EXT_INTER } #endif // WARPED_MOTION_SORT_SAMPLES @@ -8461,19 +8922,13 @@ static int64_t motion_mode_rd( refs[0])) { tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } -#if CONFIG_EXT_INTER #if WARPED_MOTION_SORT_SAMPLES best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0]; #endif // WARPED_MOTION_SORT_SAMPLES tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv; -#else - tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv; -#endif // CONFIG_EXT_INTER #if CONFIG_DUAL_FILTER - if (!has_subpel_mv_component(xd->mi[0], xd, 0)) - mbmi->interp_filter[0] = EIGHTTAP_REGULAR; - if (!has_subpel_mv_component(xd->mi[0], xd, 1)) - mbmi->interp_filter[1] = EIGHTTAP_REGULAR; + mbmi->interp_filters = + condition_interp_filters_on_mv(mbmi->interp_filters, xd); #endif // CONFIG_DUAL_FILTER } else { // Restore the old MV and WM parameters. @@ -8503,10 +8958,10 @@ static int64_t motion_mode_rd( #if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR if (last_motion_mode_allowed == WARPED_CAUSAL) #endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR - rd_stats->rate += cpi->motion_mode_cost[bsize][mbmi->motion_mode]; + rd_stats->rate += x->motion_mode_cost[bsize][mbmi->motion_mode]; #if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR else - rd_stats->rate += cpi->motion_mode_cost1[bsize][mbmi->motion_mode]; + rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode]; #endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR } #if CONFIG_WARPED_MOTION @@ -8629,25 +9084,11 @@ static int64_t motion_mode_rd( } #if CONFIG_GLOBAL_MOTION - if (this_mode == ZEROMV -#if CONFIG_EXT_INTER - || this_mode == ZERO_ZEROMV -#endif // CONFIG_EXT_INTER - ) { + if (this_mode == ZEROMV || this_mode == ZERO_ZEROMV) { if (is_nontrans_global_motion(xd)) { rd_stats->rate -= rs; -#if CONFIG_DUAL_FILTER - mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE - ? EIGHTTAP_REGULAR - : cm->interp_filter; - mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE - ? EIGHTTAP_REGULAR - : cm->interp_filter; -#else - mbmi->interp_filter = cm->interp_filter == SWITCHABLE - ? 
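av1_unswitchable_filter(), introduced here, folds the ternary visible in the removed lines into a single helper. An equivalent standalone sketch with illustrative enum values:

enum { EIGHTTAP_REGULAR_SK = 0, SWITCHABLE_SK = 4 }; /* values illustrative */

static int unswitchable_filter_sk(int frame_filter) {
  /* SWITCHABLE means per-block signalling; where that is not available,
   * fall back to the regular 8-tap filter. */
  return frame_filter == SWITCHABLE_SK ? EIGHTTAP_REGULAR_SK : frame_filter;
}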
EIGHTTAP_REGULAR - : cm->interp_filter; -#endif // CONFIG_DUAL_FILTER + mbmi->interp_filters = av1_broadcast_interp_filter( + av1_unswitchable_filter(cm->interp_filter)); } } #endif // CONFIG_GLOBAL_MOTION @@ -8697,48 +9138,43 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF int_mv (*mode_comp_mv)[TOTAL_REFS_PER_FRAME], -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF int mi_row, int mi_col, HandleInterModeArgs *args, const int64_t ref_best_rd) { const AV1_COMMON *cm = &cpi->common; - (void)cm; MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *mi = xd->mi[0]; MB_MODE_INFO *mbmi = &mi->mbmi; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const int is_comp_pred = has_second_ref(mbmi); const int this_mode = mbmi->mode; -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF const int is_singleref_comp_mode = is_inter_singleref_comp_mode(this_mode); -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF int_mv *frame_mv = mode_mv[this_mode]; -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF // The comp mv for the compound mode in single ref int_mv *frame_comp_mv = mode_comp_mv[this_mode]; -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF int i; int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; int_mv cur_mv[2]; int rate_mv = 0; -#if CONFIG_EXT_INTER int pred_exists = 1; #if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT || CONFIG_INTERINTRA const int bw = block_size_wide[bsize]; -#endif // ONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT +#endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT int_mv single_newmv[TOTAL_REFS_PER_FRAME]; #if CONFIG_INTERINTRA - const unsigned int *const interintra_mode_cost = - cpi->interintra_mode_cost[size_group_lookup[bsize]]; + const int *const interintra_mode_cost = + x->interintra_mode_cost[size_group_lookup[bsize]]; #endif // CONFIG_INTERINTRA const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME); uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); -#else - int_mv *const single_newmv = args->single_newmv; -#endif // CONFIG_EXT_INTER #if CONFIG_HIGHBITDEPTH DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]); #else @@ -8747,11 +9183,9 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, uint8_t *tmp_buf; #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION -#if CONFIG_EXT_INTER int rate2_bmc_nocoeff; MB_MODE_INFO best_bmc_mbmi; int rate_mv_bmc; -#endif // CONFIG_EXT_INTER #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION int64_t rd = INT64_MAX; BUFFER_SET orig_dst, tmp_dst; @@ -8766,7 +9200,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, mbmi->ncobmc_mode[1] = NO_OVERLAP; #endif -#if CONFIG_EXT_INTER #if CONFIG_INTERINTRA int compmode_interintra_cost = 0; mbmi->use_wedge_interintra = 0; @@ -8775,6 +9208,9 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int compmode_interinter_cost = 0; mbmi->interinter_compound_type = COMPOUND_AVERAGE; #endif +#if CONFIG_LGT_FROM_PRED + mbmi->use_lgt = 0; +#endif #if CONFIG_INTERINTRA if (!cm->allow_interintra_compound && is_comp_interintra_pred) @@ -8785,9 +9221,7 @@ static int64_t 
handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, assert(!is_comp_interintra_pred || (!is_comp_pred)); // is_comp_interintra_pred implies is_interintra_allowed(mbmi->sb_type) assert(!is_comp_interintra_pred || is_interintra_allowed(mbmi)); -#endif // CONFIG_EXT_INTER -#if CONFIG_EXT_INTER #if CONFIG_COMPOUND_SINGLEREF if (is_comp_pred || is_singleref_comp_mode) #else // !CONFIG_COMPOUND_SINGLEREF @@ -8795,7 +9229,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, #endif // CONFIG_COMPOUND_SINGLEREF mode_ctx = mbmi_ext->compound_mode_context[refs[0]]; else -#endif // CONFIG_EXT_INTER mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame, bsize, -1); @@ -8818,21 +9251,21 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, if (frame_mv[refs[0]].as_int == INVALID_MV || frame_mv[refs[1]].as_int == INVALID_MV) return INT64_MAX; -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF } else if (is_singleref_comp_mode) { if (frame_mv[refs[0]].as_int == INVALID_MV || frame_comp_mv[refs[0]].as_int == INVALID_MV) return INT64_MAX; -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF } mbmi->motion_mode = SIMPLE_TRANSLATION; if (have_newmv_in_inter_mode(this_mode)) { const int64_t ret_val = handle_newmv(cpi, x, bsize, mode_mv, -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF mode_comp_mv, -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF mi_row, mi_col, &rate_mv, single_newmv, args); if (ret_val != 0) return ret_val; @@ -8847,7 +9280,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, mbmi->mv[i].as_int = cur_mv[i].as_int; } -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF if (!is_comp_pred && is_singleref_comp_mode) { cur_mv[1] = frame_comp_mv[refs[0]]; // Clip "next_nearest" so that it does not extend to far out of image @@ -8855,17 +9288,9 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX; mbmi->mv[1].as_int = cur_mv[1].as_int; } -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_EXT_INTER - if (this_mode == NEAREST_NEARESTMV) -#else - if (this_mode == NEARESTMV && is_comp_pred) -#endif // CONFIG_EXT_INTER - { -#if !CONFIG_EXT_INTER - uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); -#endif // !CONFIG_EXT_INTER + if (this_mode == NEAREST_NEARESTMV) { if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) { cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv; @@ -8878,7 +9303,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } } -#if CONFIG_EXT_INTER if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) { #if CONFIG_COMPOUND_SINGLEREF if (this_mode == NEAREST_NEWMV || // this_mode == SR_NEAREST_NEWMV || @@ -8889,7 +9313,12 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, { cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; +#if CONFIG_AMVR + lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv, + cm->cur_frame_mv_precision_level); +#else lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv); +#endif clamp_mv2(&cur_mv[0].as_mv, xd); if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return 
INT64_MAX; mbmi->mv[0].as_int = cur_mv[0].as_int; @@ -8898,7 +9327,12 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, if (this_mode == NEW_NEARESTMV) { cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv; +#if CONFIG_AMVR + lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv, + cm->cur_frame_mv_precision_level); +#else lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv); +#endif clamp_mv2(&cur_mv[1].as_mv, xd); if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX; mbmi->mv[1].as_int = cur_mv[1].as_int; @@ -8914,7 +9348,12 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, this_mode == NEAR_NEARMV) { cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; +#if CONFIG_AMVR + lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv, + cm->cur_frame_mv_precision_level); +#else lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv); +#endif clamp_mv2(&cur_mv[0].as_mv, xd); if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX; mbmi->mv[0].as_int = cur_mv[0].as_int; @@ -8932,28 +9371,17 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, #endif // CONFIG_COMPOUND_SINGLEREF cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; +#if CONFIG_AMVR + lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv, + cm->cur_frame_mv_precision_level); +#else lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv); +#endif clamp_mv2(&cur_mv[1].as_mv, xd); if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX; mbmi->mv[1].as_int = cur_mv[1].as_int; } } -#else // !CONFIG_EXT_INTER - if (this_mode == NEARMV && is_comp_pred) { - uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); - if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { - int ref_mv_idx = mbmi->ref_mv_idx + 1; - cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; - cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; - - for (i = 0; i < 2; ++i) { - clamp_mv2(&cur_mv[i].as_mv, xd); - if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX; - mbmi->mv[i].as_int = cur_mv[i].as_int; - } - } - } -#endif // CONFIG_EXT_INTER // do first prediction into the destination buffer. Do the next // prediction into a temporary buffer. Then keep track of which one @@ -8978,26 +9406,15 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, // initiation of a motion field. if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv, refs[0])) { -#if CONFIG_EXT_INTER - rd_stats->rate += - AOMMIN(cost_mv_ref(cpi, this_mode, mode_ctx), - cost_mv_ref(cpi, is_comp_pred ? NEAREST_NEARESTMV : NEARESTMV, - mode_ctx)); -#else - rd_stats->rate += AOMMIN(cost_mv_ref(cpi, this_mode, mode_ctx), - cost_mv_ref(cpi, NEARESTMV, mode_ctx)); -#endif // CONFIG_EXT_INTER + rd_stats->rate += AOMMIN( + cost_mv_ref(x, this_mode, mode_ctx), + cost_mv_ref(x, is_comp_pred ? 
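The repeated CONFIG_AMVR hunks in this region thread a precision level into lower_mv_precision(). A hedged per-component sketch of the rounding this plausibly performs (the integer-pel branch and its tie-breaking are assumptions; the low-bit branch mirrors the pre-existing non-high-precision behaviour):

#include <stdint.h>

static void lower_mv_component(int16_t *c, int allow_high_precision,
                               int integer_mv_level) {
  if (integer_mv_level) {
    *c = (int16_t)((*c / 8) * 8);      /* force whole-pel, 1/8-pel units */
  } else if (!allow_high_precision && (*c & 1)) {
    *c += (*c > 0) ? -1 : 1;           /* drop the 1/8-pel bit */
  }
}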
NEAREST_NEARESTMV : NEARESTMV, mode_ctx)); } else { - rd_stats->rate += cost_mv_ref(cpi, this_mode, mode_ctx); + rd_stats->rate += cost_mv_ref(x, this_mode, mode_ctx); } if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && -#if CONFIG_EXT_INTER - mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV -#else - mbmi->mode != NEARESTMV -#endif // CONFIG_EXT_INTER - ) + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) return INT64_MAX; int64_t ret_val = interpolation_filter_search( @@ -9005,7 +9422,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, &rd, &rs, &skip_txfm_sb, &skip_sse_sb); if (ret_val != 0) return ret_val; -#if CONFIG_EXT_INTER #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION best_bmc_mbmi = *mbmi; rate2_bmc_nocoeff = rd_stats->rate; @@ -9028,7 +9444,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int best_tmp_rate_mv = rate_mv; int tmp_skip_txfm_sb; int64_t tmp_skip_sse_sb; - int compound_type_cost[COMPOUND_TYPES]; DECLARE_ALIGNED(16, uint8_t, pred0[2 * MAX_SB_SQUARE]); DECLARE_ALIGNED(16, uint8_t, pred1[2 * MAX_SB_SQUARE]); uint8_t *preds0[1] = { pred0 }; @@ -9040,6 +9455,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, masked_compound_used = masked_compound_used && cm->allow_masked_compound; #endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE COMPOUND_TYPE cur_type; + int best_compmode_interinter_cost = 0; best_mv[0].as_int = cur_mv[0].as_int; best_mv[1].as_int = cur_mv[1].as_int; @@ -9049,7 +9465,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, best_compound_data.seg_mask = tmp_mask_buf; #endif // CONFIG_COMPOUND_SEGMENT -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF // TODO(zoeliu): To further check whether the following setups are needed. // Single ref compound mode: Prepare the 2nd ref frame predictor the same as // the 1st one. @@ -9058,11 +9474,9 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[1] = xd->plane[i].pre[0]; } -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF if (masked_compound_used) { - av1_cost_tokens(compound_type_cost, cm->fc->compound_type_prob[bsize], - av1_compound_type_tree); // get inter predictors to use for masked compound modes av1_build_inter_predictors_for_planes_single_buf( xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides); @@ -9076,11 +9490,19 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, tmp_rate_mv = rate_mv; best_rd_cur = INT64_MAX; mbmi->interinter_compound_type = cur_type; + int masked_type_cost = 0; + if (masked_compound_used) { +#if CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT + if (!is_interinter_compound_used(COMPOUND_WEDGE, bsize)) + masked_type_cost += av1_cost_literal(1); + else +#endif // CONFIG_WEDGE && CONFIG_COMPOUND_SEGMENT + masked_type_cost += + x->compound_type_cost[bsize][mbmi->interinter_compound_type]; + } rs2 = av1_cost_literal(get_interinter_compound_type_bits( bsize, mbmi->interinter_compound_type)) + - (masked_compound_used - ? 
compound_type_cost[mbmi->interinter_compound_type] - : 0); + masked_type_cost; switch (cur_type) { case COMPOUND_AVERAGE: @@ -9130,6 +9552,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, #endif // CONFIG_COMPOUND_SEGMENT best_compound_data.interinter_compound_type = mbmi->interinter_compound_type; + best_compmode_interinter_cost = rs2; if (have_newmv_in_inter_mode(this_mode)) { if (use_masked_motion_search(cur_type)) { best_tmp_rate_mv = tmp_rate_mv; @@ -9174,12 +9597,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, pred_exists = 0; - compmode_interinter_cost = - av1_cost_literal(get_interinter_compound_type_bits( - bsize, mbmi->interinter_compound_type)) + - (masked_compound_used - ? compound_type_cost[mbmi->interinter_compound_type] - : 0); + compmode_interinter_cost = best_compmode_interinter_cost; } #endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT @@ -9216,7 +9634,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, for (j = 0; j < INTERINTRA_MODES; ++j) { mbmi->interintra_mode = (INTERINTRA_MODE)j; rmode = interintra_mode_cost[mbmi->interintra_mode]; - av1_build_intra_predictors_for_interintra(xd, bsize, 0, &orig_dst, + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, &orig_dst, intrapred, bw); av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, @@ -9229,7 +9647,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } mbmi->interintra_mode = best_interintra_mode; rmode = interintra_mode_cost[mbmi->interintra_mode]; - av1_build_intra_predictors_for_interintra(xd, bsize, 0, &orig_dst, + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, &orig_dst, intrapred, bw); av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); av1_subtract_plane(x, bsize, 0); @@ -9340,16 +9758,11 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, &tmp_dist, &skip_txfm_sb, &skip_sse_sb); rd = RDCOST(x->rdmult, rs + tmp_rate, tmp_dist); } -#endif // CONFIG_EXT_INTER if (!is_comp_pred) -#if CONFIG_DUAL_FILTER - args->single_filter[this_mode][refs[0]] = mbmi->interp_filter[0]; -#else - args->single_filter[this_mode][refs[0]] = mbmi->interp_filter; -#endif // CONFIG_DUAL_FILTER + args->single_filter[this_mode][refs[0]] = + av1_extract_interp_filter(mbmi->interp_filters, 0); -#if CONFIG_EXT_INTER if (args->modelled_rd != NULL) { if (is_comp_pred) { const int mode0 = compound_ref0_mode(this_mode); @@ -9364,7 +9777,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, args->modelled_rd[this_mode][refs[0]] = rd; } } -#endif // CONFIG_EXT_INTER if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { // if current pred_error modeled rd is substantially more than the best @@ -9375,7 +9787,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } } -#if CONFIG_EXT_INTER #if CONFIG_INTERINTRA rd_stats->rate += compmode_interintra_cost; #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION @@ -9385,18 +9796,14 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, #if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT rd_stats->rate += compmode_interinter_cost; #endif -#endif - ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, - disable_skip, mode_mv, mi_row, mi_col, args, - ref_best_rd, refs, rate_mv, + ret_val = motion_mode_rd( + cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip, 
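The reworked rs2 / masked_type_cost computation just above combines two rate sources: raw literal bits via av1_cost_literal() and an entropy-coded compound-type rate read from the new x->compound_type_cost tables. A standalone sketch of the underlying cost scale, assuming aom's convention of rates in 1/512-bit units (the shift value is an assumption):

#include <math.h>

#define PROB_COST_SHIFT_SK 9 /* assumed: one raw bit == 512 cost units */

static int cost_literal_bits(int nbits) {
  return nbits << PROB_COST_SHIFT_SK;
}

static int cost_symbol(double p) { /* symbol coded with probability p */
  return (int)lround(-log2(p) * (1 << PROB_COST_SHIFT_SK));
}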
mode_mv, + mi_row, mi_col, args, ref_best_rd, refs, rate_mv, #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - single_newmv, -#if CONFIG_EXT_INTER - rate2_bmc_nocoeff, &best_bmc_mbmi, rate_mv_bmc, -#endif // CONFIG_EXT_INTER + single_newmv, rate2_bmc_nocoeff, &best_bmc_mbmi, rate_mv_bmc, #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION - rs, &skip_txfm_sb, &skip_sse_sb, &orig_dst); + rs, &skip_txfm_sb, &skip_sse_sb, &orig_dst); if (ret_val != 0) return ret_val; return 0; // The rate-distortion cost will be re-calculated by caller. @@ -9407,11 +9814,10 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize, int64_t best_rd) { const AV1_COMMON *const cm = &cpi->common; - if (bsize < BLOCK_8X8 || !cm->allow_screen_content_tools) return INT64_MAX; + if (!av1_allow_intrabc(bsize, cm)) return INT64_MAX; MACROBLOCKD *const xd = &x->e_mbd; const TileInfo *tile = &xd->tile; - FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; MODE_INFO *const mi = xd->mi[0]; const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE); const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE); @@ -9425,11 +9831,8 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame], mbmi_ext->ref_mv_stack[ref_frame], -#if CONFIG_EXT_INTER - mbmi_ext->compound_mode_context, -#endif // CONFIG_EXT_INTER - candidates, mi_row, mi_col, NULL, NULL, - mbmi_ext->mode_context); + mbmi_ext->compound_mode_context, candidates, mi_row, mi_col, + NULL, NULL, mbmi_ext->mode_context); int_mv nearestmv, nearmv; av1_find_best_ref_mvs(0, candidates, &nearestmv, &nearmv); @@ -9495,9 +9898,16 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, mvp_full.row >>= 3; int sadpb = x->sadperbit16; int cost_list[5]; +#if CONFIG_HASH_ME + int bestsme = av1_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, sadpb, + cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1, + (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1); +#else int bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1); +#endif x->mv_limits = tmp_mv_limits; if (bestsme == INT_MAX) continue; @@ -9506,18 +9916,12 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, if (mv_check_bounds(&x->mv_limits, &dv)) continue; if (!is_dv_valid(dv, tile, mi_row, mi_col, bsize)) continue; -#if CONFIG_PALETTE memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info)); -#endif mbmi->use_intrabc = 1; mbmi->mode = DC_PRED; mbmi->uv_mode = UV_DC_PRED; mbmi->mv[0].as_mv = dv; -#if CONFIG_DUAL_FILTER - for (int idx = 0; idx < 4; ++idx) mbmi->interp_filter[idx] = BILINEAR; -#else - mbmi->interp_filter = BILINEAR; -#endif + mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR); mbmi->skip = 0; x->skip = 0; av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); @@ -9527,8 +9931,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, // in MV_COST_WEIGHT is too large. Explore other values. 
int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT_SUB); - const int rate_mode = av1_cost_bit(ec_ctx->intrabc_prob, 1); - + const int rate_mode = x->intrabc_cost[1]; RD_STATS rd_stats, rd_stats_uv; av1_subtract_plane(x, bsize, 0); super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX); @@ -9605,6 +10008,9 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, mbmi->use_intrabc = 0; mbmi->mv[0].as_int = 0; #endif // CONFIG_INTRABC +#if CONFIG_LGT_FROM_PRED + mbmi->use_lgt = 0; +#endif const int64_t intra_yrd = (bsize >= BLOCK_8X8 || unify_bsize) @@ -9615,25 +10021,23 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, if (intra_yrd < best_rd) { #if CONFIG_CFL - // Perform one extra txfm_rd_in_plane() call, this time with the best value - // so we can store reconstructed luma values - RD_STATS this_rd_stats; - #if CONFIG_CB4X4 - // Don't store the luma value if no chroma is associated. - // Don't worry, we will store this reconstructed luma in the following - // encode dry-run the chroma plane will never know. - x->cfl_store_y = !x->skip_chroma_rd; + // Only store reconstructed luma when there's chroma RDO. When there's no + // chroma RDO, the reconstructed luma will be stored in encode_superblock(). + xd->cfl->store_y = !x->skip_chroma_rd; #else - x->cfl_store_y = 1; -#endif - - txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y, - mbmi->sb_type, mbmi->tx_size, - cpi->sf.use_fast_coef_costing); - - x->cfl_store_y = 0; -#endif + xd->cfl->store_y = 1; +#endif // CONFIG_CB4X4 + if (xd->cfl->store_y) { + // Perform one extra call to txfm_rd_in_plane(), with the values chosen + // during luma RDO, so we can store reconstructed luma values + RD_STATS this_rd_stats; + txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, AOM_PLANE_Y, + mbmi->sb_type, mbmi->tx_size, + cpi->sf.use_fast_coef_costing); + xd->cfl->store_y = 0; + } +#endif // CONFIG_CFL max_uv_tx_size = uv_txsize_lookup[bsize][mbmi->tx_size][pd[1].subsampling_x] [pd[1].subsampling_y]; init_sbuv_mode(mbmi); @@ -9646,7 +10050,7 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, &uv_skip, AOMMAX(BLOCK_8X8, bsize), max_uv_tx_size); #endif // CONFIG_CB4X4 - if (y_skip && uv_skip) { + if (y_skip && (uv_skip || x->skip_chroma_rd)) { rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + av1_cost_bit(av1_get_skip_prob(cm, xd), 1); rd_cost->dist = dist_y + dist_uv; @@ -9656,9 +10060,6 @@ void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, rd_cost->dist = dist_y + dist_uv; } rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - rd_cost->dist_y = dist_y; -#endif } else { rd_cost->rate = INT_MAX; } @@ -9747,12 +10148,12 @@ int av1_active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) { av1_active_v_edge(cpi, mi_col, cpi->common.mib_size); } -#if CONFIG_PALETTE static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const BLOCK_SIZE bsize = mbmi->sb_type; + assert(bsize >= BLOCK_8X8); int src_stride = x->plane[1].src.stride; const uint8_t *const src_u = x->plane[1].src.buf; const uint8_t *const src_v = x->plane[2].src.buf; @@ -9796,24 +10197,20 @@ static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) { extend_palette_color_map(color_map, cols, rows, plane_block_width, 
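The rate_mode change above (av1_cost_bit(ec_ctx->intrabc_prob, 1) becoming x->intrabc_cost[1]) is part of this commit's broader move of precomputed rate tables onto MACROBLOCK; the cpi->...cost reads elsewhere in the diff become x->...cost for the same reason. A sketch of filling such a binary table on the 1/512-bit scale (the caching pattern, not aom's exact initializer):

#include <math.h>

static void fill_binary_cost_table(int cost[2], double p0 /* P(bit==0) */) {
  cost[0] = (int)lround(-log2(p0) * 512.0);
  cost[1] = (int)lround(-log2(1.0 - p0) * 512.0);
}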
plane_block_height); } -#endif // CONFIG_PALETTE #if CONFIG_FILTER_INTRA static void pick_filter_intra_interframe( - const AV1_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, - BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_uv_intra, - int *rate_uv_tokenonly, int64_t *dist_uv, int *skip_uv, - UV_PREDICTION_MODE *mode_uv, + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, + int mi_col, int *rate_uv_intra, int *rate_uv_tokenonly, int64_t *dist_uv, + int *skip_uv, UV_PREDICTION_MODE *mode_uv, FILTER_INTRA_MODE_INFO *filter_intra_mode_info_uv, #if CONFIG_EXT_INTRA int8_t *uv_angle_delta, #endif // CONFIG_EXT_INTRA -#if CONFIG_PALETTE - PALETTE_MODE_INFO *pmi_uv, int palette_ctx, -#endif // CONFIG_PALETTE - int skip_mask, unsigned int *ref_costs_single, int64_t *best_rd, - int64_t *best_intra_rd, PREDICTION_MODE *best_intra_mode, - int *best_mode_index, int *best_skip2, int *best_mode_skippable, + PALETTE_MODE_INFO *pmi_uv, int palette_ctx, int skip_mask, + unsigned int *ref_costs_single, int64_t *best_rd, int64_t *best_intra_rd, + PREDICTION_MODE *best_intra_mode, int *best_mode_index, int *best_skip2, + int *best_mode_skippable, #if CONFIG_SUPERTX int *returnrate_nocoef, #endif // CONFIG_SUPERTX @@ -9821,12 +10218,12 @@ static void pick_filter_intra_interframe( const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; -#if CONFIG_PALETTE PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; -#endif // CONFIG_PALETTE + const int try_palette = + av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type); int rate2 = 0, rate_y = INT_MAX, skippable = 0, rate_uv, rate_dummy, i; int dc_mode_index; - const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]]; + const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]]; int64_t distortion2 = 0, distortion_y = 0, this_rd = *best_rd; int64_t distortion_uv, model_rd = INT64_MAX; TX_SIZE uv_tx; @@ -9854,12 +10251,10 @@ static void pick_filter_intra_interframe( uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x] [xd->plane[1].subsampling_y]; if (rate_uv_intra[uv_tx] == INT_MAX) { - choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx], + choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]); -#if CONFIG_PALETTE if (cm->allow_screen_content_tools) pmi_uv[uv_tx] = *pmi; -#endif // CONFIG_PALETTE filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info; #if CONFIG_EXT_INTRA uv_angle_delta[uv_tx] = mbmi->angle_delta[1]; @@ -9870,14 +10265,12 @@ static void pick_filter_intra_interframe( distortion_uv = dist_uv[uv_tx]; skippable = skippable && skip_uv[uv_tx]; mbmi->uv_mode = mode_uv[uv_tx]; -#if CONFIG_PALETTE if (cm->allow_screen_content_tools) { pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1]; memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE, 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); } -#endif // CONFIG_PALETTE #if CONFIG_EXT_INTRA mbmi->angle_delta[1] = uv_angle_delta[uv_tx]; #endif // CONFIG_EXT_INTRA @@ -9889,13 +10282,10 @@ static void pick_filter_intra_interframe( } rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv + - cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode]; -#if CONFIG_PALETTE - if (cpi->common.allow_screen_content_tools && mbmi->mode == DC_PRED && - bsize >= BLOCK_8X8) + 
x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode]; + if (try_palette && mbmi->mode == DC_PRED) rate2 += av1_cost_bit( av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0); -#endif // CONFIG_PALETTE if (!xd->lossless[mbmi->segment_id]) { // super_block_yrd above includes the cost of the tx_size in the @@ -9910,7 +10300,7 @@ static void pick_filter_intra_interframe( rate2 += write_uniform_cost( FILTER_INTRA_MODES, mbmi->filter_intra_mode_info.filter_intra_mode[0]); #if CONFIG_EXT_INTRA - if (av1_is_directional_mode(mbmi->uv_mode, bsize) && + if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize) && av1_use_angle_delta(bsize)) { rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, MAX_ANGLE_DELTA + mbmi->angle_delta[1]); @@ -9992,11 +10382,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, const SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; -#if CONFIG_PALETTE const int try_palette = - cpi->common.allow_screen_content_tools && bsize >= BLOCK_8X8; + av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type); PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; -#endif // CONFIG_PALETTE MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const struct segmentation *const seg = &cm->seg; PREDICTION_MODE this_mode; @@ -10004,15 +10392,13 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, unsigned char segment_id = mbmi->segment_id; int comp_pred, i, k; int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF int_mv frame_comp_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]; int_mv single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } }; -#if CONFIG_EXT_INTER int single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 }; int64_t modelled_rd[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME]; -#endif // CONFIG_EXT_INTER static const int flag_list[TOTAL_REFS_PER_FRAME] = { 0, AOM_LAST_FLAG, @@ -10023,6 +10409,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, AOM_GOLD_FLAG, #if CONFIG_EXT_REFS AOM_BWD_FLAG, + AOM_ALT2_FLAG, #endif // CONFIG_EXT_REFS AOM_ALT_FLAG }; @@ -10049,9 +10436,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, int64_t dist_uvs[TX_SIZES_ALL]; int skip_uvs[TX_SIZES_ALL]; UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL]; -#if CONFIG_PALETTE PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL]; -#endif // CONFIG_PALETTE #if CONFIG_EXT_INTRA int8_t uv_angle_delta[TX_SIZES_ALL]; int is_directional_mode, angle_stats_ready = 0; @@ -10063,14 +10448,14 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif // CONFIG_FILTER_INTRA const int intra_cost_penalty = av1_get_intra_cost_penalty( cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); - const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]]; + const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]]; int best_skip2 = 0; uint16_t ref_frame_skip_mask[2] = { 0 }; uint32_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 }; -#if CONFIG_EXT_INTER && CONFIG_INTERINTRA +#if CONFIG_INTERINTRA MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME; int64_t best_single_inter_rd = INT64_MAX; -#endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA +#endif // CONFIG_INTERINTRA int mode_skip_start = 
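The filter-intra and angle-delta rates above come from write_uniform_cost(), a near-uniform code over n symbols. A hedged reconstruction that reproduces its cost behaviour (raw-bit cost of 512 units assumed, as in the earlier sketch):

static int uniform_code_cost(int n, int v) {
  int l = 0;
  while ((1 << l) < n) ++l;  /* smallest l with 2^l >= n */
  if (l == 0) return 0;
  /* the first (2^l - n) symbols take l-1 bits, the remainder take l bits */
  return (v < (1 << l) - n ? l - 1 : l) * 512;
}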
sf->mode_skip_start + 1; const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; @@ -10088,25 +10473,17 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, { NULL }, { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, #endif // CONFIG_MOTION_VAR -#if CONFIG_EXT_INTER NULL, NULL, NULL, -#else // CONFIG_EXT_INTER - NULL, -#endif // CONFIG_EXT_INTER { { 0 } }, }; -#if CONFIG_PALETTE || CONFIG_EXT_INTRA const int rows = block_size_high[bsize]; const int cols = block_size_wide[bsize]; -#endif // CONFIG_PALETTE || CONFIG_EXT_INTRA -#if CONFIG_PALETTE int palette_ctx = 0; const MODE_INFO *above_mi = xd->above_mi; const MODE_INFO *left_mi = xd->left_mi; -#endif // CONFIG_PALETTE #if CONFIG_MOTION_VAR int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; @@ -10141,7 +10518,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, av1_zero(best_mbmode); -#if CONFIG_PALETTE av1_zero(pmi_uv); if (try_palette) { if (above_mi) @@ -10149,7 +10525,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (left_mi) palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0); } -#endif // CONFIG_PALETTE estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, &comp_mode_p); @@ -10168,16 +10543,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, *returnrate_nocoef = INT_MAX; #endif // CONFIG_SUPERTX -#if CONFIG_SPEED_REFS - memset(x->mbmi_ext->ref_mvs, 0, sizeof(x->mbmi_ext->ref_mvs)); -#endif // CONFIG_SPEED_REFS - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; x->mbmi_ext->mode_context[ref_frame] = 0; -#if CONFIG_EXT_INTER x->mbmi_ext->compound_mode_context[ref_frame] = 0; -#endif // CONFIG_EXT_INTER if (cpi->ref_frame_flags & flag_list[ref_frame]) { assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, @@ -10188,12 +10557,16 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, frame_mv[ZEROMV][ref_frame].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame], cm->allow_high_precision_mv, bsize, mi_col, mi_row, - 0) + 0 +#if CONFIG_AMVR + , + cm->cur_frame_mv_precision_level +#endif + ) .as_int; #else // CONFIG_GLOBAL_MOTION frame_mv[ZEROMV][ref_frame].as_int = 0; #endif // CONFIG_GLOBAL_MOTION -#if CONFIG_EXT_INTER frame_mv[NEW_NEWMV][ref_frame].as_int = INVALID_MV; #if CONFIG_COMPOUND_SINGLEREF frame_mv[SR_NEW_NEWMV][ref_frame].as_int = INVALID_MV; @@ -10203,12 +10576,16 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, frame_mv[ZERO_ZEROMV][ref_frame].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame], cm->allow_high_precision_mv, bsize, mi_col, mi_row, - 0) + 0 +#if CONFIG_AMVR + , + cm->cur_frame_mv_precision_level +#endif + ) .as_int; #else // CONFIG_GLOBAL_MOTION frame_mv[ZERO_ZEROMV][ref_frame].as_int = 0; #endif // CONFIG_GLOBAL_MOTION -#endif // CONFIG_EXT_INTER } for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) { @@ -10217,11 +10594,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, x->mbmi_ext->mode_context[ref_frame] = 0; av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame], mbmi_ext->ref_mv_stack[ref_frame], -#if CONFIG_EXT_INTER - 
mbmi_ext->compound_mode_context, -#endif // CONFIG_EXT_INTER - candidates, mi_row, mi_col, NULL, NULL, - mbmi_ext->mode_context); + mbmi_ext->compound_mode_context, candidates, mi_row, + mi_col, NULL, NULL, mbmi_ext->mode_context); if (mbmi_ext->ref_mv_count[ref_frame] < 2) { MV_REFERENCE_FRAME rf[2]; av1_set_ref_frame(rf, ref_frame); @@ -10257,25 +10631,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { if (!(cpi->ref_frame_flags & flag_list[ref_frame])) { -// Skip checking missing references in both single and compound reference -// modes. Note that a mode will be skipped iff both reference frames -// are masked out. -#if CONFIG_EXT_COMP_REFS + // Skip checking missing references in both single and compound reference + // modes. Note that a mode will be skipped iff both reference frames + // are masked out. ref_frame_skip_mask[0] |= (1 << ref_frame); ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; -#else // !CONFIG_EXT_COMP_REFS -#if CONFIG_EXT_REFS - if (ref_frame == BWDREF_FRAME || ref_frame == ALTREF_FRAME) { - ref_frame_skip_mask[0] |= (1 << ref_frame); - ref_frame_skip_mask[1] |= ((1 << ref_frame) | 0x01); - } else { -#endif // CONFIG_EXT_REFS - ref_frame_skip_mask[0] |= (1 << ref_frame); - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; -#if CONFIG_EXT_REFS - } -#endif // CONFIG_EXT_REFS -#endif // CONFIG_EXT_COMP_REFS } else { for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { // Skip fixed mv modes for poor references @@ -10307,7 +10667,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, ref_frame_skip_mask[0] = (1 << LAST_FRAME) | #if CONFIG_EXT_REFS (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | - (1 << BWDREF_FRAME) | + (1 << BWDREF_FRAME) | (1 << ALTREF2_FRAME) | #endif // CONFIG_EXT_REFS (1 << GOLDEN_FRAME); ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK; @@ -10317,7 +10677,12 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #if CONFIG_GLOBAL_MOTION zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ALTREF_FRAME], cm->allow_high_precision_mv, bsize, - mi_col, mi_row, 0) + mi_col, mi_row, 0 +#if CONFIG_AMVR + , + cm->cur_frame_mv_precision_level +#endif + ) .as_int; #else zeromv.as_int = 0; @@ -10326,7 +10691,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV); if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int) mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV); -#if CONFIG_EXT_INTER if (frame_mv[NEAREST_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int) mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARESTMV); if (frame_mv[NEAR_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int) @@ -10337,7 +10701,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, zeromv.as_int) mode_skip_mask[ALTREF_FRAME] |= (1 << SR_NEAREST_NEARMV); #endif // CONFIG_COMPOUND_SINGLEREF -#endif // CONFIG_EXT_INTER } } @@ -10400,11 +10763,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #if CONFIG_PVQ od_encode_checkpoint(&x->daala_enc, &pre_buf); #endif // CONFIG_PVQ -#if CONFIG_EXT_INTER for (i = 0; i < MB_MODE_COUNT; ++i) for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame) modelled_rd[i][ref_frame] = INT64_MAX; -#endif // CONFIG_EXT_INTER for (midx = 0; midx < MAX_MODES; ++midx) { int mode_index; @@ -10414,10 +10775,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, 
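Per the simplified comment above, a mode is pruned only when both of its reference frames are masked out; mask[0] guards the first reference and mask[1] the second, with single-reference modes mapping to bit 0 of mask[1]. Sketch of that joint test (the exact call site lives later in the mode loop):

#include <stdint.h>

static int mode_refs_masked(const uint16_t mask[2], int ref0, int ref1) {
  const int second = ref1 < 0 ? 0 : ref1; /* single-ref modes use bit 0 */
  return ((mask[0] >> ref0) & 1) && ((mask[1] >> second) & 1);
}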
int compmode_cost = 0; int rate2 = 0, rate_y = 0, rate_uv = 0; int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - int64_t distortion2_y = 0; - int64_t total_sse_y = INT64_MAX; -#endif int skippable = 0; int this_skip2 = 0; int64_t total_sse = INT64_MAX; @@ -10431,7 +10788,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, second_ref_frame = av1_mode_order[mode_index].ref_frame[1]; mbmi->ref_mv_idx = 0; -#if CONFIG_EXT_INTER if (ref_frame > INTRA_FRAME && second_ref_frame == INTRA_FRAME) { // Mode must by compatible if (!is_interintra_allowed_mode(this_mode)) continue; @@ -10451,7 +10807,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, frame_mv[compound_ref1_mode(this_mode)][ref_frame].as_int; #endif // CONFIG_COMPOUND_SINGLEREF } -#endif // CONFIG_EXT_INTER // Look at the reference frame of the best mode so far and set the // skip mask to look at a subset of the remaining modes. @@ -10481,6 +10836,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, ref_frame_skip_mask[0] |= BWDREF_FRAME_MODE_MASK; ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; break; + case ALTREF2_FRAME: + ref_frame_skip_mask[0] |= ALTREF2_FRAME_MODE_MASK; + ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; + break; #endif // CONFIG_EXT_REFS case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALTREF_FRAME_MODE_MASK; #if CONFIG_EXT_REFS @@ -10537,7 +10896,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, // This is only used in motion vector unit test. if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue; -#if CONFIG_ONE_SIDED_COMPOUND // Changes LL bitstream +#if CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS // Changes LL bitstream #if CONFIG_EXT_REFS if (cpi->oxcf.pass == 0) { // Complexity-compression trade-offs @@ -10546,8 +10905,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (second_ref_frame == ALTREF_FRAME) continue; // if (second_ref_frame == BWDREF_FRAME) continue; } -#endif -#endif +#endif // CONFIG_EXT_REFS +#endif // CONFIG_ONE_SIDED_COMPOUND && !CONFIG_EXT_COMP_REFS comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; @@ -10601,12 +10960,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, } else { #endif // CONFIG_GLOBAL_MOTION const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, second_ref_frame }; - if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, -#if CONFIG_EXT_INTER - mbmi_ext->compound_mode_context, -#endif // CONFIG_EXT_INTER - frame_mv, this_mode, ref_frames, bsize, -1, - mi_row, mi_col)) + if (!check_best_zero_mv(cpi, x, mbmi_ext->mode_context, + mbmi_ext->compound_mode_context, frame_mv, + this_mode, ref_frames, bsize, -1, mi_row, mi_col)) continue; } @@ -10614,10 +10970,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = ref_frame; mbmi->ref_frame[1] = second_ref_frame; -#if CONFIG_PALETTE pmi->palette_size[0] = 0; pmi->palette_size[1] = 0; -#endif // CONFIG_PALETTE #if CONFIG_FILTER_INTRA mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; @@ -10639,18 +10993,18 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; } -#if CONFIG_EXT_INTER && 
CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF // Single ref compound mode if (!comp_pred && is_inter_singleref_comp_mode(mbmi->mode)) { xd->block_refs[1] = xd->block_refs[0]; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[1] = xd->plane[i].pre[0]; } -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF -#if CONFIG_EXT_INTER && CONFIG_INTERINTRA +#if CONFIG_INTERINTRA mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); -#endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA +#endif // CONFIG_INTERINTRA if (ref_frame == INTRA_FRAME) { RD_STATS rd_stats_y; @@ -10699,12 +11053,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][pd->subsampling_x] [pd->subsampling_y]; if (rate_uv_intra[uv_tx] == INT_MAX) { - choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx], + choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx], &skip_uvs[uv_tx], &mode_uv[uv_tx]); -#if CONFIG_PALETTE if (try_palette) pmi_uv[uv_tx] = *pmi; -#endif // CONFIG_PALETTE #if CONFIG_EXT_INTRA uv_angle_delta[uv_tx] = mbmi->angle_delta[1]; @@ -10718,14 +11070,12 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, distortion_uv = dist_uvs[uv_tx]; skippable = skippable && skip_uvs[uv_tx]; mbmi->uv_mode = mode_uv[uv_tx]; -#if CONFIG_PALETTE if (try_palette) { pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1]; memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE, 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); } -#endif // CONFIG_PALETTE #if CONFIG_EXT_INTRA mbmi->angle_delta[1] = uv_angle_delta[uv_tx]; @@ -10742,20 +11092,18 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #if CONFIG_CB4X4 rate2 = rate_y + intra_mode_cost[mbmi->mode]; if (!x->skip_chroma_rd) - rate2 += rate_uv + cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode]; + rate2 += rate_uv + x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode]; #else rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv + - cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode]; + x->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode]; #endif // CONFIG_CB4X4 -#if CONFIG_PALETTE if (try_palette && mbmi->mode == DC_PRED) { rate2 += av1_cost_bit( av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0); } -#endif // CONFIG_PALETTE - if (!xd->lossless[mbmi->segment_id] && bsize >= BLOCK_8X8) { + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) { // super_block_yrd above includes the cost of the tx_size in the // tokenonly rate, but for intra blocks, tx_size is always coded // (prediction granularity), so we account for it in the full rate, @@ -10769,14 +11117,14 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, const int p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP; if (av1_is_intra_filter_switchable(p_angle)) - rate2 += cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter]; + rate2 += x->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter]; #endif // CONFIG_INTRA_INTERP if (av1_use_angle_delta(bsize)) { rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, MAX_ANGLE_DELTA + mbmi->angle_delta[0]); } } - if (av1_is_directional_mode(mbmi->uv_mode, bsize) && + if (av1_is_directional_mode(get_uv_mode(mbmi->uv_mode), bsize) && av1_use_angle_delta(bsize)) { rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, 
MAX_ANGLE_DELTA + mbmi->angle_delta[1]); @@ -10806,19 +11154,15 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - if (bsize < BLOCK_8X8) distortion2_y = distortion_y; -#endif } else { int_mv backup_ref_mv[2]; -#if !SUB8X8_COMP_REF - if (bsize == BLOCK_4X4 && mbmi->ref_frame[1] > INTRA_FRAME) continue; -#endif // !SUB8X8_COMP_REF + if (!is_comp_ref_allowed(bsize) && mbmi->ref_frame[1] > INTRA_FRAME) + continue; backup_ref_mv[0] = mbmi_ext->ref_mvs[ref_frame][0]; if (comp_pred) backup_ref_mv[1] = mbmi_ext->ref_mvs[second_ref_frame][0]; -#if CONFIG_EXT_INTER && CONFIG_INTERINTRA +#if CONFIG_INTERINTRA if (second_ref_frame == INTRA_FRAME) { if (best_single_inter_ref != ref_frame) continue; mbmi->interintra_mode = intra_to_interintra_mode[best_intra_mode]; @@ -10836,11 +11180,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0; #endif // CONFIG_FILTER_INTRA } -#endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA +#endif // CONFIG_INTERINTRA mbmi->ref_mv_idx = 0; ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); -#if CONFIG_EXT_INTER if (comp_pred) { if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) { int ref_mv_idx = 0; @@ -10887,7 +11230,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, } #endif // CONFIG_COMPOUND_SINGLEREF } else { -#endif // CONFIG_EXT_INTER if (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1) { int ref; for (ref = 0; ref < 1 + comp_pred; ++ref) { @@ -10899,38 +11241,21 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv; } } -#if CONFIG_EXT_INTER } -#endif // CONFIG_EXT_INTER { RD_STATS rd_stats, rd_stats_y, rd_stats_uv; av1_init_rd_stats(&rd_stats); -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - // While av1 master uses rd_stats_y.rate through out the codebase, - // which is set when handle_inter_moden is called, the daala-dist code - // in rd_pick_partition() for cb4x4 and sub8x8 blocks need to know - // .dist_y which comes from rd_stats_y.dist and rd_stats_y.sse. - // The problem is rd_stats_y.dist and rd_stats_y.sse are sometimes not - // initialized when rd_stats.skip = 1, - // then instead rd_stats.dist and rd_stats.sse have the - // combined luma and chroma dist and sse. - // This can be seen inside motion_mode_rd(), which is called by - // handle_inter_mode(). 
- if (bsize < BLOCK_8X8) av1_init_rd_stats(&rd_stats_y); -#endif rd_stats.rate = rate2; // Point to variables that are maintained between loop iterations args.single_newmv = single_newmv; -#if CONFIG_EXT_INTER args.single_newmv_rate = single_newmv_rate; args.modelled_rd = modelled_rd; -#endif // CONFIG_EXT_INTER this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip, frame_mv, -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF frame_comp_mv, -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF mi_row, mi_col, &args, best_rd); rate2 = rd_stats.rate; @@ -10939,21 +11264,10 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, total_sse = rd_stats.sse; rate_y = rd_stats_y.rate; rate_uv = rd_stats_uv.rate; -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - if (bsize < BLOCK_8X8) { - if (rd_stats_y.rate != INT_MAX) { - assert(rd_stats_y.sse < INT64_MAX); - assert(rd_stats_y.dist < INT64_MAX); - } - total_sse_y = rd_stats_y.sse; - distortion2_y = rd_stats_y.dist; - } -#endif } // TODO(jingning): This needs some refactoring to improve code quality // and reduce redundant steps. -#if CONFIG_EXT_INTER #if CONFIG_COMPOUND_SINGLEREF if ((have_nearmv_in_inter_mode(mbmi->mode) && mbmi_ext->ref_mv_count[ref_frame_type] > 2) || @@ -10966,11 +11280,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) && mbmi_ext->ref_mv_count[ref_frame_type] > 1)) #endif // CONFIG_COMPOUND_SINGLEREF -#else // !CONFIG_EXT_INTER - if ((mbmi->mode == NEARMV && - mbmi_ext->ref_mv_count[ref_frame_type] > 2) || - (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1)) -#endif // CONFIG_EXT_INTER { int_mv backup_mv = frame_mv[NEARMV][ref_frame]; MB_MODE_INFO backup_mbmi = *mbmi; @@ -10978,12 +11287,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, int64_t tmp_ref_rd = this_rd; int ref_idx; -// TODO(jingning): This should be deprecated shortly. -#if CONFIG_EXT_INTER + // TODO(jingning): This should be deprecated shortly. int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; -#else - int idx_offset = (mbmi->mode == NEARMV) ? 1 : 0; -#endif // CONFIG_EXT_INTER int ref_set = AOMMIN(2, mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset); @@ -10994,7 +11299,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, backup_fmv[0] = frame_mv[NEWMV][ref_frame]; if (comp_pred) backup_fmv[1] = frame_mv[NEWMV][second_ref_frame]; - rate2 += (rate2 < INT_MAX ? cpi->drl_mode_cost0[drl_ctx][0] : 0); + rate2 += (rate2 < INT_MAX ? 
x->drl_mode_cost0[drl_ctx][0] : 0); if (this_rd < INT64_MAX) { if (RDCOST(x->rdmult, rate_y + rate_uv, distortion2) < @@ -11003,10 +11308,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, x->rdmult, rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 0), distortion2); else - tmp_ref_rd = RDCOST( - x->rdmult, rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - - rate_y - rate_uv, - total_sse); + tmp_ref_rd = + RDCOST(x->rdmult, + rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - + rate_y - rate_uv, + total_sse); } #if CONFIG_VAR_TX for (i = 0; i < MAX_MB_PLANE; ++i) @@ -11027,7 +11333,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, mbmi->ref_mv_idx = 1 + ref_idx; -#if CONFIG_EXT_INTER if (comp_pred) { int ref_mv_idx = mbmi->ref_mv_idx; // Special case: NEAR_NEWMV and NEW_NEARMV modes use @@ -11092,7 +11397,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, } #endif // CONFIG_COMPOUND_SINGLEREF } else { -#endif // CONFIG_EXT_INTER for (ref = 0; ref < 1 + comp_pred; ++ref) { int_mv this_mv = (ref == 0) @@ -11104,9 +11408,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, xd->n8_h << MI_SIZE_LOG2, xd); mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv; } -#if CONFIG_EXT_INTER } -#endif cur_mv = mbmi_ext->ref_mv_stack[ref_frame][mbmi->ref_mv_idx + idx_offset] @@ -11115,39 +11417,25 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (!mv_check_bounds(&x->mv_limits, &cur_mv.as_mv)) { int_mv dummy_single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } }; -#if CONFIG_EXT_INTER int dummy_single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 }; -#endif // CONFIG_EXT_INTER frame_mv[NEARMV][ref_frame] = cur_mv; av1_init_rd_stats(&tmp_rd_stats); -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - // With the same reason as 'rd_stats_y' passed to above - // handle_inter_mode(), tmp_rd_stats_y.dist and - // tmp_rd_stats_y.sse are sometimes not initialized, esp. when - // tmp_rd_stats.skip = 1 and tmp_rd_stats.dist and .sse - // represent combined luma and chroma .dist and .sse, - // we should initialized tmp_rd_stats_y. - if (bsize < BLOCK_8X8) av1_init_rd_stats(&tmp_rd_stats_y); -#endif + // Point to variables that are not maintained between iterations args.single_newmv = dummy_single_newmv; -#if CONFIG_EXT_INTER args.single_newmv_rate = dummy_single_newmv_rate; args.modelled_rd = NULL; -#endif // CONFIG_EXT_INTER tmp_alt_rd = handle_inter_mode(cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, &tmp_rd_stats_uv, &dummy_disable_skip, frame_mv, -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF frame_comp_mv, -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF mi_row, mi_col, &args, best_rd); // Prevent pointers from escaping local scope args.single_newmv = NULL; -#if CONFIG_EXT_INTER args.single_newmv_rate = NULL; -#endif // CONFIG_EXT_INTER } for (i = 0; i < mbmi->ref_mv_idx; ++i) { @@ -11155,7 +11443,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, drl1_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], i + idx_offset); tmp_rd_stats.rate += - (tmp_rd_stats.rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][1] + (tmp_rd_stats.rate < INT_MAX ? 
x->drl_mode_cost0[drl1_ctx][1] : 0); } @@ -11166,7 +11454,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi->ref_mv_idx + idx_offset); tmp_rd_stats.rate += - (tmp_rd_stats.rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][0] + (tmp_rd_stats.rate < INT_MAX ? x->drl_mode_cost0[drl1_ctx][0] : 0); } @@ -11178,16 +11466,18 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (RDCOST(x->rdmult, tmp_rd_stats_y.rate + tmp_rd_stats_uv.rate, tmp_rd_stats.dist) < RDCOST(x->rdmult, 0, tmp_rd_stats.sse)) - tmp_alt_rd = RDCOST( - x->rdmult, tmp_rd_stats.rate + - av1_cost_bit(av1_get_skip_prob(cm, xd), 0), - tmp_rd_stats.dist); + tmp_alt_rd = + RDCOST(x->rdmult, + tmp_rd_stats.rate + + av1_cost_bit(av1_get_skip_prob(cm, xd), 0), + tmp_rd_stats.dist); else - tmp_alt_rd = RDCOST( - x->rdmult, tmp_rd_stats.rate + - av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - - tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate, - tmp_rd_stats.sse); + tmp_alt_rd = + RDCOST(x->rdmult, + tmp_rd_stats.rate + + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) - + tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate, + tmp_rd_stats.sse); #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION } @@ -11203,16 +11493,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, tmp_ref_rd = tmp_alt_rd; backup_mbmi = *mbmi; backup_skip = x->skip; -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - if (bsize < BLOCK_8X8) { - if (tmp_rd_stats_y.rate != INT_MAX) { - assert(tmp_rd_stats_y.sse < INT64_MAX); - assert(tmp_rd_stats_y.dist < INT64_MAX); - } - total_sse_y = tmp_rd_stats_y.sse; - distortion2_y = tmp_rd_stats_y.dist; - } -#endif #if CONFIG_VAR_TX for (i = 0; i < MAX_MB_PLANE; ++i) memcpy(x->blk_skip_drl[i], x->blk_skip[i], @@ -11238,12 +11518,8 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, if (this_rd == INT64_MAX) continue; -#if SUB8X8_COMP_REF - compmode_cost = av1_cost_bit(comp_mode_p, comp_pred); -#else - if (mbmi->sb_type != BLOCK_4X4) + if (is_comp_ref_allowed(mbmi->sb_type)) compmode_cost = av1_cost_bit(comp_mode_p, comp_pred); -#endif // SUB8X8_COMP_REF if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost; } @@ -11263,14 +11539,14 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, rate2 += ref_costs_single[ref_frame]; } -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF // Add the cost to signal single/comp mode in single ref. if (!comp_pred && cm->reference_mode != COMPOUND_REFERENCE) { aom_prob singleref_comp_mode_p = av1_get_inter_mode_prob(cm, xd); rate2 += av1_cost_bit(singleref_comp_mode_p, is_inter_singleref_comp_mode(mbmi->mode)); } -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION if (ref_frame == INTRA_FRAME) @@ -11299,12 +11575,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, this_skip2 = 1; rate_y = 0; rate_uv = 0; -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - if (bsize < BLOCK_8X8) { - assert(total_sse_y < INT64_MAX); - distortion2_y = total_sse_y; - } -#endif } } else { // Add in the cost of the no skip flag. 
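The skip-flag accounting repeated through these hunks follows one pattern: either code the residual and pay a skip=0 bit, or drop all coefficients, pay a skip=1 bit, and take the source SSE as the distortion. A minimal standalone sketch of that trade-off, with an illustrative Lagrangian scaling standing in for the real RDCOST macro:

    #include <stdint.h>

    // Illustrative stand-in for RDCOST(rdmult, rate, dist); the real macro
    // uses libaom's own fixed-point scaling.
    static int64_t rd_cost(int rdmult, int rate, int64_t dist) {
      return ((int64_t)rate * rdmult >> 9) + (dist << 4);
    }

    // rate2: full mode rate incl. coefficients; rate_yuv: the coefficient
    // part a skip would remove; rate_skip0/rate_skip1: skip-flag costs.
    static int64_t pick_skip(int rdmult, int rate2, int rate_yuv,
                             int rate_skip0, int rate_skip1,
                             int64_t dist, int64_t sse, int *skip) {
      const int64_t rd_coded = rd_cost(rdmult, rate2 + rate_skip0, dist);
      const int64_t rd_skipped =
          rd_cost(rdmult, rate2 - rate_yuv + rate_skip1, sse);
      *skip = rd_skipped < rd_coded;
      return *skip ? rd_skipped : rd_coded;
    }

This is the same comparison made when computing tmp_ref_rd and tmp_alt_rd above, and again in get_prediction_rd_cost further down.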
@@ -11324,25 +11594,19 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION } -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - if ((bsize < BLOCK_8X8) && (rate2 != INT_MAX)) { - assert(distortion2_y < INT64_MAX); - } -#endif - if (ref_frame == INTRA_FRAME) { // Keep record of best intra rd if (this_rd < best_intra_rd) { best_intra_rd = this_rd; best_intra_mode = mbmi->mode; } -#if CONFIG_EXT_INTER && CONFIG_INTERINTRA +#if CONFIG_INTERINTRA } else if (second_ref_frame == NONE_FRAME) { if (this_rd < best_single_inter_rd) { best_single_inter_rd = this_rd; best_single_inter_ref = mbmi->ref_frame[0]; } -#endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA +#endif // CONFIG_INTERINTRA } if (!disable_skip && ref_frame == INTRA_FRAME) { @@ -11388,12 +11652,11 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif mi); if (motion_allowed == WARPED_CAUSAL) - *returnrate_nocoef -= cpi->motion_mode_cost[bsize][mbmi->motion_mode]; + *returnrate_nocoef -= x->motion_mode_cost[bsize][mbmi->motion_mode]; else if (motion_allowed == OBMC_CAUSAL) - *returnrate_nocoef -= - cpi->motion_mode_cost1[bsize][mbmi->motion_mode]; + *returnrate_nocoef -= x->motion_mode_cost1[bsize][mbmi->motion_mode]; #else - *returnrate_nocoef -= cpi->motion_mode_cost[bsize][mbmi->motion_mode]; + *returnrate_nocoef -= x->motion_mode_cost[bsize][mbmi->motion_mode]; #endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION #endif // CONFIG_SUPERTX @@ -11406,12 +11669,6 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd), this_skip2 || skippable); best_rate_uv = rate_uv; -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - if (bsize < BLOCK_8X8) { - assert(distortion2_y < INT64_MAX); - rd_cost->dist_y = distortion2_y; - } -#endif #if CONFIG_VAR_TX for (i = 0; i < MAX_MB_PLANE; ++i) memcpy(ctx->blk_skip[i], x->blk_skip[i], @@ -11419,11 +11676,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #endif // CONFIG_VAR_TX } } -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - if ((bsize < BLOCK_8X8) && (rd_cost->rate != INT_MAX)) { - assert(rd_cost->dist_y < INT64_MAX); - } -#endif + /* keep record of best compound/single-only prediction */ if (!disable_skip && ref_frame != INTRA_FRAME) { int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; @@ -11475,14 +11728,14 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; } -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF // Single ref compound mode if (!has_second_ref(mbmi) && is_inter_singleref_comp_mode(mbmi->mode)) { xd->block_refs[1] = xd->block_refs[0]; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[1] = xd->plane[i].pre[0]; } -#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#endif // CONFIG_COMPOUND_SINGLEREF if (is_inter_mode(mbmi->mode)) { av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); @@ -11497,6 +11750,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, #if CONFIG_VAR_TX if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) { select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + assert(rd_stats_y.rate != INT_MAX); } else { int idx, idy; super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); @@ -11538,6 +11792,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP 
*cpi, TileDataEnc *tile_data, #endif // CONFIG_VAR_TX best_mbmode.tx_type = mbmi->tx_type; best_mbmode.tx_size = mbmi->tx_size; +#if CONFIG_LGT_FROM_PRED + best_mbmode.use_lgt = mbmi->use_lgt; +#endif #if CONFIG_VAR_TX for (idy = 0; idy < xd->n8_h; ++idy) for (idx = 0; idx < xd->n8_w; ++idx) @@ -11554,23 +11811,9 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist; rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); best_skip2 = skip_blk; -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - if (bsize < BLOCK_8X8) { - assert(rd_cost->rate != INT_MAX); - assert(rd_cost->dist_y < INT64_MAX); - rd_cost->dist_y = rd_stats_y.dist; - } -#endif } } -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - if ((bsize < BLOCK_8X8) && (rd_cost->rate != INT_MAX)) { - assert(rd_cost->dist_y < INT64_MAX); - } -#endif - -#if CONFIG_PALETTE // Only try palette mode when the best mode so far is an intra mode. if (try_palette && !is_inter_mode(best_mbmode.mode)) { int rate2 = 0; @@ -11603,7 +11846,7 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x] [xd->plane[1].subsampling_y]; if (rate_uv_intra[uv_tx] == INT_MAX) { - choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx], + choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx], &skip_uvs[uv_tx], &mode_uv[uv_tx]); pmi_uv[uv_tx] = *pmi; @@ -11666,28 +11909,21 @@ void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data, } } PALETTE_EXIT: -#endif // CONFIG_PALETTE #if CONFIG_FILTER_INTRA // TODO(huisu): filter-intra is turned off in lossless mode for now to // avoid a unit test failure - if (!xd->lossless[mbmi->segment_id] && -#if CONFIG_PALETTE - pmi->palette_size[0] == 0 && -#endif // CONFIG_PALETTE + if (!xd->lossless[mbmi->segment_id] && pmi->palette_size[0] == 0 && !dc_skipped && best_mode_index >= 0 && best_intra_rd < (best_rd + (best_rd >> 3))) { pick_filter_intra_interframe( - cpi, x, ctx, bsize, mi_row, mi_col, rate_uv_intra, rate_uv_tokenonly, + cpi, x, bsize, mi_row, mi_col, rate_uv_intra, rate_uv_tokenonly, dist_uvs, skip_uvs, mode_uv, filter_intra_mode_info_uv, #if CONFIG_EXT_INTRA uv_angle_delta, #endif // CONFIG_EXT_INTRA -#if CONFIG_PALETTE - pmi_uv, palette_ctx, -#endif // CONFIG_PALETTE - 0, ref_costs_single, &best_rd, &best_intra_rd, &best_intra_mode, - &best_mode_index, &best_skip2, &best_mode_skippable, + pmi_uv, palette_ctx, 0, ref_costs_single, &best_rd, &best_intra_rd, + &best_intra_mode, &best_mode_index, &best_skip2, &best_mode_skippable, #if CONFIG_SUPERTX returnrate_nocoef, #endif // CONFIG_SUPERTX @@ -11699,15 +11935,11 @@ PALETTE_EXIT: // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and // ZEROMV. Here, checks are added for those cases, and the mode decisions // are corrected. -#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF +#if CONFIG_COMPOUND_SINGLEREF // NOTE: For SR_NEW_NEWMV, no need to check as the two mvs from the same ref // are surely different from each other. 
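The correction this comment block introduces re-labels a NEWMV winner whose motion search landed exactly on one of the derived predictors, since NEARESTMV/NEARMV/ZEROMV signal the same vector at lower rate (the compound path does the same with NEAREST_NEARESTMV and ZERO_ZEROMV). A simplified single-reference sketch of the idea, using stand-in types rather than libaom's:

    #include <stdint.h>

    typedef union { int32_t as_int; } int_mv;   // stand-in for libaom's int_mv
    enum { NEARESTMV, NEARMV, ZEROMV, NEWMV };  // stand-in mode labels

    // Sketch only: map NEWMV back to a cheaper mode when the searched MV
    // coincides with a predictor candidate.
    static int corrected_mode(int mode, int_mv mv, int_mv nearestmv,
                              int_mv nearmv, int_mv zeromv) {
      if (mode != NEWMV) return mode;
      if (mv.as_int == nearestmv.as_int) return NEARESTMV;
      if (mv.as_int == nearmv.as_int) return NEARMV;
      if (mv.as_int == zeromv.as_int) return ZEROMV;
      return NEWMV;
    }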
-#endif // CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF - if (best_mbmode.mode == NEWMV -#if CONFIG_EXT_INTER - || best_mbmode.mode == NEW_NEWMV -#endif // CONFIG_EXT_INTER - ) { +#endif // CONFIG_COMPOUND_SINGLEREF + if (best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV) { const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0], best_mbmode.ref_frame[1] }; int comp_pred_mode = refs[1] > INTRA_FRAME; @@ -11716,14 +11948,25 @@ PALETTE_EXIT: #if CONFIG_GLOBAL_MOTION zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]], cm->allow_high_precision_mv, bsize, - mi_col, mi_row, 0) + mi_col, mi_row, 0 +#if CONFIG_AMVR + , + cm->cur_frame_mv_precision_level +#endif + ) .as_int; - zeromv[1].as_int = comp_pred_mode - ? gm_get_motion_vector(&cm->global_motion[refs[1]], - cm->allow_high_precision_mv, - bsize, mi_col, mi_row, 0) - .as_int - : 0; + zeromv[1].as_int = + comp_pred_mode + ? gm_get_motion_vector(&cm->global_motion[refs[1]], + cm->allow_high_precision_mv, bsize, mi_col, + mi_row, 0 +#if CONFIG_AMVR + , + cm->cur_frame_mv_precision_level +#endif + ) + .as_int + : 0; #else zeromv[0].as_int = 0; zeromv[1].as_int = 0; @@ -11749,7 +11992,6 @@ PALETTE_EXIT: int_mv nearestmv[2]; int_mv nearmv[2]; -#if CONFIG_EXT_INTER if (mbmi_ext->ref_mv_count[rf_type] > 1) { nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][1].this_mv; nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][1].comp_mv; @@ -11757,22 +11999,6 @@ PALETTE_EXIT: nearmv[0] = frame_mv[NEARMV][refs[0]]; nearmv[1] = frame_mv[NEARMV][refs[1]]; } -#else - int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) - ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2) - : INT_MAX; - - for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) { - nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv; - nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv; - - if (nearmv[0].as_int == best_mbmode.mv[0].as_int && - nearmv[1].as_int == best_mbmode.mv[1].as_int) { - best_mbmode.mode = NEARMV; - best_mbmode.ref_mv_idx = i; - } - } -#endif // CONFIG_EXT_INTER if (mbmi_ext->ref_mv_count[rf_type] >= 1) { nearestmv[0] = mbmi_ext->ref_mv_stack[rf_type][0].this_mv; nearestmv[1] = mbmi_ext->ref_mv_stack[rf_type][0].comp_mv; @@ -11782,9 +12008,7 @@ PALETTE_EXIT: } if (nearestmv[0].as_int == best_mbmode.mv[0].as_int && - nearestmv[1].as_int == best_mbmode.mv[1].as_int) -#if CONFIG_EXT_INTER - { + nearestmv[1].as_int == best_mbmode.mv[1].as_int) { best_mbmode.mode = NEAREST_NEARESTMV; } else { int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) @@ -11808,21 +12032,12 @@ PALETTE_EXIT: best_mbmode.mv[1].as_int == zeromv[1].as_int) best_mbmode.mode = ZERO_ZEROMV; } -#else - { - best_mbmode.mode = NEARESTMV; - } else if (best_mbmode.mv[0].as_int == zeromv[0].as_int && - best_mbmode.mv[1].as_int == zeromv[1].as_int) { - best_mbmode.mode = ZEROMV; - } -#endif // CONFIG_EXT_INTER } } // Make sure that the ref_mv_idx is only nonzero when we're // using a mode which can support ref_mv_idx if (best_mbmode.ref_mv_idx != 0 && -#if CONFIG_EXT_INTER #if CONFIG_COMPOUND_SINGLEREF !(best_mbmode.mode == NEWMV || best_mbmode.mode == SR_NEW_NEWMV || best_mbmode.mode == NEW_NEWMV || @@ -11831,45 +12046,31 @@ PALETTE_EXIT: !(best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV || have_nearmv_in_inter_mode(best_mbmode.mode))) #endif // CONFIG_COMPOUND_SINGLEREF -#else // !CONFIG_EXT_INTER - !(best_mbmode.mode == NEARMV || best_mbmode.mode == NEWMV)) -#endif // CONFIG_EXT_INTER { best_mbmode.ref_mv_idx = 0; } - { + if (best_mbmode.ref_frame[0] > INTRA_FRAME && + 
best_mbmode.ref_frame[1] <= INTRA_FRAME) { int8_t ref_frame_type = av1_ref_frame_type(best_mbmode.ref_frame); int16_t mode_ctx = mbmi_ext->mode_context[ref_frame_type]; if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) { - int_mv zeromv[2]; + int_mv zeromv; #if CONFIG_GLOBAL_MOTION - const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0], - best_mbmode.ref_frame[1] }; - zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]], - cm->allow_high_precision_mv, - bsize, mi_col, mi_row, 0) - .as_int; - zeromv[1].as_int = (refs[1] != NONE_FRAME) - ? gm_get_motion_vector(&cm->global_motion[refs[1]], - cm->allow_high_precision_mv, - bsize, mi_col, mi_row, 0) - .as_int - : 0; - lower_mv_precision(&zeromv[0].as_mv, cm->allow_high_precision_mv); - lower_mv_precision(&zeromv[1].as_mv, cm->allow_high_precision_mv); + const MV_REFERENCE_FRAME ref = best_mbmode.ref_frame[0]; + zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ref], + cm->allow_high_precision_mv, bsize, + mi_col, mi_row, 0 +#if CONFIG_AMVR + , + cm->cur_frame_mv_precision_level +#endif + ) + .as_int; #else - zeromv[0].as_int = zeromv[1].as_int = 0; + zeromv.as_int = 0; #endif // CONFIG_GLOBAL_MOTION - if (best_mbmode.ref_frame[0] > INTRA_FRAME && - best_mbmode.mv[0].as_int == zeromv[0].as_int && -#if CONFIG_EXT_INTER - (best_mbmode.ref_frame[1] <= INTRA_FRAME) -#else - (best_mbmode.ref_frame[1] == NONE_FRAME || - best_mbmode.mv[1].as_int == zeromv[1].as_int) -#endif // CONFIG_EXT_INTER - ) { + if (best_mbmode.mv[0].as_int == zeromv.as_int) { best_mbmode.mode = ZEROMV; } } @@ -11881,24 +12082,14 @@ PALETTE_EXIT: return; } -#if CONFIG_DUAL_FILTER assert((cm->interp_filter == SWITCHABLE) || - (cm->interp_filter == best_mbmode.interp_filter[0]) || + (cm->interp_filter == + av1_extract_interp_filter(best_mbmode.interp_filters, 0)) || !is_inter_block(&best_mbmode)); +#if CONFIG_DUAL_FILTER assert((cm->interp_filter == SWITCHABLE) || - (cm->interp_filter == best_mbmode.interp_filter[1]) || - !is_inter_block(&best_mbmode)); - if (best_mbmode.ref_frame[1] > INTRA_FRAME) { - assert((cm->interp_filter == SWITCHABLE) || - (cm->interp_filter == best_mbmode.interp_filter[2]) || - !is_inter_block(&best_mbmode)); - assert((cm->interp_filter == SWITCHABLE) || - (cm->interp_filter == best_mbmode.interp_filter[3]) || - !is_inter_block(&best_mbmode)); - } -#else - assert((cm->interp_filter == SWITCHABLE) || - (cm->interp_filter == best_mbmode.interp_filter) || + (cm->interp_filter == + av1_extract_interp_filter(best_mbmode.interp_filters, 1)) || !is_inter_block(&best_mbmode)); #endif // CONFIG_DUAL_FILTER @@ -11913,11 +12104,7 @@ PALETTE_EXIT: // Note: this section is needed since the mode may have been forced to // ZEROMV by the all-zero mode handling of ref-mv. #if CONFIG_GLOBAL_MOTION - if (mbmi->mode == ZEROMV -#if CONFIG_EXT_INTER - || mbmi->mode == ZERO_ZEROMV -#endif // CONFIG_EXT_INTER - ) { + if (mbmi->mode == ZEROMV || mbmi->mode == ZERO_ZEROMV) { #if CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR // Correct the motion mode for ZEROMV const MOTION_MODE last_motion_mode_allowed = @@ -11932,17 +12119,8 @@ PALETTE_EXIT: // Correct the interpolation filter for ZEROMV if (is_nontrans_global_motion(xd)) { -#if CONFIG_DUAL_FILTER - mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE - ? EIGHTTAP_REGULAR - : cm->interp_filter; - mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE - ? EIGHTTAP_REGULAR - : cm->interp_filter; -#else - mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? 
EIGHTTAP_REGULAR - : cm->interp_filter; -#endif // CONFIG_DUAL_FILTER + mbmi->interp_filters = av1_broadcast_interp_filter( + av1_unswitchable_filter(cm->interp_filter)); } } #endif // CONFIG_GLOBAL_MOTION @@ -11968,11 +12146,10 @@ PALETTE_EXIT: store_coding_context(x, ctx, best_mode_index, best_pred_diff, best_mode_skippable); -#if CONFIG_PALETTE - if (cm->allow_screen_content_tools && pmi->palette_size[1] > 0) { + if (pmi->palette_size[1] > 0) { + assert(try_palette); restore_uv_color_map(cpi, x); } -#endif // CONFIG_PALETTE } void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, @@ -12013,10 +12190,8 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); -#if CONFIG_PALETTE mbmi->palette_mode_info.palette_size[0] = 0; mbmi->palette_mode_info.palette_size[1] = 0; -#endif // CONFIG_PALETTE #if CONFIG_FILTER_INTRA mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0; @@ -12030,8 +12205,12 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, #if CONFIG_GLOBAL_MOTION mbmi->mv[0].as_int = gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]], - cm->allow_high_precision_mv, bsize, mi_col, mi_row, - 0) + cm->allow_high_precision_mv, bsize, mi_col, mi_row, 0 +#if CONFIG_AMVR + , + cm->cur_frame_mv_precision_level +#endif + ) .as_int; #else // CONFIG_GLOBAL_MOTION mbmi->mv[0].as_int = 0; @@ -12041,6 +12220,9 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, mbmi->ref_mv_idx = 0; mbmi->pred_mv[0].as_int = 0; +#if CONFIG_LGT_FROM_PRED + mbmi->use_lgt = 0; +#endif mbmi->motion_mode = SIMPLE_TRANSLATION; #if CONFIG_MOTION_VAR @@ -12074,31 +12256,18 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, int rs; int best_rs = INT_MAX; for (i = 0; i < SWITCHABLE_FILTERS; ++i) { -#if CONFIG_DUAL_FILTER - int k; - for (k = 0; k < 4; ++k) mbmi->interp_filter[k] = i; -#else - mbmi->interp_filter = i; -#endif // CONFIG_DUAL_FILTER - rs = av1_get_switchable_rate(cpi, xd); + mbmi->interp_filters = av1_broadcast_interp_filter(i); + rs = av1_get_switchable_rate(cm, x, xd); if (rs < best_rs) { best_rs = rs; -#if CONFIG_DUAL_FILTER - best_filter = mbmi->interp_filter[0]; -#else - best_filter = mbmi->interp_filter; -#endif // CONFIG_DUAL_FILTER + best_filter = av1_extract_interp_filter(mbmi->interp_filters, 0); } } } } -// Set the appropriate filter -#if CONFIG_DUAL_FILTER - for (i = 0; i < 4; ++i) mbmi->interp_filter[i] = best_filter; -#else - mbmi->interp_filter = best_filter; -#endif // CONFIG_DUAL_FILTER - rate2 += av1_get_switchable_rate(cpi, xd); + // Set the appropriate filter + mbmi->interp_filters = av1_broadcast_interp_filter(best_filter); + rate2 += av1_get_switchable_rate(cm, x, xd); if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += av1_cost_bit(comp_mode_p, comp_pred); @@ -12111,22 +12280,16 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, rd_cost->rate = rate2; rd_cost->dist = distortion2; rd_cost->rdcost = this_rd; -#if CONFIG_DIST_8X8 && CONFIG_CB4X4 - if (bsize < BLOCK_8X8) rd_cost->dist_y = distortion2; -#endif + if (this_rd >= best_rd_so_far) { rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; } -#if CONFIG_DUAL_FILTER assert((cm->interp_filter == SWITCHABLE) || - (cm->interp_filter == mbmi->interp_filter[0])); -#else - assert((cm->interp_filter == SWITCHABLE) || - (cm->interp_filter == mbmi->interp_filter)); -#endif // CONFIG_DUAL_FILTER + (cm->interp_filter == + av1_extract_interp_filter(mbmi->interp_filters, 0))); 
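These hunks belong to the same refactor visible throughout the diff: the per-reference interp_filter array is replaced by a single packed interp_filters word, written with av1_broadcast_interp_filter() and read back with av1_extract_interp_filter(filters, dir). A rough standalone sketch of such a packing; the exact bit layout is an assumption here, not taken from the diff:

    #include <stdint.h>

    typedef uint32_t InterpFilters;  // stand-in for the packed type

    // Put the same filter in both directions, as a broadcast would.
    static InterpFilters broadcast_filter(uint32_t f) {
      return (f << 16) | f;
    }

    // Recover the filter for one direction (dir: 0 or 1).
    static int extract_filter(InterpFilters fs, int dir) {
      return (int)((fs >> (dir ? 16 : 0)) & 0xffff);
    }

This is why the filter search in av1_rd_pick_inter_mode_sb_seg_skip above collapses to one assignment per candidate instead of the old four-element loop.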
av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact, cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV); @@ -12137,6 +12300,124 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, } #if CONFIG_MOTION_VAR + +struct calc_target_weighted_pred_ctxt { + const MACROBLOCK *x; + const uint8_t *tmp; + int tmp_stride; + int overlap; +}; + +static INLINE void calc_target_weighted_pred_above(MACROBLOCKD *xd, + int rel_mi_col, + uint8_t nb_mi_width, + MODE_INFO *nb_mi, + void *fun_ctxt) { + (void)nb_mi; + + struct calc_target_weighted_pred_ctxt *ctxt = + (struct calc_target_weighted_pred_ctxt *)fun_ctxt; + +#if CONFIG_HIGHBITDEPTH + const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; +#else + const int is_hbd = 0; +#endif // CONFIG_HIGHBITDEPTH + + const int bw = xd->n8_w << MI_SIZE_LOG2; + const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); + + int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE); + int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE); + const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE; + + if (!is_hbd) { + for (int row = 0; row < ctxt->overlap; ++row) { + const uint8_t m0 = mask1d[row]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + for (int col = 0; col < nb_mi_width * MI_SIZE; ++col) { + wsrc[col] = m1 * tmp[col]; + mask[col] = m0; + } + wsrc += bw; + mask += bw; + tmp += ctxt->tmp_stride; + } +#if CONFIG_HIGHBITDEPTH + } else { + const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp); + + for (int row = 0; row < ctxt->overlap; ++row) { + const uint8_t m0 = mask1d[row]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + for (int col = 0; col < nb_mi_width * MI_SIZE; ++col) { + wsrc[col] = m1 * tmp16[col]; + mask[col] = m0; + } + wsrc += bw; + mask += bw; + tmp16 += ctxt->tmp_stride; + } +#endif // CONFIG_HIGHBITDEPTH + } +} + +static INLINE void calc_target_weighted_pred_left(MACROBLOCKD *xd, + int rel_mi_row, + uint8_t nb_mi_height, + MODE_INFO *nb_mi, + void *fun_ctxt) { + (void)nb_mi; + + struct calc_target_weighted_pred_ctxt *ctxt = + (struct calc_target_weighted_pred_ctxt *)fun_ctxt; + +#if CONFIG_HIGHBITDEPTH + const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
1 : 0; +#else + const int is_hbd = 0; +#endif // CONFIG_HIGHBITDEPTH + + const int bw = xd->n8_w << MI_SIZE_LOG2; + const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); + + int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw); + int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw); + const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride); + + if (!is_hbd) { + for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) { + for (int col = 0; col < ctxt->overlap; ++col) { + const uint8_t m0 = mask1d[col]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 + + (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1; + mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0; + } + wsrc += bw; + mask += bw; + tmp += ctxt->tmp_stride; + } +#if CONFIG_HIGHBITDEPTH + } else { + const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp); + + for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) { + for (int col = 0; col < ctxt->overlap; ++col) { + const uint8_t m0 = mask1d[col]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 + + (tmp16[col] << AOM_BLEND_A64_ROUND_BITS) * m1; + mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0; + } + wsrc += bw; + mask += bw; + tmp16 += ctxt->tmp_stride; + } +#endif // CONFIG_HIGHBITDEPTH + } +} + // This function has a structure similar to av1_build_obmc_inter_prediction // // The OBMC predictor is computed as: @@ -12181,13 +12462,11 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, int above_stride, const uint8_t *left, int left_stride) { const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; - int row, col, i; const int bw = xd->n8_w << MI_SIZE_LOG2; const int bh = xd->n8_h << MI_SIZE_LOG2; int32_t *mask_buf = x->mask_buf; int32_t *wsrc_buf = x->wsrc_buf; - const int wsrc_stride = bw; - const int mask_stride = bw; + const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA; #if CONFIG_HIGHBITDEPTH const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
1 : 0; @@ -12200,86 +12479,20 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, assert(xd->plane[0].subsampling_y == 0); av1_zero_array(wsrc_buf, bw * bh); - for (i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA; + for (int i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA; // handle above row if (xd->up_available) { const int overlap = - AOMMIN(block_size_high[bsize] >> 1, block_size_high[BLOCK_64X64] >> 1); - const int miw = AOMMIN(xd->n8_w, cm->mi_cols - mi_col); - const int mi_row_offset = -1; - const uint8_t *const mask1d = av1_get_obmc_mask(overlap); - const int neighbor_limit = max_neighbor_obmc[b_width_log2_lookup[bsize]]; - int neighbor_count = 0; - - assert(miw > 0); - - i = 0; - do { // for each mi in the above row - const int mi_col_offset = i; - const MB_MODE_INFO *above_mbmi = - &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi; -#if CONFIG_CHROMA_SUB8X8 - if (above_mbmi->sb_type < BLOCK_8X8) - above_mbmi = - &xd->mi[mi_col_offset + 1 + mi_row_offset * xd->mi_stride]->mbmi; -#endif - const BLOCK_SIZE a_bsize = AOMMAX(above_mbmi->sb_type, BLOCK_8X8); - const int above_step = - AOMMIN(mi_size_wide[a_bsize], mi_size_wide[BLOCK_64X64]); - const int mi_step = AOMMIN(xd->n8_w, above_step); - const int neighbor_bw = mi_step * MI_SIZE; - - if (is_neighbor_overlappable(above_mbmi)) { - if (!CONFIG_CB4X4 && (a_bsize == BLOCK_4X4 || a_bsize == BLOCK_4X8)) - neighbor_count += 2; - else - neighbor_count++; - if (neighbor_count > neighbor_limit) break; - - const int tmp_stride = above_stride; - int32_t *wsrc = wsrc_buf + (i * MI_SIZE); - int32_t *mask = mask_buf + (i * MI_SIZE); - - if (!is_hbd) { - const uint8_t *tmp = above; - - for (row = 0; row < overlap; ++row) { - const uint8_t m0 = mask1d[row]; - const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; - for (col = 0; col < neighbor_bw; ++col) { - wsrc[col] = m1 * tmp[col]; - mask[col] = m0; - } - wsrc += wsrc_stride; - mask += mask_stride; - tmp += tmp_stride; - } -#if CONFIG_HIGHBITDEPTH - } else { - const uint16_t *tmp = CONVERT_TO_SHORTPTR(above); - - for (row = 0; row < overlap; ++row) { - const uint8_t m0 = mask1d[row]; - const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; - for (col = 0; col < neighbor_bw; ++col) { - wsrc[col] = m1 * tmp[col]; - mask[col] = m0; - } - wsrc += wsrc_stride; - mask += mask_stride; - tmp += tmp_stride; - } -#endif // CONFIG_HIGHBITDEPTH - } - } - - above += neighbor_bw; - i += mi_step; - } while (i < miw); + AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1; + struct calc_target_weighted_pred_ctxt ctxt = { x, above, above_stride, + overlap }; + foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd, mi_col, + max_neighbor_obmc[b_width_log2_lookup[bsize]], + calc_target_weighted_pred_above, &ctxt); } - for (i = 0; i < bw * bh; ++i) { + for (int i = 0; i < bw * bh; ++i) { wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA; mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA; } @@ -12287,102 +12500,33 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, // handle left column if (xd->left_available) { const int overlap = - AOMMIN(block_size_wide[bsize] >> 1, block_size_wide[BLOCK_64X64] >> 1); - const int mih = AOMMIN(xd->n8_h, cm->mi_rows - mi_row); - const int mi_col_offset = -1; - const uint8_t *const mask1d = av1_get_obmc_mask(overlap); - const int neighbor_limit = max_neighbor_obmc[b_height_log2_lookup[bsize]]; - int neighbor_count = 0; - - assert(mih > 0); - - i = 0; - do { // for each mi in the left column - const 
int mi_row_offset = i; - MB_MODE_INFO *left_mbmi = - &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi; - -#if CONFIG_CHROMA_SUB8X8 - if (left_mbmi->sb_type < BLOCK_8X8) - left_mbmi = - &xd->mi[mi_col_offset + (mi_row_offset + 1) * xd->mi_stride]->mbmi; -#endif - const BLOCK_SIZE l_bsize = AOMMAX(left_mbmi->sb_type, BLOCK_8X8); - const int left_step = - AOMMIN(mi_size_high[l_bsize], mi_size_high[BLOCK_64X64]); - const int mi_step = AOMMIN(xd->n8_h, left_step); - const int neighbor_bh = mi_step * MI_SIZE; - - if (is_neighbor_overlappable(left_mbmi)) { - if (!CONFIG_CB4X4 && (l_bsize == BLOCK_4X4 || l_bsize == BLOCK_8X4)) - neighbor_count += 2; - else - neighbor_count++; - if (neighbor_count > neighbor_limit) break; - - const int tmp_stride = left_stride; - int32_t *wsrc = wsrc_buf + (i * MI_SIZE * wsrc_stride); - int32_t *mask = mask_buf + (i * MI_SIZE * mask_stride); - - if (!is_hbd) { - const uint8_t *tmp = left; - - for (row = 0; row < neighbor_bh; ++row) { - for (col = 0; col < overlap; ++col) { - const uint8_t m0 = mask1d[col]; - const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; - wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 + - (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1; - mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0; - } - wsrc += wsrc_stride; - mask += mask_stride; - tmp += tmp_stride; - } -#if CONFIG_HIGHBITDEPTH - } else { - const uint16_t *tmp = CONVERT_TO_SHORTPTR(left); - - for (row = 0; row < neighbor_bh; ++row) { - for (col = 0; col < overlap; ++col) { - const uint8_t m0 = mask1d[col]; - const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; - wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 + - (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1; - mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0; - } - wsrc += wsrc_stride; - mask += mask_stride; - tmp += tmp_stride; - } -#endif // CONFIG_HIGHBITDEPTH - } - } - - left += neighbor_bh * left_stride; - i += mi_step; - } while (i < mih); + AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1; + struct calc_target_weighted_pred_ctxt ctxt = { x, left, left_stride, + overlap }; + foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd, mi_row, + max_neighbor_obmc[b_height_log2_lookup[bsize]], + calc_target_weighted_pred_left, &ctxt); } if (!is_hbd) { const uint8_t *src = x->plane[0].src.buf; - for (row = 0; row < bh; ++row) { - for (col = 0; col < bw; ++col) { + for (int row = 0; row < bh; ++row) { + for (int col = 0; col < bw; ++col) { wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col]; } - wsrc_buf += wsrc_stride; + wsrc_buf += bw; src += x->plane[0].src.stride; } #if CONFIG_HIGHBITDEPTH } else { const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf); - for (row = 0; row < bh; ++row) { - for (col = 0; col < bw; ++col) { + for (int row = 0; row < bh; ++row) { + for (int col = 0; col < bw; ++col) { wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col]; } - wsrc_buf += wsrc_stride; + wsrc_buf += bw; src += x->plane[0].src.stride; } #endif // CONFIG_HIGHBITDEPTH @@ -12508,8 +12652,9 @@ void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, } if (rd_causal > - RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate + - av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1), + RDCOST(x->rdmult, + rd_stats_y.rate + rd_stats_uv.rate + + av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1), (rd_stats_y.dist + rd_stats_uv.dist))) { x->skip = skip_blk; } else { @@ -12518,4 +12663,328 @@ void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x, } } #endif // 
CONFIG_NCOBMC + +int64_t get_prediction_rd_cost(const struct AV1_COMP *cpi, struct macroblock *x, + int mi_row, int mi_col, int *skip_blk, + MB_MODE_INFO *backup_mbmi) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + BLOCK_SIZE bsize = mbmi->sb_type; +#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION + const MOTION_MODE motion_allowed = motion_mode_allowed( +#if CONFIG_GLOBAL_MOTION + 0, xd->global_motion, +#endif // CONFIG_GLOBAL_MOTION +#if CONFIG_WARPED_MOTION + xd, +#endif + xd->mi[0]); +#endif // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION + RD_STATS rd_stats_y, rd_stats_uv; + int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0); + int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1); + int64_t this_rd; + int ref; + +#if CONFIG_CB4X4 + x->skip_chroma_rd = + !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y); +#endif + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]); + assert(cfg != NULL); + av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, + &xd->block_refs[ref]->sf); + } + av1_setup_dst_planes(x->e_mbd.plane, bsize, + get_frame_new_buffer(&cpi->common), mi_row, mi_col); + +#if CONFIG_NCOBMC_ADAPT_WEIGHT + if (mbmi->motion_mode != NCOBMC_ADAPT_WEIGHT) +#endif + av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); + +#if CONFIG_MOTION_VAR + if (mbmi->motion_mode == OBMC_CAUSAL) { +#if CONFIG_NCOBMC + av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col); +#else + av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); +#endif + } +#endif // CONFIG_MOTION_VAR + +#if CONFIG_NCOBMC_ADAPT_WEIGHT + if (mbmi->motion_mode == NCOBMC_ADAPT_WEIGHT) + for (int plane = 0; plane < MAX_MB_PLANE; ++plane) + get_pred_from_intrpl_buf(xd, mi_row, mi_col, bsize, plane); +#endif + av1_subtract_plane(x, bsize, 0); + +#if CONFIG_VAR_TX + if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { + select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + } else { + int idx, idy; + super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + for (idy = 0; idy < xd->n8_h; ++idy) + for (idx = 0; idx < xd->n8_w; ++idx) + mbmi->inter_tx_size[idy][idx] = mbmi->tx_size; + memset(x->blk_skip[0], rd_stats_y.skip, + sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4); + } + inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); +#else + super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); +#endif + assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX); + + if (rd_stats_y.skip && rd_stats_uv.skip) { + rd_stats_y.rate = rate_skip1; + rd_stats_uv.rate = 0; + rd_stats_y.dist = rd_stats_y.sse; + rd_stats_uv.dist = rd_stats_uv.sse; + *skip_blk = 1; + } else if (RDCOST(x->rdmult, + (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0), + (rd_stats_y.dist + rd_stats_uv.dist)) > + RDCOST(x->rdmult, rate_skip1, + (rd_stats_y.sse + rd_stats_uv.sse))) { + rd_stats_y.rate = rate_skip1; + rd_stats_uv.rate = 0; + rd_stats_y.dist = rd_stats_y.sse; + rd_stats_uv.dist = rd_stats_uv.sse; + *skip_blk = 1; + } else { + rd_stats_y.rate += rate_skip0; + *skip_blk = 0; + } + + if (backup_mbmi) *backup_mbmi = *mbmi; + + this_rd = RDCOST(x->rdmult, (rd_stats_y.rate + rd_stats_uv.rate), + (rd_stats_y.dist + rd_stats_uv.dist)); +#if 
CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION + if (motion_allowed == NCOBMC_ADAPT_WEIGHT) { + assert(mbmi->motion_mode <= NCOBMC_ADAPT_WEIGHT); + this_rd += + RDCOST(x->rdmult, x->motion_mode_cost2[bsize][mbmi->motion_mode], 0); + } else if (motion_allowed == OBMC_CAUSAL) { + assert(mbmi->motion_mode <= OBMC_CAUSAL); + this_rd += + RDCOST(x->rdmult, x->motion_mode_cost1[bsize][mbmi->motion_mode], 0); + } else { +#endif // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION + this_rd += + RDCOST(x->rdmult, x->motion_mode_cost[bsize][mbmi->motion_mode], 0); +#if CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION + } +#endif // CONFIG_NCOBMC_ADAPT_WEIGHT && CONFIG_WARPED_MOTION + return this_rd; +} + +#if CONFIG_NCOBMC_ADAPT_WEIGHT +void av1_check_ncobmc_adapt_weight_rd(const struct AV1_COMP *cpi, + struct macroblock *x, int mi_row, + int mi_col) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + BLOCK_SIZE bsize = mbmi->sb_type; +#if CONFIG_VAR_TX + const int n4 = bsize_to_num_blk(bsize); + uint8_t st_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; + uint8_t obmc_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; + uint8_t ncobmc_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8]; +#endif + MB_MODE_INFO st_mbmi, obmc_mbmi, ncobmc_mbmi; + int st_skip, obmc_skip, ncobmc_skip; + int64_t st_rd, obmc_rd, ncobmc_rd; +#if CONFIG_WARPED_MOTION + const AV1_COMMON *const cm = &cpi->common; + const int is_warp_motion = mbmi->motion_mode == WARPED_CAUSAL; + const int rs = RDCOST(x->rdmult, av1_get_switchable_rate(cm, x, xd), 0); + MB_MODE_INFO warp_mbmi; + int64_t warp_rd; + int warp_skip; +#endif + + // Recompute the rd for the motion mode decided in rd loop + mbmi->motion_mode = SIMPLE_TRANSLATION; + st_rd = get_prediction_rd_cost(cpi, x, mi_row, mi_col, &st_skip, &st_mbmi); +#if CONFIG_WARPED_MOTION + st_rd += rs; +#endif +#if CONFIG_VAR_TX + memcpy(st_blk_skip, x->blk_skip[0], sizeof(st_blk_skip[0]) * n4); +#endif + + mbmi->motion_mode = OBMC_CAUSAL; + obmc_rd = + get_prediction_rd_cost(cpi, x, mi_row, mi_col, &obmc_skip, &obmc_mbmi); +#if CONFIG_WARPED_MOTION + obmc_rd += rs; +#endif +#if CONFIG_VAR_TX + memcpy(obmc_blk_skip, x->blk_skip[0], sizeof(obmc_blk_skip[0]) * n4); +#endif + + // Compute the rd cost for ncobmc adaptive weight + mbmi->motion_mode = NCOBMC_ADAPT_WEIGHT; + ncobmc_rd = get_prediction_rd_cost(cpi, x, mi_row, mi_col, &ncobmc_skip, + &ncobmc_mbmi); +#if CONFIG_WARPED_MOTION + ncobmc_rd += rs; +#endif + // Calculate the ncobmc mode costs + { + ADAPT_OVERLAP_BLOCK aob = adapt_overlap_block_lookup[bsize]; + ncobmc_rd += + RDCOST(x->rdmult, x->ncobmc_mode_cost[aob][mbmi->ncobmc_mode[0]], 0); + if (mi_size_wide[bsize] != mi_size_high[bsize]) + ncobmc_rd += + RDCOST(x->rdmult, x->ncobmc_mode_cost[aob][mbmi->ncobmc_mode[1]], 0); + } +#if CONFIG_VAR_TX + memcpy(ncobmc_blk_skip, x->blk_skip[0], sizeof(ncobmc_blk_skip[0]) * n4); +#endif + +#if CONFIG_WARPED_MOTION + if (is_warp_motion) { + mbmi->motion_mode = WARPED_CAUSAL; + warp_rd = + get_prediction_rd_cost(cpi, x, mi_row, mi_col, &warp_skip, &warp_mbmi); + } else { + warp_rd = INT64_MAX; + } +#endif + +#if CONFIG_WARPED_MOTION + if (AOMMIN(ncobmc_rd, warp_rd) < AOMMIN(st_rd, obmc_rd)) { + if (ncobmc_rd < warp_rd) { + x->skip = ncobmc_skip; + *mbmi = ncobmc_mbmi; +#if CONFIG_VAR_TX + memcpy(x->blk_skip[0], ncobmc_blk_skip, sizeof(ncobmc_blk_skip[0]) * n4); +#endif + } else { + x->skip = warp_skip; + *mbmi = warp_mbmi; + } +#else + if (ncobmc_rd < AOMMIN(st_rd, obmc_rd)) { + x->skip = ncobmc_skip; + *mbmi = 
ncobmc_mbmi; +#if CONFIG_VAR_TX + memcpy(x->blk_skip[0], ncobmc_blk_skip, sizeof(ncobmc_blk_skip[0]) * n4); +#endif +#endif // CONFIG_WARPED_MOTION + } else { + if (obmc_rd < st_rd) { + *mbmi = obmc_mbmi; + x->skip = obmc_skip; +#if CONFIG_VAR_TX + memcpy(x->blk_skip[0], obmc_blk_skip, sizeof(obmc_blk_skip[0]) * n4); +#endif + } else { + *mbmi = st_mbmi; + x->skip = st_skip; +#if CONFIG_VAR_TX + memcpy(x->blk_skip[0], st_blk_skip, sizeof(st_blk_skip[0]) * n4); +#endif + } + } +} + +int64_t get_ncobmc_error(MACROBLOCKD *xd, int pxl_row, int pxl_col, + BLOCK_SIZE bsize, int plane, struct buf_2d *src) { + const int wide = AOMMIN(mi_size_wide[bsize] * MI_SIZE, + (xd->sb_mi_bd.mi_col_end + 1) * MI_SIZE - pxl_col); + const int high = AOMMIN(mi_size_high[bsize] * MI_SIZE, + (xd->sb_mi_bd.mi_row_end + 1) * MI_SIZE - pxl_row); + const int ss_x = xd->plane[plane].subsampling_x; + const int ss_y = xd->plane[plane].subsampling_y; + int row_offset = (pxl_row - xd->sb_mi_bd.mi_row_begin * MI_SIZE) >> ss_y; + int col_offset = (pxl_col - xd->sb_mi_bd.mi_col_begin * MI_SIZE) >> ss_x; + int dst_stride = xd->ncobmc_pred_buf_stride[plane]; + int dst_offset = row_offset * dst_stride + col_offset; + int src_stride = src->stride; + + int r, c; + int64_t tmp, error = 0; + + for (r = 0; r < (high >> ss_y); ++r) { + for (c = 0; c < (wide >> ss_x); ++c) { + tmp = xd->ncobmc_pred_buf[plane][r * dst_stride + c + dst_offset] - + src->buf[r * src_stride + c]; + error += tmp * tmp; + } + } + return error; +} + +int get_ncobmc_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, + MACROBLOCKD *xd, int mi_row, int mi_col, int bsize) { + const AV1_COMMON *const cm = &cpi->common; + uint8_t *pred_buf[4][MAX_MB_PLANE]; + + // TODO(weitinglin): stride size needs to be fixed for high-bit depth + int pred_stride[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + + // target block in pxl + int pxl_row = mi_row << MI_SIZE_LOG2; + int pxl_col = mi_col << MI_SIZE_LOG2; + int64_t error, best_error = INT64_MAX; + int plane, tmp_mode, best_mode = 0; +#if CONFIG_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + int len = sizeof(uint16_t); + ASSIGN_ALIGNED_PTRS_HBD(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE, + len); + ASSIGN_ALIGNED_PTRS_HBD(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE, + len); + ASSIGN_ALIGNED_PTRS_HBD(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE, + len); + ASSIGN_ALIGNED_PTRS_HBD(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE, + len); + } else { +#endif // CONFIG_HIGHBITDEPTH + ASSIGN_ALIGNED_PTRS(pred_buf[0], cm->ncobmcaw_buf[0], MAX_SB_SQUARE); + ASSIGN_ALIGNED_PTRS(pred_buf[1], cm->ncobmcaw_buf[1], MAX_SB_SQUARE); + ASSIGN_ALIGNED_PTRS(pred_buf[2], cm->ncobmcaw_buf[2], MAX_SB_SQUARE); + ASSIGN_ALIGNED_PTRS(pred_buf[3], cm->ncobmcaw_buf[3], MAX_SB_SQUARE); +#if CONFIG_HIGHBITDEPTH + } +#endif + + av1_get_ext_blk_preds(cm, xd, bsize, mi_row, mi_col, pred_buf, pred_stride); + av1_get_ori_blk_pred(cm, xd, bsize, mi_row, mi_col, pred_buf[3], pred_stride); + + for (tmp_mode = 0; tmp_mode < MAX_NCOBMC_MODES; ++tmp_mode) { + error = 0; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + build_ncobmc_intrpl_pred(cm, xd, plane, pxl_row, pxl_col, bsize, pred_buf, + pred_stride, tmp_mode); + error += get_ncobmc_error(xd, pxl_row, pxl_col, bsize, plane, + &x->plane[plane].src); + } + if (error < best_error) { + best_mode = tmp_mode; + best_error = error; + } + } + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + build_ncobmc_intrpl_pred(cm, xd, plane, pxl_row, pxl_col, bsize, 
pred_buf,
+                           pred_stride, best_mode);
+  }
+
+  return best_mode;
+}
+
+#endif  // CONFIG_NCOBMC_ADAPT_WEIGHT
 #endif  // CONFIG_MOTION_VAR
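One note on calc_target_weighted_pred further up: the explanatory comment ahead of it is cut off at the hunk boundary, so the blend it implements is worth spelling out. The following is reconstructed from the mask arithmetic in calc_target_weighted_pred_above/_left, an annotation rather than diff text. With A = AOM_BLEND_A64_MAX_ALPHA, Mv(y) the above-edge mask, Mh(x) the left-edge mask, and AOM_BLEND_A64(m, a, b) = (m*a + (A-m)*b) / A:

    // PObmc(x,y) = AOM_BLEND_A64(Mh(x),
    //                            AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)),
    //                            PLeft(x,y))
    //
    // Scaling by A*A and dropping the intermediate rounding, the two
    // helpers accumulate
    //
    //   wsrc(x,y) = A*A*src(x,y) - Mh(x)*(A - Mv(y))*PAbove(x,y)
    //                            - A*(A - Mh(x))*PLeft(x,y)
    //   mask(x,y) = Mh(x)*Mv(y)
    //
    // so A*A*(src - PObmc) == wsrc - mask*P, and a motion search that
    // minimises |wsrc - mask*P| is matching P against the OBMC target.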